"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "132d741c55c6bd10aa15b1280157a1cd54e97785"
Unverified commit 850cf4af authored by Yih-Dar, committed by GitHub

Compute `dropout_probability` only in training mode (#24486)



* fix

* fix

* fix

* fix

* fix

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 9895670e
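The change below moves the LayerDrop random draw under `self.training`, so `torch.rand` is no longer called (and the global RNG state is no longer advanced) during evaluation or inference. A minimal sketch of the resulting pattern, assuming an illustrative `LayerDropStack` module with placeholder names (`layers`, `layerdrop`) rather than the models' actual call sites:

```python
import torch
import torch.nn as nn


class LayerDropStack(nn.Module):
    """Illustrative module: skips whole layers with probability `layerdrop`, but only in training mode."""

    def __init__(self, layers, layerdrop=0.1):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.layerdrop = layerdrop

    def forward(self, hidden_states):
        for layer in self.layers:
            # LayerDrop (https://arxiv.org/abs/1909.11556): decide whether to skip this layer.
            to_drop = False
            if self.training:
                # The random draw happens only in training mode, as in the diffs below,
                # so eval/inference no longer consumes random numbers.
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    to_drop = True

            if not to_drop:
                hidden_states = layer(hidden_states)
        return hidden_states


# Hypothetical usage: in eval mode every layer always runs and no random numbers are drawn.
stack = LayerDropStack([nn.Linear(8, 8) for _ in range(4)], layerdrop=0.5)
stack.eval()
out = stack(torch.randn(2, 8))
```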
@@ -807,8 +807,13 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
@@ -1052,8 +1057,9 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
...
@@ -661,8 +661,9 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
...
@@ -919,8 +919,13 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 # we add position_embeddings as extra input to the encoder_layer
@@ -1061,8 +1066,9 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             if self.gradient_checkpointing and self.training:
...
@@ -936,8 +936,13 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
@@ -1150,8 +1155,9 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
...
@@ -693,8 +693,9 @@ class TrOCRDecoder(TrOCRPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
...
@@ -915,8 +915,13 @@ class WhisperEncoder(WhisperPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
@@ -1144,8 +1149,9 @@ class WhisperDecoder(WhisperPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
...
@@ -667,8 +667,9 @@ class XGLMModel(XGLMPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
...
@@ -21,7 +21,7 @@ import unittest
 from huggingface_hub import hf_hub_download
 
 from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
@@ -380,6 +380,10 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
                 [self.model_tester.num_attention_heads, encoder_seq_length, dim],
             )
 
+    @is_flaky()
+    def test_retain_grad_hidden_states_attentions(self):
+        super().test_retain_grad_hidden_states_attentions()
+
 
 def prepare_batch(filename="train-batch.pt"):
     file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset")
...