Unverified commit a28325e2, authored by Bowen Bao, committed by GitHub

Replace python random with torch.rand to enable dynamo.export (#24434)

* Replace python random with torch.rand to enable dynamo.export

* revert changes to flax model code

* Remove unused random import

* Fix torch template

* Move torch.manual_seed(0) to right location
parent c036c814
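Context for the change, with a minimal sketch below (not part of this commit): `random.uniform(0, 1)` executes in the Python interpreter, outside anything `torch._dynamo` can trace, so models using it for LayerDrop could not be captured by `dynamo.export`; `torch.rand([])` is an ordinary torch op that tracing records. Two side effects of the swap are worth noting: the probability is now a 0-dim tensor (Python's `if` converts the comparison via `bool()`), and the value is drawn from torch's RNG rather than Python's, which is why the test template moves `torch.manual_seed(0)`. The `LayerDropBlock` module and its sizes below are illustrative assumptions, not code from the diff.

import torch
from torch import nn


class LayerDropBlock(nn.Module):
    """Illustrative stand-in for the encoder/decoder layers this diff touches."""

    def __init__(self, layerdrop: float = 0.1, hidden: int = 4):
        super().__init__()
        self.layerdrop = layerdrop
        self.layer = nn.Linear(hidden, hidden)

    def forward(self, hidden_states):
        # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
        dropout_probability = torch.rand([])  # was: random.uniform(0, 1)
        if self.training and (dropout_probability < self.layerdrop):
            return hidden_states  # skip the layer
        return self.layer(hidden_states)


torch.manual_seed(0)  # seeds torch.rand, and hence the layer-skip decision
output = LayerDropBlock().train()(torch.ones(2, 4))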
@@ -17,7 +17,6 @@
 """ PyTorch Autoformer model."""
 import math
-import random
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
@@ -1198,7 +1197,7 @@ class AutoformerEncoder(AutoformerPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1408,7 +1407,7 @@ class AutoformerDecoder(AutoformerPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -15,7 +15,6 @@
 """ PyTorch BART model."""
 import copy
 import math
-import random
 import warnings
 from typing import List, Optional, Tuple, Union
@@ -837,7 +836,7 @@ class BartEncoder(BartPretrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1090,7 +1089,7 @@ class BartDecoder(BartPretrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -17,7 +17,6 @@
 import copy
 import math
-import random
 from typing import List, Optional, Tuple, Union

 import numpy as np
@@ -1933,7 +1932,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -2276,7 +2275,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -16,7 +16,6 @@
 import math
-import random
 from typing import Optional, Tuple, Union

 import torch
@@ -579,7 +578,7 @@ class BioGptModel(BioGptPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -18,7 +18,6 @@
 import copy
 import math
 import os
-import random
 import warnings
 from typing import List, Optional, Tuple, Union
@@ -767,7 +766,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1019,7 +1018,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -17,7 +17,6 @@
 import copy
 import math
-import random
 from typing import List, Optional, Tuple, Union

 import torch
@@ -765,7 +764,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1016,7 +1015,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -16,7 +16,6 @@
 import math
-import random
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
@@ -1224,7 +1223,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1378,7 +1377,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
             if idx == 0:
@@ -587,7 +587,7 @@ class Data2VecAudioEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = np.random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
             if not skip_the_layer or deepspeed_zero3_is_enabled:
@@ -16,7 +16,6 @@
 import math
-import random
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
@@ -979,7 +978,7 @@ class DetrEncoder(DetrPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1118,7 +1117,7 @@ class DetrDecoder(DetrPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -16,7 +16,6 @@
 import itertools
 import math
-import random
 from dataclasses import dataclass
 from typing import Dict, Optional, Tuple, Union
@@ -580,7 +579,7 @@ class FlaubertModel(FlaubertPreTrainedModel):
         attentions = () if output_attentions else None
         for i in range(self.n_layers):
             # LayerDrop
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -28,7 +28,6 @@
 """PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19"""
 import math
-import random
 from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
@@ -550,7 +549,7 @@ class FSMTEncoder(nn.Module):
                 encoder_states += (x,)
                 x = x.transpose(0, 1)  # B x T x C -> T x B x C
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 attn = None
             else:
@@ -794,7 +793,7 @@ class FSMTDecoder(nn.Module):
                 x = x.transpose(0, 1)
                 all_hidden_states += (x,)
                 x = x.transpose(0, 1)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -725,7 +725,7 @@ class HubertEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = np.random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
             if not skip_the_layer or deepspeed_zero3_is_enabled:
@@ -814,7 +814,7 @@ class HubertEncoderStableLayerNorm(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = np.random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
             if not skip_the_layer or deepspeed_zero3_is_enabled:
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ PyTorch Informer model."""
-import random
 from typing import List, Optional, Tuple, Union

 import numpy as np
@@ -1205,7 +1204,7 @@ class InformerEncoder(InformerPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1425,7 +1424,7 @@ class InformerDecoder(InformerPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -16,7 +16,6 @@
 import math
-import random
 import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
@@ -1871,7 +1870,7 @@ class LEDEncoder(LEDPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None, None)
@@ -2135,7 +2134,7 @@ class LEDDecoder(LEDPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -16,7 +16,6 @@
 import math
-import random
 from typing import List, Optional, Tuple, Union

 import torch
@@ -813,7 +812,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False
             if not skip_the_layer or deepspeed_zero3_is_enabled:
@@ -1057,7 +1056,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
                 all_hidden_states += (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False
             if not skip_the_layer or deepspeed_zero3_is_enabled:
@@ -17,7 +17,6 @@
 import copy
 import math
-import random
 from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
@@ -778,7 +777,7 @@ class MarianEncoder(MarianPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1024,7 +1023,7 @@ class MarianDecoder(MarianPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -15,7 +15,6 @@
 """ PyTorch Mask2Former model."""
 import math
-import random
 import warnings
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
@@ -1862,7 +1861,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -15,7 +15,6 @@
 """ PyTorch MaskFormer model."""
 import math
-import random
 from dataclasses import dataclass
 from numbers import Number
 from typing import Dict, List, Optional, Tuple
@@ -764,7 +763,7 @@ class DetrDecoder(nn.Module):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -15,7 +15,6 @@
 """ PyTorch MBART model."""
 import copy
 import math
-import random
 from typing import List, Optional, Tuple, Union

 import torch
@@ -819,7 +818,7 @@ class MBartEncoder(MBartPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 layer_outputs = (None, None)
             else:
@@ -1074,7 +1073,7 @@ class MBartDecoder(MBartPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             if self.training and (dropout_probability < self.layerdrop):
                 continue
@@ -16,7 +16,6 @@
 import math
-import random
 from typing import Optional, Tuple, Union

 import torch
@@ -610,7 +609,7 @@ class MCTCTEncoder(MCTCTPreTrainedModel):
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = random.uniform(0, 1)
+            dropout_probability = torch.rand([])
             skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
             if not skip_the_layer or deepspeed_zero3_is_enabled:
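For completeness, a hedged sketch of what the commit enables, reusing the illustrative `LayerDropBlock` from above. The `torch._dynamo.export(f)(*args)` call style follows PyTorch 2.1+ and is an assumption (earlier releases took the example inputs directly); it is not shown anywhere in this commit.

import torch
import torch._dynamo

model = LayerDropBlock().eval()  # export runs in eval mode, so the skip branch short-circuits
# Returns the captured FX graph plus the guards dynamo installed (2.1+ API style).
graph_module, guards = torch._dynamo.export(model)(torch.ones(2, 4))
print(graph_module.graph)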