Unverified Commit 7732d0fe authored by Lysandre Debut, committed by GitHub

Upgrade black to version ~=22.0 (#15565)

* Upgrade black to version ~=22.0

* Check copies

* Fix code
parent d923f762
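
Context for the change: black's 2022 stable style (22.x) "hugs" the power operator, dropping the spaces around ** whenever both operands are simple (names, numeric literals, attribute chains, optionally with a unary sign), while complex operands keep the spaces. Every hunk below is that single mechanical rewrite applied across the repository. A minimal illustrative sketch of the rule (variable names are made up, not code from this commit):

    # Illustrative only: how black ~=22.0 formats the power operator.
    head_dim = 64
    image_size = 224

    scaling = head_dim**-0.5  # black 21.x style: head_dim ** -0.5
    num_position_ids = image_size**2  # previously: image_size ** 2
    utf_vocab_size = 2**8  # previously: 2 ** 8

    # Operands that are not "simple" (calls, parenthesized expressions, ...)
    # keep the surrounding spaces:
    mixed = (head_dim + 1) ** max(1, 2)

    print(scaling, num_position_ids, utf_vocab_size, mixed)

Re-running black with the new pin is what produces the long tail of one-line hunks in the files below.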
@@ -766,7 +766,7 @@ class LEDDecoderAttention(nn.Module):
         assert (
             self.head_dim * num_heads == self.embed_dim
         ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...
@@ -998,7 +998,7 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(dropout)
         self.head_dim = embed_dim // num_heads
         assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
...
@@ -405,13 +405,10 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
     else:
         # last token is separation token and should not be counted and in the middle are two separation tokens
         question_end_index = tf.tile(question_end_index + 1, (1, input_ids_shape[1]))
-        attention_mask = (
-            tf.cast(
-                attention_mask > question_end_index,
-                dtype=question_end_index.dtype,
-            )
-            * tf.cast(attention_mask < input_ids_shape[-1], dtype=question_end_index.dtype)
-        )
+        attention_mask = tf.cast(
+            attention_mask > question_end_index,
+            dtype=question_end_index.dtype,
+        ) * tf.cast(attention_mask < input_ids_shape[-1], dtype=question_end_index.dtype)
 
     return attention_mask
...
...@@ -217,7 +217,7 @@ class M2M100Attention(nn.Module): ...@@ -217,7 +217,7 @@ class M2M100Attention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})." f" and `num_heads`: {num_heads})."
) )
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -163,7 +163,7 @@ class MarianAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...
@@ -194,7 +194,7 @@ class TFMarianAttention(tf.keras.layers.Layer):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
...
@@ -152,7 +152,7 @@ class MBartAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...
@@ -154,7 +154,7 @@ class TFMBartAttention(tf.keras.layers.Layer):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
...
@@ -163,7 +163,7 @@ class PegasusAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...
@@ -195,7 +195,7 @@ class TFPegasusAttention(tf.keras.layers.Layer):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
...
@@ -824,7 +824,7 @@ class PerceiverModel(PerceiverPreTrainedModel):
         ...     project_pos_dim=256,
         ...     trainable_position_encoding_kwargs=dict(
         ...         num_channels=256,
-        ...         index_dims=config.image_size ** 2,
+        ...         index_dims=config.image_size**2,
         ...     ),
         ... )
@@ -1205,7 +1205,7 @@ class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
 
-        trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size ** 2)
+        trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size**2)
         trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1)
 
         self.num_labels = config.num_labels
@@ -2485,7 +2485,7 @@ def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_b
             batch_size,
             height // spatial_block_size,
             width // spatial_block_size,
-            (spatial_block_size ** 2) * num_channels,
+            (spatial_block_size**2) * num_channels,
         )
         return frames
     elif len(frames.shape) == 5:
@@ -2509,7 +2509,7 @@ def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_b
             time // temporal_block_size,
             height // spatial_block_size,
             width // spatial_block_size,
-            temporal_block_size * (spatial_block_size ** 2) * num_channels,
+            temporal_block_size * (spatial_block_size**2) * num_channels,
         )
         return frames
     else:
@@ -3059,7 +3059,7 @@ class PerceiverImagePreprocessor(AbstractPreprocessor):
             if self.conv_after_patching:
                 inp_dim = self.out_channels
             else:
-                inp_dim = self.in_channels * self.spatial_downsample ** 2
+                inp_dim = self.in_channels * self.spatial_downsample**2
                 if is_temporal:
                     inp_dim *= self.temporal_downsample
...
@@ -87,7 +87,7 @@ class PerceiverTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
 
-        self._utf_vocab_size = 2 ** 8  # utf is 8 bits
+        self._utf_vocab_size = 2**8  # utf is 8 bits
 
         # define special tokens dict
         self.special_tokens_encoder: Dict[str, int] = {
...
@@ -674,7 +674,7 @@ class ProphetNetAttention(nn.Module):
         ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}"
 
         # previous time steps are cached - no need to recompute key and value if they are static
-        query_states = self.query_proj(hidden_states) / (self.head_dim ** 0.5)
+        query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)
 
         if is_cross_attention and past_key_value is not None:
             # reuse k,v, cross_attentions
@@ -855,7 +855,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
         value_states = self.value_proj(hidden_states)
 
         # normalize
-        query_states = query_states / (self.head_dim ** 0.5)
+        query_states = query_states / (self.head_dim**0.5)
 
         # reshape
         query_states = self._shape(query_states, ngram_sequence_length, batch_size)
...
@@ -700,7 +700,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
         # `num_buckets` should be set to 2 * sequence_length // chunk_length as recommended in paper
         num_buckets_pow_2 = (2 * (sequence_length // self.chunk_length)).bit_length() - 1
         # make sure buckets are power of 2
-        num_buckets = 2 ** num_buckets_pow_2
+        num_buckets = 2**num_buckets_pow_2
 
         # factorize `num_buckets` if `num_buckets` becomes too large
         num_buckets_limit = 2 * max(
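
Side note on the Reformer hunk above: the bit_length() - 1 expression rounds the paper-recommended bucket count, 2 * sequence_length // chunk_length, down to the nearest power of two. A small self-contained sketch with made-up lengths (not part of this commit):

    # Round the recommended number of LSH buckets down to a power of two,
    # mirroring the expression in the hunk above (illustrative values).
    def floor_power_of_two_buckets(sequence_length: int, chunk_length: int) -> int:
        target = 2 * (sequence_length // chunk_length)
        num_buckets_pow_2 = target.bit_length() - 1
        return 2**num_buckets_pow_2

    print(floor_power_of_two_buckets(4096, 64))  # target 128 -> 128 (already a power of two)
    print(floor_power_of_two_buckets(4096, 96))  # target 84  -> 64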
@@ -966,7 +966,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
         """
        length normalization
        """
-        variance = torch.mean(x ** 2, -1, keepdim=True)
+        variance = torch.mean(x**2, -1, keepdim=True)
         norm_x = x * torch.rsqrt(variance + epsilon)
         return norm_x
...
@@ -77,10 +77,10 @@ def bytes_to_unicode():
     )
     cs = bs[:]
     n = 0
-    for b in range(2 ** 8):
+    for b in range(2**8):
         if b not in bs:
             bs.append(b)
-            cs.append(2 ** 8 + n)
+            cs.append(2**8 + n)
             n += 1
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
...
@@ -420,7 +420,7 @@ class SEWAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...
@@ -86,7 +86,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
         Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
         and hence the waveform should not be normalized before feature extraction.
         """
-        waveform = waveform * (2 ** 15)  # Kaldi compliance: 16-bit signed integers
+        waveform = waveform * (2**15)  # Kaldi compliance: 16-bit signed integers
         waveform = torch.from_numpy(waveform).unsqueeze(0)
         features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
         return features.numpy()
...
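
Side note on the Speech2Text feature-extractor hunk above: multiplying by 2**15 rescales a waveform normalized to [-1.0, 1.0] into the magnitude range of 16-bit signed integers that Kaldi-style fbank extraction expects. A hedged standalone sketch with a synthetic signal (not from this commit):

    # Rescale a normalized float waveform toward the int16 range, as done in the
    # hunk above before calling torchaudio's Kaldi-compatible fbank.
    import numpy as np

    waveform = np.array([0.0, 0.5, -0.25, 1.0], dtype=np.float32)  # samples in [-1, 1]
    scaled = waveform * (2**15)  # 2**15 = 32768
    print(scaled)  # values: 0.0, 16384.0, -8192.0, 32768.0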
@@ -230,7 +230,7 @@ class Speech2TextAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...
@@ -256,7 +256,7 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
...
@@ -170,7 +170,7 @@ class Speech2Text2Attention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
            )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
...