Unverified Commit 7732d0fe authored by Lysandre Debut, committed by GitHub

Upgrade black to version ~=22.0 (#15565)

* Upgrade black to version ~=22.0

* Check copies

* Fix code
parent d923f762
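
Almost every hunk below is the same mechanical change: black 22.x stops putting spaces around the power operator when both operands are "simple" (names, numeric literals, or attribute access, optionally negated). A minimal sketch of the rule in Python, using hypothetical variable names rather than code from this diff:

    head_dim = 64
    size = 224

    # black < 22.0 formatted these as `head_dim ** -0.5` and `size ** 2`;
    # black ~= 22.0 drops the spaces because both operands are simple:
    scaling = head_dim**-0.5
    area = size**2

    # With a non-simple operand (here a function call) the spaces are kept:
    padded = size ** max(1, 2)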
@@ -766,7 +766,7 @@ class LEDDecoderAttention(nn.Module):
assert (
self.head_dim * num_heads == self.embed_dim
), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -998,7 +998,7 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer):
self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
......
@@ -405,13 +405,10 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
else:
# last token is separation token and should not be counted and in the middle are two separation tokens
question_end_index = tf.tile(question_end_index + 1, (1, input_ids_shape[1]))
-attention_mask = (
-tf.cast(
-attention_mask > question_end_index,
-dtype=question_end_index.dtype,
-)
-* tf.cast(attention_mask < input_ids_shape[-1], dtype=question_end_index.dtype)
-)
+attention_mask = tf.cast(
+attention_mask > question_end_index,
+dtype=question_end_index.dtype,
+) * tf.cast(attention_mask < input_ids_shape[-1], dtype=question_end_index.dtype)
return attention_mask
......
@@ -217,7 +217,7 @@ class M2M100Attention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -163,7 +163,7 @@ class MarianAttention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -194,7 +194,7 @@ class TFMarianAttention(tf.keras.layers.Layer):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
......
@@ -152,7 +152,7 @@ class MBartAttention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -154,7 +154,7 @@ class TFMBartAttention(tf.keras.layers.Layer):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
......
@@ -163,7 +163,7 @@ class PegasusAttention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -195,7 +195,7 @@ class TFPegasusAttention(tf.keras.layers.Layer):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
......
@@ -824,7 +824,7 @@ class PerceiverModel(PerceiverPreTrainedModel):
... project_pos_dim=256,
... trainable_position_encoding_kwargs=dict(
... num_channels=256,
-... index_dims=config.image_size ** 2,
+... index_dims=config.image_size**2,
... ),
... )
@@ -1205,7 +1205,7 @@ class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel):
def __init__(self, config):
super().__init__(config)
-trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size ** 2)
+trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size**2)
trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1)
self.num_labels = config.num_labels
@@ -2485,7 +2485,7 @@ def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_b
batch_size,
height // spatial_block_size,
width // spatial_block_size,
-(spatial_block_size ** 2) * num_channels,
+(spatial_block_size**2) * num_channels,
)
return frames
elif len(frames.shape) == 5:
@@ -2509,7 +2509,7 @@ def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_b
time // temporal_block_size,
height // spatial_block_size,
width // spatial_block_size,
-temporal_block_size * (spatial_block_size ** 2) * num_channels,
+temporal_block_size * (spatial_block_size**2) * num_channels,
)
return frames
else:
@@ -3059,7 +3059,7 @@ class PerceiverImagePreprocessor(AbstractPreprocessor):
if self.conv_after_patching:
inp_dim = self.out_channels
else:
-inp_dim = self.in_channels * self.spatial_downsample ** 2
+inp_dim = self.in_channels * self.spatial_downsample**2
if is_temporal:
inp_dim *= self.temporal_downsample
......
@@ -87,7 +87,7 @@ class PerceiverTokenizer(PreTrainedTokenizer):
**kwargs,
)
-self._utf_vocab_size = 2 ** 8 # utf is 8 bits
+self._utf_vocab_size = 2**8 # utf is 8 bits
# define special tokens dict
self.special_tokens_encoder: Dict[str, int] = {
......
@@ -674,7 +674,7 @@ class ProphetNetAttention(nn.Module):
], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}"
# previous time steps are cached - no need to recompute key and value if they are static
-query_states = self.query_proj(hidden_states) / (self.head_dim ** 0.5)
+query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
@@ -855,7 +855,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
value_states = self.value_proj(hidden_states)
# normalize
-query_states = query_states / (self.head_dim ** 0.5)
+query_states = query_states / (self.head_dim**0.5)
# reshape
query_states = self._shape(query_states, ngram_sequence_length, batch_size)
......
@@ -700,7 +700,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# `num_buckets` should be set to 2 * sequence_length // chunk_length as recommended in paper
num_buckets_pow_2 = (2 * (sequence_length // self.chunk_length)).bit_length() - 1
# make sure buckets are power of 2
-num_buckets = 2 ** num_buckets_pow_2
+num_buckets = 2**num_buckets_pow_2
# factorize `num_buckets` if `num_buckets` becomes too large
num_buckets_limit = 2 * max(
@@ -966,7 +966,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
"""
length normalization
"""
-variance = torch.mean(x ** 2, -1, keepdim=True)
+variance = torch.mean(x**2, -1, keepdim=True)
norm_x = x * torch.rsqrt(variance + epsilon)
return norm_x
......
@@ -77,10 +77,10 @@ def bytes_to_unicode():
)
cs = bs[:]
n = 0
-for b in range(2 ** 8):
+for b in range(2**8):
if b not in bs:
bs.append(b)
-cs.append(2 ** 8 + n)
+cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
......
@@ -420,7 +420,7 @@ class SEWAttention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -86,7 +86,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
and hence the waveform should not be normalized before feature extraction.
"""
-waveform = waveform * (2 ** 15) # Kaldi compliance: 16-bit signed integers
+waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers
waveform = torch.from_numpy(waveform).unsqueeze(0)
features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
return features.numpy()
......
@@ -230,7 +230,7 @@ class Speech2TextAttention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......
@@ -256,7 +256,7 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
......
@@ -170,7 +170,7 @@ class Speech2Text2Attention(nn.Module):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
-self.scaling = self.head_dim ** -0.5
+self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
......