Unverified commit 7732d0fe, authored by Lysandre Debut, committed by GitHub

Upgrade black to version ~=22.0 (#15565)

* Upgrade black to version ~=22.0

* Check copies

* Fix code
parent d923f762
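Most hunks below are pure reformatting from the black upgrade: 22.x "hugs" the power operator when both operands are simple (plain names, literals, dotted names) and keeps the spaces when either side is a more complex expression. A minimal illustration of how the rule plays out in this diff (the variables here are made up for the example):

```python
# simple operands: black ~=22.0 removes the spaces around **
x = 3.0
y = x**2
size = 2**8

# non-simple operands (calls, parenthesized expressions): spaces are kept,
# matching lines like `param_norm.item() ** 2` that this diff leaves untouched
gm_scale = 0.9
z = x ** (1 - gm_scale)
w = abs(x) ** 2
```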
@@ -350,7 +350,7 @@ def get_grad_norm(params, scale=1):
         if p.grad is not None:
             param_norm = (p.grad.detach().data / scale).norm(2)
             total_norm += param_norm.item() ** 2
-    total_norm = total_norm ** 0.5
+    total_norm = total_norm**0.5
     return total_norm
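For context, the function touched here computes the global L2 gradient norm: the square root of the summed squared per-parameter gradient norms. Reassembled from the visible lines into a standalone sketch (the surrounding loop is inferred from the context lines):

```python
import torch

def get_grad_norm(params, scale=1):
    # accumulate the squared L2 norm of each parameter's gradient,
    # then take the square root to get the global norm
    total_norm = 0.0
    for p in params:
        if p.grad is not None:
            param_norm = (p.grad.detach().data / scale).norm(2)
            total_norm += param_norm.item() ** 2
    total_norm = total_norm**0.5
    return total_norm
```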
@@ -619,7 +619,7 @@ def main():
             # update gumbel temperature
             gumbel_temperature = max(
-                args.max_gumbel_temperature * args.gumbel_temperature_decay ** completed_steps,
+                args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
                 args.min_gumbel_temperature,
             )
             if hasattr(model, "module"):
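This hunk, and the Flax and Trainer hunks further down, all reformat the same Gumbel-softmax temperature schedule: geometric decay per update step, clamped to a floor. A minimal sketch; the default values here are illustrative assumptions, not values from the diff:

```python
def gumbel_temperature(step, max_temp=2.0, min_temp=0.5, decay=0.999995):
    # the temperature shrinks by a factor of `decay` each step,
    # but never drops below min_temp
    return max(max_temp * decay**step, min_temp)

print(gumbel_temperature(0))        # 2.0
print(gumbel_temperature(500_000))  # 0.5 (raw decayed value ~0.16 is clamped to the floor)
```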
@@ -229,20 +229,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
             assert end_logits_tea.size() == end_logits_stu.size()
             loss_fct = nn.KLDivLoss(reduction="batchmean")
-            loss_start = (
-                loss_fct(
-                    nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
-                )
-                * (args.temperature ** 2)
-            )
-            loss_end = (
-                loss_fct(
-                    nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
-                )
-                * (args.temperature ** 2)
-            )
+            loss_start = loss_fct(
+                nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
+            ) * (args.temperature**2)
+            loss_end = loss_fct(
+                nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
+            ) * (args.temperature**2)
             loss_ce = (loss_start + loss_end) / 2.0
             loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
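The restructured block is standard soft-label distillation (Hinton et al.): KL divergence between temperature-softened student and teacher distributions, multiplied by T^2 so gradient magnitudes stay comparable across temperatures. A self-contained sketch of the same computation (logit shapes are assumed):

```python
import torch
import torch.nn as nn

def distill_loss(logits_stu, logits_tea, temperature=2.0):
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    return loss_fct(
        nn.functional.log_softmax(logits_stu / temperature, dim=-1),
        nn.functional.softmax(logits_tea / temperature, dim=-1),
    ) * (temperature**2)  # T**2 compensates for the 1/T softening of the gradients

loss = distill_loss(torch.randn(4, 10), torch.randn(4, 10))
```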
@@ -450,7 +450,7 @@ def main():
             negative_indices = batch.pop("sampled_negative_indices")
             gumbel_temperature = jnp.clip(
-                model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay ** state.step,
+                model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay**state.step,
                 a_min=model_args.min_gumbel_temperature,
             )
@@ -1264,7 +1264,7 @@ class Res5ROIHeads(nn.Module):
         self.feature_strides = {k: v.stride for k, v in input_shape.items()}
         self.feature_channels = {k: v.channels for k, v in input_shape.items()}
         self.cls_agnostic_bbox_reg = cfg.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
-        self.stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        self.stage_channel_factor = 2**3  # res5 is 8x res2
         self.out_channels = cfg.RESNETS.RES2_OUT_CHANNELS * self.stage_channel_factor
         # self.proposal_matcher = Matcher(
@@ -1419,7 +1419,7 @@ class AnchorGenerator(nn.Module):
         anchors = []
         for size in sizes:
-            area = size ** 2.0
+            area = size**2.0
             for aspect_ratio in aspect_ratios:
                 w = math.sqrt(area / aspect_ratio)
                 h = aspect_ratio * w
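The AnchorGenerator lines implement the usual anchor parameterization: for a target area and aspect ratio, w = sqrt(area / ratio) and h = ratio * w, so w * h recovers the area and h / w the ratio. A quick numeric check:

```python
import math

size, aspect_ratio = 32, 2.0
area = size**2.0
w = math.sqrt(area / aspect_ratio)
h = aspect_ratio * w
assert abs(w * h - area) < 1e-6          # the anchor keeps the target area
assert abs(h / w - aspect_ratio) < 1e-9  # and the target aspect ratio
```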
@@ -84,7 +84,7 @@ def schedule_threshold(
         spars_warmup_steps = initial_warmup * warmup_steps
         spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps
         mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps)
-        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3)
+        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3)
     regu_lambda = final_lambda * threshold / final_threshold
     return threshold, regu_lambda
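schedule_threshold is the cubic sparsity schedule from movement pruning: mul_coeff falls from 1 to 0 between warmup and the end of training, and cubing it front-loads the move from initial_threshold to final_threshold. A worked check at the halfway point:

```python
# at mul_coeff = 0.5 the cubic schedule has already covered
# 1 - 0.5**3 = 87.5% of the initial -> final distance
initial_threshold, final_threshold = 1.0, 0.1
mul_coeff = 0.5
threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3)
print(threshold)  # 0.2125
```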
@@ -285,14 +285,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     attention_mask=inputs["attention_mask"],
                 )
-                loss_logits = (
-                    nn.functional.kl_div(
-                        input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
-                        target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
-                        reduction="batchmean",
-                    )
-                    * (args.temperature ** 2)
-                )
+                loss_logits = nn.functional.kl_div(
+                    input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
+                    target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature**2)
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -88,7 +88,7 @@ def schedule_threshold(
         spars_warmup_steps = initial_warmup * warmup_steps
         spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps
         mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps)
-        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3)
+        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3)
     regu_lambda = final_lambda * threshold / final_threshold
     return threshold, regu_lambda
@@ -306,22 +306,16 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     attention_mask=inputs["attention_mask"],
                 )
-                loss_start = (
-                    nn.functional.kl_div(
-                        input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
-                        reduction="batchmean",
-                    )
-                    * (args.temperature ** 2)
-                )
-                loss_end = (
-                    nn.functional.kl_div(
-                        input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
-                        reduction="batchmean",
-                    )
-                    * (args.temperature ** 2)
-                )
+                loss_start = nn.functional.kl_div(
+                    input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                    target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature**2)
+                loss_end = nn.functional.kl_div(
+                    input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                    target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature**2)
                 loss_logits = (loss_start + loss_end) / 2.0
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -442,7 +442,7 @@ class BeamSearchScorerTS(torch.nn.Module):
         elif self.do_early_stopping:
             return True
         else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            cur_score = best_sum_logprobs / cur_len**self.length_penalty
             ret = self._beam_hyps_worst_scores[hypo_idx].item() >= cur_score
             return ret
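The beam-search line divides a hypothesis's summed log-probability by cur_len**length_penalty: with a penalty of 1.0 this is the mean log-probability per token, while larger exponents make the (negative) score less negative for long sequences and so favor longer outputs. Illustrative numbers:

```python
best_sum_logprobs = -6.0  # summed token log-probabilities of one hypothesis
for cur_len, length_penalty in [(10, 1.0), (10, 2.0)]:
    cur_score = best_sum_logprobs / cur_len**length_penalty
    print(cur_len, length_penalty, cur_score)
# 10 1.0 -0.6   (mean log-prob per token)
# 10 2.0 -0.06  (higher exponent -> less negative -> longer outputs favored)
```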
@@ -550,7 +550,7 @@ def generate_text_pplm(
         unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1)
-        pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
+        pert_probs = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
         pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
         # rescale
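The PPLM line fuses the perturbed and unperturbed next-token distributions as a weighted geometric mean, with gm_scale interpolating between the two; the script renormalizes afterwards (the "# rescale" comment). A hedged standalone sketch of that fusion step:

```python
import torch

def fuse_distributions(pert_probs, unpert_probs, gm_scale=0.9):
    # weighted geometric mean: gm_scale=1.0 keeps only the perturbed
    # distribution, gm_scale=0.0 only the unperturbed one
    fused = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale))
    return fused / fused.sum(dim=-1, keepdim=True)  # rescale back to a distribution
```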
@@ -1264,7 +1264,7 @@ class Res5ROIHeads(nn.Module):
         self.feature_strides = {k: v.stride for k, v in input_shape.items()}
         self.feature_channels = {k: v.channels for k, v in input_shape.items()}
         self.cls_agnostic_bbox_reg = cfg.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
-        self.stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        self.stage_channel_factor = 2**3  # res5 is 8x res2
         self.out_channels = cfg.RESNETS.RES2_OUT_CHANNELS * self.stage_channel_factor
         # self.proposal_matcher = Matcher(
@@ -1419,7 +1419,7 @@ class AnchorGenerator(nn.Module):
         anchors = []
         for size in sizes:
-            area = size ** 2.0
+            area = size**2.0
             for aspect_ratio in aspect_ratios:
                 w = math.sqrt(area / aspect_ratio)
                 h = aspect_ratio * w
@@ -273,11 +273,11 @@ class Wav2Vec2PreTrainer(Trainer):
         # make sure gumbel softmax temperature is decayed
         if self.args.n_gpu > 1 or self.deepspeed:
             model.module.set_gumbel_temperature(
-                max(self.max_gumbel_temp * self.gumbel_temp_decay ** self.num_update_step, self.min_gumbel_temp)
+                max(self.max_gumbel_temp * self.gumbel_temp_decay**self.num_update_step, self.min_gumbel_temp)
             )
         else:
             model.set_gumbel_temperature(
-                max(self.max_gumbel_temp * self.gumbel_temp_decay ** self.num_update_step, self.min_gumbel_temp)
+                max(self.max_gumbel_temp * self.gumbel_temp_decay**self.num_update_step, self.min_gumbel_temp)
             )
         return loss.detach()
@@ -93,7 +93,7 @@ if stale_egg_info.exists():
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
     "Pillow",
-    "black==21.4b0",
+    "black~=22.0",
     "codecarbon==1.2.0",
     "cookiecutter==1.7.2",
     "dataclasses",
@@ -166,7 +166,7 @@ _deps = [
 # packaging: "packaging"
 #
 # some of the values are versioned whereas others aren't.
-deps = {b: a for a, b in (re.findall(r"^(([^!=<>]+)(?:[!=<>].*)?$)", x)[0] for x in _deps)}
+deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)}
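The regex tweak is needed because "~=" is a new kind of specifier in the list: without "~" in the negated character class, the bare name captured for "black~=22.0" would have been "black~". Adding it makes compatible-release specifiers split like the others. A quick check of the updated pattern:

```python
import re

_deps = ["Pillow", "black~=22.0", "codecarbon==1.2.0"]
deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)}
print(deps["black"])   # 'black~=22.0'  (key is the bare name, value the full spec)
print(deps["Pillow"])  # 'Pillow'       (unversioned entries map to themselves)
```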
@@ -292,7 +292,7 @@ def replace_model_patterns(
             attributes_to_check.append("model_type")
     else:
         text = re.sub(
-            fr'(\s*)model_type = "{old_model_patterns.model_type}"',
+            rf'(\s*)model_type = "{old_model_patterns.model_type}"',
             r'\1model_type = "[MODEL_TYPE]"',
             text,
         )
@@ -301,8 +301,8 @@ def replace_model_patterns(
     # not the new one. We can't just do a replace in all the text and will need a special regex
     if old_model_patterns.model_upper_cased == old_model_patterns.model_camel_cased:
         old_model_value = old_model_patterns.model_upper_cased
-        if re.search(fr"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None:
-            text = re.sub(fr"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text)
+        if re.search(rf"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None:
+            text = re.sub(rf"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text)
     else:
         attributes_to_check.append("model_upper_cased")
@@ -750,8 +750,8 @@ def clean_frameworks_in_init(
         return
     remove_pattern = "|".join(to_remove)
-    re_conditional_imports = re.compile(fr"^\s*if is_({remove_pattern})_available\(\):\s*$")
-    re_is_xxx_available = re.compile(fr"is_({remove_pattern})_available")
+    re_conditional_imports = re.compile(rf"^\s*if is_({remove_pattern})_available\(\):\s*$")
+    re_is_xxx_available = re.compile(rf"is_({remove_pattern})_available")
     with open(init_file, "r", encoding="utf-8") as f:
         content = f.read()
@@ -831,7 +831,7 @@ def add_model_to_main_init(
         if framework is not None and frameworks is not None and framework not in frameworks:
             new_lines.append(lines[idx])
             idx += 1
-        elif re.search(fr'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None:
+        elif re.search(rf'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None:
             block = [lines[idx]]
             indent = find_indent(lines[idx])
             idx += 1
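The fr'...' to rf'...' changes above are purely cosmetic: Python treats string-literal prefixes as order-insensitive, so both spellings build the identical raw f-string; the diff just standardizes on rf. For example:

```python
name = "bert"
# prefix order is insignificant; both are the raw f-string 'models.bert\\d'
assert rf"models.{name}\d" == fr"models.{name}\d" == "models.bert\\d"
```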
@@ -3,7 +3,7 @@
 # 2. run `make deps_table_update``
 deps = {
     "Pillow": "Pillow",
-    "black": "black==21.4b0",
+    "black": "black~=22.0",
     "codecarbon": "codecarbon==1.2.0",
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
@@ -392,6 +392,6 @@ class BeamHypotheses:
         elif self.early_stopping:
             return True
         else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            cur_score = best_sum_logprobs / cur_len**self.length_penalty
             ret = self.worst_score >= cur_score
             return ret
@@ -679,7 +679,7 @@ class FlaxGenerationMixin:
             not_max_length_yet = state.cur_len < max_length
             # 2. can the new beams still improve?
-            best_running_score = state.running_scores[:, -1:] / (max_length ** length_penalty)
+            best_running_score = state.running_scores[:, -1:] / (max_length**length_penalty)
             worst_finished_score = jnp.where(
                 state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7)
             )
@@ -769,7 +769,7 @@ class FlaxGenerationMixin:
             # - add length penalty
             # - make sure no scores can be added anymore if beam is full
             # - make sure still running sequences cannot be chosen as finalized beam
-            topk_log_probs = topk_log_probs / (state.cur_len ** length_penalty)
+            topk_log_probs = topk_log_probs / (state.cur_len**length_penalty)
             beams_in_batch_are_full = (
                 jnp.broadcast_to(state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape)
                 & early_stopping
@@ -1694,6 +1694,6 @@ class BeamHypotheses(object):
         elif self.early_stopping:
             return True
         else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            cur_score = best_sum_logprobs / cur_len**self.length_penalty
             ret = self.worst_score >= cur_score
             return ret
@@ -1827,7 +1827,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range

     def build(self, input_shape):
         """
@@ -146,7 +146,7 @@ class BartAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
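Both hidden_size**-0.5 in TFSharedEmbeddings above and head_dim**-0.5 here are the standard 1/sqrt(d) transformer scale factors: the first sets the embedding initializer scale, the second is the scaled dot-product attention divisor from "Attention Is All You Need". A quick sanity check:

```python
import math

head_dim = 64
scaling = head_dim**-0.5
assert math.isclose(scaling, 1 / math.sqrt(head_dim))  # 0.125 for 64-dim heads
```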
@@ -152,7 +152,7 @@ class TFBartAttention(tf.keras.layers.Layer):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
@@ -75,10 +75,10 @@ def bytes_to_unicode():
     )
     cs = bs[:]
     n = 0
-    for b in range(2 ** 8):
+    for b in range(2**8):
         if b not in bs:
             bs.append(b)
-            cs.append(2 ** 8 + n)
+            cs.append(2**8 + n)
             n += 1
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
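For context, bytes_to_unicode builds GPT-2's reversible byte-to-unicode table: printable bytes map to themselves and the remaining ones are shifted past 2**8 so every byte value gets a visible character. A self-contained version, with the bs initialization (truncated in the hunk) filled in from the standard GPT-2 implementation:

```python
def bytes_to_unicode():
    # byte values that already render as printable characters
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)  # shift the unprintable bytes to 256, 257, ...
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

assert len(bytes_to_unicode()) == 256  # one visible character per byte value
```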