Unverified commit 7732d0fe, authored by Lysandre Debut, committed by GitHub

Upgrade black to version ~=22.0 (#15565)

* Upgrade black to version ~=22.0

* Check copies

* Fix code
parent d923f762
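Most hunks below are pure reformatting from the black upgrade: 22.x "hugs" the power operator when both operands are simple (plain names, literals, dotted names) and keeps the spaces when either side is a more complex expression. A minimal illustration of how the rule plays out in this diff (the variables here are made up for the example):

```python
# simple operands: black ~=22.0 removes the spaces around **
x = 3.0
y = x**2
size = 2**8

# non-simple operands (calls, parenthesized expressions): spaces are kept,
# matching lines like `param_norm.item() ** 2` that this diff leaves untouched
gm_scale = 0.9
z = x ** (1 - gm_scale)
w = abs(x) ** 2
```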
@@ -350,7 +350,7 @@ def get_grad_norm(params, scale=1):
         if p.grad is not None:
             param_norm = (p.grad.detach().data / scale).norm(2)
             total_norm += param_norm.item() ** 2
-    total_norm = total_norm ** 0.5
+    total_norm = total_norm**0.5
     return total_norm
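For context, the function touched here computes the global L2 gradient norm: the square root of the summed squared per-parameter gradient norms. Reassembled from the visible lines into a standalone sketch (the surrounding loop is inferred from the context lines):

```python
import torch

def get_grad_norm(params, scale=1):
    # accumulate the squared L2 norm of each parameter's gradient,
    # then take the square root to get the global norm
    total_norm = 0.0
    for p in params:
        if p.grad is not None:
            param_norm = (p.grad.detach().data / scale).norm(2)
            total_norm += param_norm.item() ** 2
    total_norm = total_norm**0.5
    return total_norm
```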
@@ -619,7 +619,7 @@ def main():
             # update gumbel temperature
             gumbel_temperature = max(
-                args.max_gumbel_temperature * args.gumbel_temperature_decay ** completed_steps,
+                args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
                 args.min_gumbel_temperature,
             )
             if hasattr(model, "module"):
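This hunk, and the Flax and Trainer hunks further down, all reformat the same Gumbel-softmax temperature schedule: geometric decay per update step, clamped to a floor. A minimal sketch; the default values here are illustrative assumptions, not values from the diff:

```python
def gumbel_temperature(step, max_temp=2.0, min_temp=0.5, decay=0.999995):
    # the temperature shrinks by a factor of `decay` each step,
    # but never drops below min_temp
    return max(max_temp * decay**step, min_temp)

print(gumbel_temperature(0))        # 2.0
print(gumbel_temperature(500_000))  # 0.5 (raw decayed value ~0.16 is clamped to the floor)
```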
@@ -229,20 +229,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
             assert end_logits_tea.size() == end_logits_stu.size()
             loss_fct = nn.KLDivLoss(reduction="batchmean")
-            loss_start = (
-                loss_fct(
-                    nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
-                )
-                * (args.temperature ** 2)
-            )
-            loss_end = (
-                loss_fct(
-                    nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
-                )
-                * (args.temperature ** 2)
-            )
+            loss_start = loss_fct(
+                nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
+            ) * (args.temperature**2)
+            loss_end = loss_fct(
+                nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
+            ) * (args.temperature**2)
             loss_ce = (loss_start + loss_end) / 2.0
             loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
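The restructured block is standard soft-label distillation (Hinton et al.): KL divergence between temperature-softened student and teacher distributions, multiplied by T^2 so gradient magnitudes stay comparable across temperatures. A self-contained sketch of the same computation (logit shapes are assumed):

```python
import torch
import torch.nn as nn

def distill_loss(logits_stu, logits_tea, temperature=2.0):
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    return loss_fct(
        nn.functional.log_softmax(logits_stu / temperature, dim=-1),
        nn.functional.softmax(logits_tea / temperature, dim=-1),
    ) * (temperature**2)  # T**2 compensates for the 1/T softening of the gradients

loss = distill_loss(torch.randn(4, 10), torch.randn(4, 10))
```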
@@ -450,7 +450,7 @@ def main():
             negative_indices = batch.pop("sampled_negative_indices")
             gumbel_temperature = jnp.clip(
-                model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay ** state.step,
+                model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay**state.step,
                 a_min=model_args.min_gumbel_temperature,
             )
@@ -1264,7 +1264,7 @@ class Res5ROIHeads(nn.Module):
         self.feature_strides = {k: v.stride for k, v in input_shape.items()}
         self.feature_channels = {k: v.channels for k, v in input_shape.items()}
         self.cls_agnostic_bbox_reg = cfg.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
-        self.stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        self.stage_channel_factor = 2**3  # res5 is 8x res2
         self.out_channels = cfg.RESNETS.RES2_OUT_CHANNELS * self.stage_channel_factor
         # self.proposal_matcher = Matcher(
@@ -1419,7 +1419,7 @@ class AnchorGenerator(nn.Module):
         anchors = []
         for size in sizes:
-            area = size ** 2.0
+            area = size**2.0
             for aspect_ratio in aspect_ratios:
                 w = math.sqrt(area / aspect_ratio)
                 h = aspect_ratio * w
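The AnchorGenerator lines implement the usual anchor parameterization: for a target area and aspect ratio, w = sqrt(area / ratio) and h = ratio * w, so w * h recovers the area and h / w the ratio. A quick numeric check:

```python
import math

size, aspect_ratio = 32, 2.0
area = size**2.0
w = math.sqrt(area / aspect_ratio)
h = aspect_ratio * w
assert abs(w * h - area) < 1e-6          # the anchor keeps the target area
assert abs(h / w - aspect_ratio) < 1e-9  # and the target aspect ratio
```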
@@ -84,7 +84,7 @@ def schedule_threshold(
         spars_warmup_steps = initial_warmup * warmup_steps
         spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps
         mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps)
-        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3)
+        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3)
     regu_lambda = final_lambda * threshold / final_threshold
     return threshold, regu_lambda
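schedule_threshold is the cubic sparsity schedule from movement pruning: mul_coeff falls from 1 to 0 between warmup and the end of training, and cubing it front-loads the move from initial_threshold to final_threshold. A worked check at the halfway point:

```python
# at mul_coeff = 0.5 the cubic schedule has already covered
# 1 - 0.5**3 = 87.5% of the initial -> final distance
initial_threshold, final_threshold = 1.0, 0.1
mul_coeff = 0.5
threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3)
print(threshold)  # 0.2125
```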
@@ -285,14 +285,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     attention_mask=inputs["attention_mask"],
                 )
-                loss_logits = (
-                    nn.functional.kl_div(
-                        input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
-                        target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
-                        reduction="batchmean",
-                    )
-                    * (args.temperature ** 2)
-                )
+                loss_logits = nn.functional.kl_div(
+                    input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
+                    target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature**2)
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -88,7 +88,7 @@ def schedule_threshold(
         spars_warmup_steps = initial_warmup * warmup_steps
         spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps
         mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps)
-        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3)
+        threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3)
     regu_lambda = final_lambda * threshold / final_threshold
     return threshold, regu_lambda
@@ -306,22 +306,16 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     attention_mask=inputs["attention_mask"],
                 )
-                loss_start = (
-                    nn.functional.kl_div(
-                        input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
-                        reduction="batchmean",
-                    )
-                    * (args.temperature ** 2)
-                )
-                loss_end = (
-                    nn.functional.kl_div(
-                        input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
-                        reduction="batchmean",
-                    )
-                    * (args.temperature ** 2)
-                )
+                loss_start = nn.functional.kl_div(
+                    input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                    target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature**2)
+                loss_end = nn.functional.kl_div(
+                    input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                    target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature**2)
                 loss_logits = (loss_start + loss_end) / 2.0
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -442,7 +442,7 @@ class BeamSearchScorerTS(torch.nn.Module):
         elif self.do_early_stopping:
             return True
         else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            cur_score = best_sum_logprobs / cur_len**self.length_penalty
             ret = self._beam_hyps_worst_scores[hypo_idx].item() >= cur_score
             return ret
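The beam-search line divides a hypothesis's summed log-probability by cur_len**length_penalty: with a penalty of 1.0 this is the mean log-probability per token, while larger exponents make the (negative) score less negative for long sequences and so favor longer outputs. Illustrative numbers:

```python
best_sum_logprobs = -6.0  # summed token log-probabilities of one hypothesis
for cur_len, length_penalty in [(10, 1.0), (10, 2.0)]:
    cur_score = best_sum_logprobs / cur_len**length_penalty
    print(cur_len, length_penalty, cur_score)
# 10 1.0 -0.6   (mean log-prob per token)
# 10 2.0 -0.06  (higher exponent -> less negative -> longer outputs favored)
```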
@@ -550,7 +550,7 @@ def generate_text_pplm(
         unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1)
-        pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
+        pert_probs = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
         pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
         # rescale
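The PPLM line fuses the perturbed and unperturbed next-token distributions as a weighted geometric mean, with gm_scale interpolating between the two; the script renormalizes afterwards (the "# rescale" comment). A hedged standalone sketch of that fusion step:

```python
import torch

def fuse_distributions(pert_probs, unpert_probs, gm_scale=0.9):
    # weighted geometric mean: gm_scale=1.0 keeps only the perturbed
    # distribution, gm_scale=0.0 only the unperturbed one
    fused = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale))
    return fused / fused.sum(dim=-1, keepdim=True)  # rescale back to a distribution
```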
@@ -1264,7 +1264,7 @@ class Res5ROIHeads(nn.Module):
         self.feature_strides = {k: v.stride for k, v in input_shape.items()}
         self.feature_channels = {k: v.channels for k, v in input_shape.items()}
         self.cls_agnostic_bbox_reg = cfg.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
-        self.stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        self.stage_channel_factor = 2**3  # res5 is 8x res2
         self.out_channels = cfg.RESNETS.RES2_OUT_CHANNELS * self.stage_channel_factor
         # self.proposal_matcher = Matcher(
@@ -1419,7 +1419,7 @@ class AnchorGenerator(nn.Module):
         anchors = []
         for size in sizes:
-            area = size ** 2.0
+            area = size**2.0
             for aspect_ratio in aspect_ratios:
                 w = math.sqrt(area / aspect_ratio)
                 h = aspect_ratio * w
@@ -273,11 +273,11 @@ class Wav2Vec2PreTrainer(Trainer):
         # make sure gumbel softmax temperature is decayed
         if self.args.n_gpu > 1 or self.deepspeed:
             model.module.set_gumbel_temperature(
-                max(self.max_gumbel_temp * self.gumbel_temp_decay ** self.num_update_step, self.min_gumbel_temp)
+                max(self.max_gumbel_temp * self.gumbel_temp_decay**self.num_update_step, self.min_gumbel_temp)
             )
         else:
             model.set_gumbel_temperature(
-                max(self.max_gumbel_temp * self.gumbel_temp_decay ** self.num_update_step, self.min_gumbel_temp)
+                max(self.max_gumbel_temp * self.gumbel_temp_decay**self.num_update_step, self.min_gumbel_temp)
             )
         return loss.detach()
@@ -93,7 +93,7 @@ if stale_egg_info.exists():
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
     "Pillow",
-    "black==21.4b0",
+    "black~=22.0",
     "codecarbon==1.2.0",
     "cookiecutter==1.7.2",
     "dataclasses",
@@ -166,7 +166,7 @@ _deps = [
 # packaging: "packaging"
 #
 # some of the values are versioned whereas others aren't.
-deps = {b: a for a, b in (re.findall(r"^(([^!=<>]+)(?:[!=<>].*)?$)", x)[0] for x in _deps)}
+deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)}
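The regex tweak is needed because "~=" is a new kind of specifier in the list: without "~" in the negated character class, the bare name captured for "black~=22.0" would have been "black~". Adding it makes compatible-release specifiers split like the others. A quick check of the updated pattern:

```python
import re

_deps = ["Pillow", "black~=22.0", "codecarbon==1.2.0"]
deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)}
print(deps["black"])   # 'black~=22.0'  (key is the bare name, value the full spec)
print(deps["Pillow"])  # 'Pillow'       (unversioned entries map to themselves)
```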
@@ -292,7 +292,7 @@ def replace_model_patterns(
             attributes_to_check.append("model_type")
     else:
         text = re.sub(
-            fr'(\s*)model_type = "{old_model_patterns.model_type}"',
+            rf'(\s*)model_type = "{old_model_patterns.model_type}"',
             r'\1model_type = "[MODEL_TYPE]"',
             text,
         )
@@ -301,8 +301,8 @@ def replace_model_patterns(
     # not the new one. We can't just do a replace in all the text and will need a special regex
     if old_model_patterns.model_upper_cased == old_model_patterns.model_camel_cased:
         old_model_value = old_model_patterns.model_upper_cased
-        if re.search(fr"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None:
-            text = re.sub(fr"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text)
+        if re.search(rf"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None:
+            text = re.sub(rf"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text)
     else:
         attributes_to_check.append("model_upper_cased")
@@ -750,8 +750,8 @@ def clean_frameworks_in_init(
         return
     remove_pattern = "|".join(to_remove)
-    re_conditional_imports = re.compile(fr"^\s*if is_({remove_pattern})_available\(\):\s*$")
-    re_is_xxx_available = re.compile(fr"is_({remove_pattern})_available")
+    re_conditional_imports = re.compile(rf"^\s*if is_({remove_pattern})_available\(\):\s*$")
+    re_is_xxx_available = re.compile(rf"is_({remove_pattern})_available")
     with open(init_file, "r", encoding="utf-8") as f:
         content = f.read()
@@ -831,7 +831,7 @@ def add_model_to_main_init(
         if framework is not None and frameworks is not None and framework not in frameworks:
             new_lines.append(lines[idx])
             idx += 1
-        elif re.search(fr'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None:
+        elif re.search(rf'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None:
             block = [lines[idx]]
             indent = find_indent(lines[idx])
             idx += 1
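The fr'...' to rf'...' changes above are purely cosmetic: Python treats string-literal prefixes as order-insensitive, so both spellings build the identical raw f-string; the diff just standardizes on rf. For example:

```python
name = "bert"
# prefix order is insignificant; both are the raw f-string 'models.bert\\d'
assert rf"models.{name}\d" == fr"models.{name}\d" == "models.bert\\d"
```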
@@ -3,7 +3,7 @@
 # 2. run `make deps_table_update``
 deps = {
     "Pillow": "Pillow",
-    "black": "black==21.4b0",
+    "black": "black~=22.0",
     "codecarbon": "codecarbon==1.2.0",
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
@@ -392,6 +392,6 @@ class BeamHypotheses:
         elif self.early_stopping:
             return True
         else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            cur_score = best_sum_logprobs / cur_len**self.length_penalty
             ret = self.worst_score >= cur_score
             return ret
@@ -679,7 +679,7 @@ class FlaxGenerationMixin:
             not_max_length_yet = state.cur_len < max_length
             # 2. can the new beams still improve?
-            best_running_score = state.running_scores[:, -1:] / (max_length ** length_penalty)
+            best_running_score = state.running_scores[:, -1:] / (max_length**length_penalty)
             worst_finished_score = jnp.where(
                 state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7)
             )
@@ -769,7 +769,7 @@ class FlaxGenerationMixin:
             # - add length penalty
             # - make sure no scores can be added anymore if beam is full
             # - make sure still running sequences cannot be chosen as finalized beam
-            topk_log_probs = topk_log_probs / (state.cur_len ** length_penalty)
+            topk_log_probs = topk_log_probs / (state.cur_len**length_penalty)
             beams_in_batch_are_full = (
                 jnp.broadcast_to(state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape)
                 & early_stopping
@@ -1694,6 +1694,6 @@ class BeamHypotheses(object):
         elif self.early_stopping:
             return True
         else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            cur_score = best_sum_logprobs / cur_len**self.length_penalty
             ret = self.worst_score >= cur_score
             return ret
@@ -1827,7 +1827,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
+        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range

     def build(self, input_shape):
         """
@@ -146,7 +146,7 @@ class BartAttention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
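Both hidden_size**-0.5 in TFSharedEmbeddings above and head_dim**-0.5 here are the standard 1/sqrt(d) transformer scale factors: the first sets the embedding initializer scale, the second is the scaled dot-product attention divisor from "Attention Is All You Need". A quick sanity check:

```python
import math

head_dim = 64
scaling = head_dim**-0.5
assert math.isclose(scaling, 1 / math.sqrt(head_dim))  # 0.125 for 64-dim heads
```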
@@ -152,7 +152,7 @@ class TFBartAttention(tf.keras.layers.Layer):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
@@ -75,10 +75,10 @@ def bytes_to_unicode():
     )
     cs = bs[:]
     n = 0
-    for b in range(2 ** 8):
+    for b in range(2**8):
         if b not in bs:
             bs.append(b)
-            cs.append(2 ** 8 + n)
+            cs.append(2**8 + n)
             n += 1
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
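For context, bytes_to_unicode builds GPT-2's reversible byte-to-unicode table: printable bytes map to themselves and the remaining ones are shifted past 2**8 so every byte value gets a visible character. A self-contained version, with the bs initialization (truncated in the hunk) filled in from the standard GPT-2 implementation:

```python
def bytes_to_unicode():
    # byte values that already render as printable characters
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)  # shift the unprintable bytes to 256, 257, ...
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

assert len(bytes_to_unicode()) == 256  # one visible character per byte value
```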