"docs/vscode:/vscode.git/clone" did not exist on "7ca46335553609e4852dcb018c73cd5215e6e25a"
Unverified commit acc3bd9d, authored by Sylvain Gugger and committed by GitHub

Enforce string-formatting with f-strings (#10980)



* First third

* Styling and fix mistake

* Quality

* All the rest

* Treat %s and %d

* typo

* Missing )

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent d0b3797a
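For readers skimming the diff below: the pattern enforced throughout is the replacement of `str.format()` and `%`-style interpolation with f-strings. A minimal before/after sketch (the values and variable names here are illustrative only, not taken from the changed files):

    # Hypothetical values, for illustration only.
    archive_file = "pytorch_model.bin"
    num_buckets = 64

    # Before: str.format() and %-style interpolation.
    old_info = "loading file {}".format(archive_file)
    old_warn = "config.num_buckets is not set. Setting config.num_buckets to %d..." % num_buckets

    # After: f-strings evaluate the expression inline at the call site.
    new_info = f"loading file {archive_file}"
    new_warn = f"config.num_buckets is not set. Setting config.num_buckets to {num_buckets}..."

    # Both styles produce the same strings.
    assert old_info == new_info
    assert old_warn == new_warn

The same rewrite applies to `%s`/`%d` placeholders, which is what the "Treat %s and %d" step in the commit message covers.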
@@ -133,20 +133,20 @@ class LegacyIndex(Index):
            )
            raise EnvironmentError(msg)
        if resolved_archive_file == archive_file:
-            logger.info("loading file {}".format(archive_file))
+            logger.info(f"loading file {archive_file}")
        else:
-            logger.info("loading file {} from cache at {}".format(archive_file, resolved_archive_file))
+            logger.info(f"loading file {archive_file} from cache at {resolved_archive_file}")
        return resolved_archive_file

    def _load_passages(self):
-        logger.info("Loading passages from {}".format(self.index_path))
+        logger.info(f"Loading passages from {self.index_path}")
        passages_path = self._resolve_path(self.index_path, self.PASSAGE_FILENAME)
        with open(passages_path, "rb") as passages_file:
            passages = pickle.load(passages_file)
        return passages

    def _deserialize_index(self):
-        logger.info("Loading index from {}".format(self.index_path))
+        logger.info(f"Loading index from {self.index_path}")
        resolved_index_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index.dpr")
        self.index = faiss.read_index(resolved_index_path)
        resolved_meta_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index_meta.dpr")
@@ -200,12 +200,12 @@ class HFIndexBase(Index):

    def _check_dataset_format(self, with_index: bool):
        if not isinstance(self.dataset, Dataset):
-            raise ValueError("Dataset should be a datasets.Dataset object, but got {}".format(type(self.dataset)))
+            raise ValueError(f"Dataset should be a datasets.Dataset object, but got {type(self.dataset)}")
        if len({"title", "text", "embeddings"} - set(self.dataset.column_names)) > 0:
            raise ValueError(
                "Dataset should be a dataset with the following columns: "
                "title (str), text (str) and embeddings (arrays of dimension vector_size), "
-                "but got columns {}".format(self.dataset.column_names)
+                f"but got columns {self.dataset.column_names}"
            )
        if with_index and "embeddings" not in self.dataset.list_indexes():
            raise ValueError(
@@ -269,7 +269,7 @@ class CanonicalHFIndex(HFIndexBase):
        self.index_name = index_name
        self.index_path = index_path
        self.use_dummy_dataset = use_dummy_dataset
-        logger.info("Loading passages from {}".format(self.dataset_name))
+        logger.info(f"Loading passages from {self.dataset_name}")
        dataset = load_dataset(
            self.dataset_name, with_index=False, split=self.dataset_split, dummy=self.use_dummy_dataset
        )
@@ -277,10 +277,10 @@ class CanonicalHFIndex(HFIndexBase):

    def init_index(self):
        if self.index_path is not None:
-            logger.info("Loading index from {}".format(self.index_path))
+            logger.info(f"Loading index from {self.index_path}")
            self.dataset.load_faiss_index("embeddings", file=self.index_path)
        else:
-            logger.info("Loading index from {}".format(self.dataset_name + " with index name " + self.index_name))
+            logger.info(f"Loading index from {self.dataset_name} with index name {self.index_name}")
            self.dataset = load_dataset(
                self.dataset_name,
                with_embeddings=True,
@@ -313,7 +313,7 @@ class CustomHFIndex(HFIndexBase):

    @classmethod
    def load_from_disk(cls, vector_size, dataset_path, index_path):
-        logger.info("Loading passages from {}".format(dataset_path))
+        logger.info(f"Loading passages from {dataset_path}")
        if dataset_path is None or index_path is None:
            raise ValueError(
                "Please provide ``dataset_path`` and ``index_path`` after calling ``dataset.save_to_disk(dataset_path)`` "
@@ -324,7 +324,7 @@ class CustomHFIndex(HFIndexBase):

    def init_index(self):
        if not self.is_initialized():
-            logger.info("Loading index from {}".format(self.index_path))
+            logger.info(f"Loading index from {self.index_path}")
            self.dataset.load_faiss_index("embeddings", file=self.index_path)
            self._index_initialized = True
@@ -520,9 +520,7 @@ class RagRetriever:
            start_time = time.time()
            ids, vectors = self.index.get_top_docs(question_hidden_states, n_docs)
            logger.debug(
-                "index search time: {} sec, batch size {}".format(
-                    time.time() - start_time, question_hidden_states.shape
-                )
+                f"index search time: {time.time() - start_time} sec, batch size {question_hidden_states.shape}"
            )
            ids_batched.extend(ids)
            vectors_batched.extend(vectors)
...
@@ -34,7 +34,7 @@ class RagTokenizer:

    def save_pretrained(self, save_directory):
        if os.path.isfile(save_directory):
-            raise ValueError("Provided path ({}) should be a directory, not a file".format(save_directory))
+            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
        os.makedirs(save_directory, exist_ok=True)
        question_encoder_path = os.path.join(save_directory, "question_encoder_tokenizer")
        generator_path = os.path.join(save_directory, "generator_tokenizer")
...
@@ -30,10 +30,10 @@ logging.set_verbosity_info()

def set_param(torch_layer, weight, bias=None):
    # set parameter of one layer
-    assert torch_layer.weight.shape == weight.shape, "{} layer.weight does not match".format(torch_layer)
+    assert torch_layer.weight.shape == weight.shape, f"{torch_layer} layer.weight does not match"
    torch_layer.weight = torch.nn.Parameter(weight)
    if bias is not None:
-        assert torch_layer.bias.shape == bias.shape, "{} layer.bias does not match".format(torch_layer)
+        assert torch_layer.bias.shape == bias.shape, f"{torch_layer} layer.bias does not match"
        torch_layer.bias = torch.nn.Parameter(bias)
@@ -150,9 +150,9 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
        position_embeddings = torch_model_reformer.embeddings.position_embeddings
        for emb_idx in range(len(position_embeddings.weights)):
            emb_weights = np.asarray(weights[3][emb_idx][0])
-            assert position_embeddings.weights[emb_idx].shape == emb_weights.shape, "{} emb does not match".format(
-                position_embeddings[emb_idx]
-            )
+            assert (
+                position_embeddings.weights[emb_idx].shape == emb_weights.shape
+            ), f"{position_embeddings[emb_idx]} emb does not match"
            position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights))

    trax_layer_weights = weights[5]
@@ -185,7 +185,7 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):

def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = ReformerConfig.from_json_file(config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
+    print(f"Building PyTorch model from configuration: {config}")
    model = ReformerModelWithLMHead(config)

    with open(trax_model_pkl_path, "rb") as f:
@@ -194,7 +194,7 @@ def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path):
    set_model_weights_in_torch(model_weights, model, config.hidden_size)

    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(model.state_dict(), pytorch_dump_path)
...
@@ -90,9 +90,8 @@ def _get_least_common_mult_chunk_len(config):
        return np.lcm(config.lsh_attn_chunk_length, config.local_attn_chunk_length)
    else:
        raise NotImplementedError(
-            "Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format(
-                config.attn_layers
-            )
+            f"Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {config.attn_layers}. Select "
+            "attn layer types from ['lsh', 'local'] only."
        )
@@ -107,9 +106,8 @@ def _get_min_chunk_len(config):
        return min(config.lsh_attn_chunk_length, config.local_attn_chunk_length)
    else:
        raise NotImplementedError(
-            "Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format(
-                config.attn_layers
-            )
+            f"Only attn layer types 'lsh' and 'local' exist, but `config.attn_layers`: {config.attn_layers}. Select "
+            "attn layer types from ['lsh', 'local'] only."
        )
@@ -127,11 +125,11 @@ class AxialPositionEmbeddings(nn.Module):
        self.least_common_mult_chunk_length = _get_least_common_mult_chunk_len(config)
        self.weights = nn.ParameterList()

-        assert (
-            sum(self.axial_pos_embds_dim) == config.hidden_size
-        ), "Make sure that config.axial_pos_embds factors: {} sum to config.hidden_size: {}".format(
-            self.axial_pos_embds_dim, config.hidden_size
-        )
+        if sum(self.axial_pos_embds_dim) != config.hidden_size:
+            raise ValueError(
+                f"Make sure that config.axial_pos_embds factors: {self.axial_pos_embds_dim} sum to "
+                f"config.hidden_size: {config.hidden_size}"
+            )

        # create weights
        for axis, axial_pos_embd_dim in enumerate(self.axial_pos_embds_dim):
@@ -153,11 +151,14 @@ class AxialPositionEmbeddings(nn.Module):
            ]

        if self.training is True:
-            assert (
-                reduce(mul, self.axial_pos_shape) == sequence_length
-            ), "If training, make sure that config.axial_pos_shape factors: {} multiply to sequence length. Got prod({}) != sequence_length: {}. You might want to consider padding your sequence length to {} or changing config.axial_pos_shape.".format(
-                self.axial_pos_shape, self.axial_pos_shape, sequence_length, reduce(mul, self.axial_pos_shape)
-            )
+            if reduce(mul, self.axial_pos_shape) != sequence_length:
+                raise ValueError(
+                    f"If training, make sure that config.axial_pos_shape factors: {self.axial_pos_shape} multiply to "
+                    f"sequence length. Got prod({self.axial_pos_shape}) != sequence_length: {sequence_length}. "
+                    f"You might want to consider padding your sequence length to {reduce(mul, self.axial_pos_shape)} "
+                    "or changing config.axial_pos_shape."
+                )

            if self.dropout > 0:
                weights = torch.cat(broadcasted_weights, dim=-1)
                # permute weights so that 2D correctly drops dims 1 and 2
@@ -177,13 +178,12 @@ class AxialPositionEmbeddings(nn.Module):
                )

        else:
-            assert (
-                reduce(mul, self.axial_pos_shape) >= sequence_length
-            ), "Make sure that config.axial_pos_shape factors: {} multiply at least to max(sequence_length, least_common_mult_chunk_length): max({}, {})".format(
-                self.axial_pos_shape,
-                sequence_length,
-                self.least_common_mult_chunk_length,
-            )
+            if reduce(mul, self.axial_pos_shape) < sequence_length:
+                raise ValueError(
+                    f"Make sure that config.axial_pos_shape factors: {self.axial_pos_shape} multiply at least to "
+                    f"max(sequence_length, least_common_mult_chunk_length): max({sequence_length}, "
+                    f"{self.least_common_mult_chunk_length})."
+                )

            # compute how many columns are needed
            max_position_id = position_ids.max().item()
@@ -252,11 +252,11 @@ class ReformerEmbeddings(nn.Module):
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

-        assert (
-            position_ids.shape[-1] <= self.max_position_embeddings
-        ), "Sequence Length: {} has to be larger equal than config.max_position_embeddings: {}".format(
-            position_ids.shape[-1], self.max_position_embeddings
-        )
+        if position_ids.shape[-1] > self.max_position_embeddings:
+            raise ValueError(
+                f"Sequence Length: {position_ids.shape[-1]} has to be larger equal than "
+                f"config.max_position_embeddings {self.max_position_embeddings}."
+            )

        # dropout
        embeddings = nn.functional.dropout(inputs_embeds, p=self.dropout, training=self.training)
@@ -322,7 +322,7 @@ class EfficientAttentionMixin:
        elif len(vectors.shape) == 3:
            return torch.reshape(vectors, split_dim_shape)
        else:
-            raise ValueError("Input vector rank should be one of [3, 4], but is: {}".format(len(vectors.shape)))
+            raise ValueError(f"Input vector rank should be one of [3, 4], but is: {len(vectors.shape)}")


class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
@@ -451,14 +451,10 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
        assert (
            query_key_vectors.shape[-1] == self.attention_head_size
-        ), "last dim of query_key_vectors is {} but should be {}.".format(
-            query_key_vectors.shape[-1], self.attention_head_size
-        )
+        ), f"last dim of query_key_vectors is {query_key_vectors.shape[-1]} but should be {self.attention_head_size}."
        assert (
            value_vectors.shape[-1] == self.attention_head_size
-        ), "last dim of value_vectors is {} but should be {}.".format(
-            value_vectors.shape[-1], self.attention_head_size
-        )
+        ), f"last dim of value_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."

        do_standard_self_attention = (sequence_length <= self.chunk_length) or (
            use_cache and past_buckets_states[1] is not None
@@ -479,7 +475,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
            assert (
                int(buckets.shape[-1]) == num_hashes * sequence_length
-            ), "last dim of buckets is {}, but should be {}".format(buckets.shape[-1], num_hashes * sequence_length)
+            ), f"last dim of buckets is {buckets.shape[-1]}, but should be {num_hashes * sequence_length}"

            sorted_bucket_idx, undo_sorted_bucket_idx = self._get_sorted_bucket_idx_and_undo_sorted_bucket_idx(
                sequence_length, buckets, num_hashes
@@ -616,16 +612,16 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
        if isinstance(self.num_buckets, int):
            assert (
                self.num_buckets % 2 == 0
-            ), "There should be an even number of bucktes, but `self.num_bucktes`: {}".format(self.num_buckets)
+            ), f"There should be an even number of bucktes, but `self.num_bucktes`: {self.num_buckets}"
            rotation_size = self.num_buckets
            num_buckets = self.num_buckets
        else:
            # Factorize the hash if self.num_buckets is a list or tuple
            rotation_size, num_buckets = 0, 1
            for bucket_factor in self.num_buckets:
-                assert bucket_factor % 2 == 0, "The number of buckets should be even, but `num_bucket`: {}".format(
-                    bucket_factor
-                )
+                assert (
+                    bucket_factor % 2 == 0
+                ), f"The number of buckets should be even, but `num_bucket`: {bucket_factor}"
                rotation_size = rotation_size + bucket_factor
                num_buckets = num_buckets * bucket_factor
@@ -714,7 +710,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
        if num_buckets > num_buckets_limit:
            num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)]

-        logger.warning("config.num_buckets is not set. Setting config.num_buckets to {}...".format(num_buckets))
+        logger.warning(f"config.num_buckets is not set. Setting config.num_buckets to {num_buckets}...")
        # set num buckets in config to be properly saved
        self.config.num_buckets = num_buckets
@@ -1085,19 +1081,13 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
        assert (
            query_vectors.shape[-1] == self.attention_head_size
-        ), "last dim of query_key_vectors is {} but should be {}.".format(
-            query_vectors.shape[-1], self.attention_head_size
-        )
+        ), f"last dim of query_key_vectors is {query_vectors.shape[-1]} but should be {self.attention_head_size}."
        assert (
            key_vectors.shape[-1] == self.attention_head_size
-        ), "last dim of query_key_vectors is {} but should be {}.".format(
-            key_vectors.shape[-1], self.attention_head_size
-        )
+        ), f"last dim of query_key_vectors is {key_vectors.shape[-1]} but should be {self.attention_head_size}."
        assert (
            value_vectors.shape[-1] == self.attention_head_size
-        ), "last dim of query_key_vectors is {} but should be {}.".format(
-            value_vectors.shape[-1], self.attention_head_size
-        )
+        ), f"last dim of query_key_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."

        if self.chunk_length is None:
            assert (
@@ -1280,9 +1270,8 @@ class ReformerAttention(nn.Module):
            self.self_attention = LocalSelfAttention(config)
        else:
            raise NotImplementedError(
-                "Only attn layer types 'lsh' and 'local' exist, but got `config.attn_layers`: {}. Select attn layer types from ['lsh', 'local'] only.".format(
-                    self.attn_layers
-                )
+                f"Only attn layer types 'lsh' and 'local' exist, but got `config.attn_layers`: {self.attn_layers}. "
+                "Select attn layer types from ['lsh', 'local'] only."
            )
        self.output = ReformerSelfOutput(config)
@@ -2036,7 +2025,7 @@ class ReformerModel(ReformerPreTrainedModel):
        assert (
            len(input_shape) == 2
-        ), "`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {}".format(input_shape)
+        ), f"`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {input_shape}"

        if past_buckets_states is not None:
            assert not self.training, "`past_buckets_states` can only be used for inference, not for training`."
@@ -2062,9 +2051,9 @@ class ReformerModel(ReformerPreTrainedModel):
            if self.training is True:
                raise ValueError(
-                    "If training, sequence Length {} has to be a multiple of least common multiple chunk_length {}. Please consider padding the input to a length of {}.".format(
-                        input_shape[-1], least_common_mult_chunk_length, input_shape[-1] + padding_length
-                    )
+                    f"If training, sequence length {input_shape[-1]} has to be a multiple of least common multiple "
+                    f"chunk_length {least_common_mult_chunk_length}. Please consider padding the input to a length "
+                    f"of {input_shape[-1] + padding_length}."
                )

            # pad input
@@ -2134,9 +2123,8 @@ class ReformerModel(ReformerPreTrainedModel):
        device=None,
    ):
        logger.info(
-            "Input ids are automatically padded from {} to {} to be a multiple of `config.chunk_length`: {}".format(
-                input_shape[-1], input_shape[-1] + padding_length, padded_seq_length
-            )
+            f"Input ids are automatically padded from {input_shape[-1]} to {input_shape[-1] + padding_length} to be a "
+            f"multiple of `config.chunk_length`: {padded_seq_length}"
        )

        padded_input_ids = torch.full(
...
@@ -131,7 +131,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -107,7 +107,7 @@ class ReformerTokenizerFast(PreTrainedTokenizerFast):
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -149,8 +149,8 @@ class RobertaSelfAttention(nn.Module):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
...
@@ -396,7 +396,7 @@ class TFRobertaEncoder(tf.keras.layers.Layer):
    def __init__(self, config: RobertaConfig, **kwargs):
        super().__init__(**kwargs)

-        self.layer = [TFRobertaLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
+        self.layer = [TFRobertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
...
@@ -172,8 +172,7 @@ class SqueezeBertSelfAttention(nn.Module):
        super().__init__()
        if cin % config.num_attention_heads != 0:
            raise ValueError(
-                "cin (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (cin, config.num_attention_heads)
+                f"cin ({cin}) is not a multiple of the number of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(cin / config.num_attention_heads)
...
@@ -27,14 +27,14 @@ logging.set_verbosity_info()

def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = T5Config.from_json_file(config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
+    print(f"Building PyTorch model from configuration: {config}")
    model = T5ForConditionalGeneration(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_t5(model, config, tf_checkpoint_path)

    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)
...
@@ -82,13 +82,13 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array
@@ -101,11 +101,11 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
-            logger.info("Skipping {}".format("/".join(name)))
+            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        if "_slot_" in name[-1]:
-            logger.info("Skipping {}".format("/".join(name)))
+            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
@@ -149,7 +149,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
            try:
                pointer = getattr(pointer, scope_names[0])
            except AttributeError:
-                logger.info("Skipping {}".format("/".join(name)))
+                logger.info(f"Skipping {'/'.join(name)}")
                continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
@@ -157,7 +157,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        if scope_names[0] != "embedding":
-            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
+            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
            array = np.transpose(array)
        try:
            assert (
@@ -166,11 +166,11 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
+        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

-    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
+    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
    return model
@@ -428,9 +428,7 @@ class T5Attention(nn.Module):
        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
-            ), "past_key_value should have 2 past states: keys and values. Got {} past states".format(
-                len(past_key_value)
-            )
+            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
@@ -618,12 +616,12 @@ class T5Block(nn.Module):
            assert self.is_decoder, "Only decoder can use `past_key_values`"
            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

-            error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format(
-                expected_num_past_key_values,
-                "2 (past / key) for cross attention" if expected_num_past_key_values == 4 else "",
-                len(past_key_value),
-            )
-            assert len(past_key_value) == expected_num_past_key_values, error_message
+            if len(past_key_value) != expected_num_past_key_values:
+                raise ValueError(
+                    f"There should be {expected_num_past_key_values} past states. "
+                    f"{'2 (past / key) for cross attention' if expected_num_past_key_values == 4 else ''}."
+                    f"Got {len(past_key_value)} past key / value states"
+                )

            self_attn_past_key_value = past_key_value[:2]
            cross_attn_past_key_value = past_key_value[2:]
@@ -888,9 +886,7 @@ class T5Stack(T5PreTrainedModel):
        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

        if use_cache is True:
-            assert self.is_decoder, ":obj:`use_cache` can only be set to `True` if {} is used as a decoder".format(
-                self
-            )
+            assert self.is_decoder, f":obj:`use_cache` can only be set to `True` if {self} is used as a decoder"

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device)
...
@@ -273,9 +273,7 @@ class TFT5Attention(tf.keras.layers.Layer):
        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
-            ), "past_key_value should have 2 past states: keys and values. Got {} past states".format(
-                len(past_key_value)
-            )
+            ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
            real_seq_length += shape_list(past_key_value[0])[2] if query_length is None else query_length

        key_length = real_seq_length if key_value_states is None else shape_list(key_value_states)[1]
@@ -472,7 +470,7 @@ class TFT5Block(tf.keras.layers.Layer):
                )
            )
-        self.layer.append(TFT5LayerFF(config, name="layer_._{}".format(len(self.layer))))
+        self.layer.append(TFT5LayerFF(config, name=f"layer_._{len(self.layer)}"))

    def call(
        self,
@@ -494,12 +492,12 @@ class TFT5Block(tf.keras.layers.Layer):
            assert self.is_decoder, "Only decoder can use `past_key_values`"
            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

-            error_message = "There should be {} past states. 2 (past / key) for self attention.{} Got {} past key / value states".format(
-                expected_num_past_key_values,
-                "2 (past / key) for cross attention" if expected_num_past_key_values == 4 else "",
-                len(past_key_value),
-            )
-            assert len(past_key_value) == expected_num_past_key_values, error_message
+            if len(past_key_value) != expected_num_past_key_values:
+                raise ValueError(
+                    f"There should be {expected_num_past_key_values} past states. "
+                    f"{'2 (past / key) for cross attention' if expected_num_past_key_values == 4 else ''}."
+                    f"Got {len(past_key_value)} past key / value states"
+                )

            self_attn_past_key_value = past_key_value[:2]
            cross_attn_past_key_value = past_key_value[2:]
@@ -579,11 +577,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        self.num_hidden_layers = config.num_layers

        self.block = [
-            TFT5Block(
-                config,
-                has_relative_attention_bias=bool(i == 0),
-                name="block_._{}".format(i),
-            )
+            TFT5Block(config, has_relative_attention_bias=bool(i == 0), name=f"block_._{i}")
            for i in range(config.num_layers)
        ]
        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
...
@@ -104,7 +104,7 @@ class T5Tokenizer(PreTrainedTokenizer):
    ):
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
-            additional_special_tokens = ["<extra_id_{}>".format(i) for i in range(extra_ids)]
+            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in x), additional_special_tokens)))
@@ -257,7 +257,7 @@ class T5Tokenizer(PreTrainedTokenizer):
        if index < self.sp_model.get_piece_size():
            token = self.sp_model.IdToPiece(index)
        else:
-            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
+            token = f"<extra_id_{self.vocab_size - 1 - index}>"
        return token

    def convert_tokens_to_string(self, tokens):
@@ -276,7 +276,7 @@ class T5Tokenizer(PreTrainedTokenizer):
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -115,7 +115,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
    ):
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
-            additional_special_tokens = ["<extra_id_{}>".format(i) for i in range(extra_ids)]
+            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None:
            # Check that we have the right number of extra special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id_" in x), additional_special_tokens)))
@@ -141,7 +141,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -82,20 +82,20 @@ def convert_tf_checkpoint_to_pytorch(
    elif task == "INTERMEDIATE_PRETRAINING":
        model = TapasModel(config=config)

-    print("Building PyTorch model from configuration: {}".format(str(config)))
+    print(f"Building PyTorch model from configuration: {config}")

    # Load weights from tf checkpoint
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save pytorch-model (weights and configuration)
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path[:-17])

    # Save tokenizer files
    dir_name = r"C:\Users\niels.rogge\Documents\Python projecten\tensorflow\Tensorflow models\SQA\Base\tapas_sqa_inter_masklm_base_reset"
    tokenizer = TapasTokenizer(vocab_file=dir_name + r"\vocab.txt", model_max_length=512)
-    print("Save tokenizer files to {}".format(pytorch_dump_path))
+    print(f"Save tokenizer files to {pytorch_dump_path}")
    tokenizer.save_pretrained(pytorch_dump_path[:-17])

    print("Used relative position embeddings:", model.config.reset_position_index_per_cell)
...
@@ -142,13 +142,13 @@ def load_tf_weights_in_tapas(model, config, tf_checkpoint_path):
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
@@ -169,19 +169,19 @@ def load_tf_weights_in_tapas(model, config, tf_checkpoint_path):
            ]
            for n in name
        ):
-            logger.info("Skipping {}".format("/".join(name)))
+            logger.info(f"Skipping {'/'.join(name)}")
            continue
        # in case the model is TapasForSequenceClassification, we skip output_bias and output_weights
        # since these are not used for classification
        if isinstance(model, TapasForSequenceClassification):
            if any(n in ["output_bias", "output_weights"] for n in name):
-                logger.info("Skipping {}".format("/".join(name)))
+                logger.info(f"Skipping {'/'.join(name)}")
                continue
        # in case the model is TapasModel, we skip output_bias, output_weights, output_bias_cls and output_weights_cls
        # since this model does not have MLM and NSP heads
        if isinstance(model, TapasModel):
            if any(n in ["output_bias", "output_weights", "output_bias_cls", "output_weights_cls"] for n in name):
-                logger.info("Skipping {}".format("/".join(name)))
+                logger.info(f"Skipping {'/'.join(name)}")
                continue
        # if first scope name starts with "bert", change it to "tapas"
        if name[0] == "bert":
@@ -223,7 +223,7 @@ def load_tf_weights_in_tapas(model, config, tf_checkpoint_path):
            try:
                pointer = getattr(pointer, scope_names[0])
            except AttributeError:
-                logger.info("Skipping {}".format("/".join(name)))
+                logger.info(f"Skipping {'/'.join(name)}")
                continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
@@ -241,7 +241,7 @@ def load_tf_weights_in_tapas(model, config, tf_checkpoint_path):
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
+        logger.info(f"Initialize PyTorch weight {name}")
        # Added a check to see whether the array is a scalar (because bias terms in Tapas checkpoints can be
        # scalar => should first be converted to numpy arrays)
        if np.isscalar(array):
...
...@@ -324,8 +324,8 @@ class TapasTokenizer(PreTrainedTokenizer): ...@@ -324,8 +324,8 @@ class TapasTokenizer(PreTrainedTokenizer):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
raise ValueError( raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
) )
self.vocab = load_vocab(vocab_file) self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
...@@ -1208,9 +1208,9 @@ class TapasTokenizer(PreTrainedTokenizer): ...@@ -1208,9 +1208,9 @@ class TapasTokenizer(PreTrainedTokenizer):
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose: if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False): if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
logger.warning( logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length " f"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in " f"for this model ({len(encoded_inputs['input_ids'])} > {self.model_max_length}). Running this "
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length) "sequence through the model will result in indexing errors."
) )
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
...@@ -1670,7 +1670,7 @@ class TapasTokenizer(PreTrainedTokenizer): ...@@ -1670,7 +1670,7 @@ class TapasTokenizer(PreTrainedTokenizer):
def _find_tokens(self, text, segment): def _find_tokens(self, text, segment):
"""Return start index of segment in text or None.""" """Return start index of segment in text or None."""
logging.info("text: %s %s", text, segment) logging.info(f"text: {text} {segment}")
for index in range(1 + len(text) - len(segment)): for index in range(1 + len(text) - len(segment)):
for seg_index, seg_token in enumerate(segment): for seg_index, seg_token in enumerate(segment):
if text[index + seg_index].piece != seg_token.piece: if text[index + seg_index].piece != seg_token.piece:
...@@ -1685,7 +1685,7 @@ class TapasTokenizer(PreTrainedTokenizer): ...@@ -1685,7 +1685,7 @@ class TapasTokenizer(PreTrainedTokenizer):
answer_text, answer_text,
): ):
"""Returns all occurrences of answer_text in the table.""" """Returns all occurrences of answer_text in the table."""
logging.info("answer text: %s", answer_text) logging.info(f"answer text: {answer_text}")
for row_index, row in enumerate(tokenized_table.rows): for row_index, row in enumerate(tokenized_table.rows):
if row_index == 0: if row_index == 0:
# We don't search for answers in the header. # We don't search for answers in the header.
...@@ -2347,7 +2347,7 @@ _INF = float("INF") ...@@ -2347,7 +2347,7 @@ _INF = float("INF")
def _get_numeric_value_from_date(date, mask): def _get_numeric_value_from_date(date, mask):
"""Converts date (datetime Python object) to a NumericValue object with a Date object value.""" """Converts date (datetime Python object) to a NumericValue object with a Date object value."""
if date.year < _MIN_YEAR or date.year > _MAX_YEAR: if date.year < _MIN_YEAR or date.year > _MAX_YEAR:
raise ValueError("Invalid year: %d" % date.year) raise ValueError(f"Invalid year: {date.year}")
new_date = Date() new_date = Date()
if mask.year: if mask.year:
...@@ -2523,7 +2523,7 @@ def _get_value_type(numeric_value): ...@@ -2523,7 +2523,7 @@ def _get_value_type(numeric_value):
return NUMBER_TYPE return NUMBER_TYPE
elif numeric_value.date is not None: elif numeric_value.date is not None:
return DATE_TYPE return DATE_TYPE
raise ValueError("Unknown type: %s" % numeric_value) raise ValueError(f"Unknown type: {numeric_value}")
def _get_value_as_primitive_value(numeric_value): def _get_value_as_primitive_value(numeric_value):
...@@ -2541,7 +2541,7 @@ def _get_value_as_primitive_value(numeric_value): ...@@ -2541,7 +2541,7 @@ def _get_value_as_primitive_value(numeric_value):
if date.day is not None: if date.day is not None:
value_tuple[2] = float(date.day) value_tuple[2] = float(date.day)
return tuple(value_tuple) return tuple(value_tuple)
raise ValueError("Unknown type: %s" % numeric_value) raise ValueError(f"Unknown type: {numeric_value}")
def _get_all_types(numeric_values): def _get_all_types(numeric_values):
...@@ -2567,7 +2567,7 @@ def get_numeric_sort_key_fn(numeric_values): ...@@ -2567,7 +2567,7 @@ def get_numeric_sort_key_fn(numeric_values):
""" """
value_types = _get_all_types(numeric_values) value_types = _get_all_types(numeric_values)
if len(value_types) != 1: if len(value_types) != 1:
raise ValueError("No common value type in %s" % numeric_values) raise ValueError(f"No common value type in {numeric_values}")
value_type = next(iter(value_types)) value_type = next(iter(value_types))
if value_type == NUMBER_TYPE: if value_type == NUMBER_TYPE:
...@@ -2586,7 +2586,7 @@ def get_numeric_sort_key_fn(numeric_values): ...@@ -2586,7 +2586,7 @@ def get_numeric_sort_key_fn(numeric_values):
valid_indexes.discard(tuple_index) valid_indexes.discard(tuple_index)
if not valid_indexes: if not valid_indexes:
raise ValueError("No common value in %s" % numeric_values) raise ValueError(f"No common value in {numeric_values}")
def _sort_key_fn(numeric_value): def _sort_key_fn(numeric_value):
value = _get_value_as_primitive_value(numeric_value) value = _get_value_as_primitive_value(numeric_value)
...@@ -2618,8 +2618,7 @@ def _consolidate_numeric_values(row_index_to_values, min_consolidation_fraction, ...@@ -2618,8 +2618,7 @@ def _consolidate_numeric_values(row_index_to_values, min_consolidation_fraction,
return {} return {}
max_count = max(type_counts.values()) max_count = max(type_counts.values())
if max_count < len(row_index_to_values) * min_consolidation_fraction: if max_count < len(row_index_to_values) * min_consolidation_fraction:
# logging.log_every_n(logging.INFO, 'Can\'t consolidate types: %s %s %d', 100, # logging.log_every_n(logging.INFO, f'Can\'t consolidate types: {debug_info} {row_index_to_values} {max_count}', 100)
# debug_info, row_index_to_values, max_count)
return {} return {}
valid_types = set() valid_types = set()
...@@ -2708,15 +2707,13 @@ def filter_invalid_unicode_from_table(table): ...@@ -2708,15 +2707,13 @@ def filter_invalid_unicode_from_table(table):
cell, is_invalid = filter_invalid_unicode(cell) cell, is_invalid = filter_invalid_unicode(cell)
if is_invalid: if is_invalid:
logging.warning( logging.warning(
"Scrub an invalid table body @ table_id: %s, row_index: %d, " "col_index: %d", f"Scrub an invalid table body @ table_id: {table.table_id}, row_index: {row_index}, "
table.table_id, f"col_index: {col_index}",
row_index,
col_index,
) )
for col_index, column in enumerate(table.columns): for col_index, column in enumerate(table.columns):
column, is_invalid = filter_invalid_unicode(column) column, is_invalid = filter_invalid_unicode(column)
if is_invalid: if is_invalid:
logging.warning("Scrub an invalid table header @ table_id: %s, col_index: %d", table.table_id, col_index) logging.warning(f"Scrub an invalid table header @ table_id: {table.table_id}, col_index: {col_index}")
def add_numeric_table_values(table, min_consolidation_fraction=0.7, debug_info=None): def add_numeric_table_values(table, min_consolidation_fraction=0.7, debug_info=None):
......
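The hunks above replace %-style error and logging messages in the TAPAS numeric-value helpers with f-strings. A minimal, self-contained sketch of that rewrite pattern, using illustrative stand-in names rather than the actual transformers helpers:
NUMBER_TYPE = "number"
DATE_TYPE = "date"
def _get_value_type(numeric_value: dict) -> str:
    # Old: raise ValueError("Unknown type: %s" % numeric_value)
    # New, as enforced by this PR: the f-string renders the same message.
    if numeric_value.get("float_value") is not None:
        return NUMBER_TYPE
    if numeric_value.get("date") is not None:
        return DATE_TYPE
    raise ValueError(f"Unknown type: {numeric_value}")
print(_get_value_type({"float_value": 3.5}))  # -> number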
...@@ -48,14 +48,14 @@ def convert_transfo_xl_checkpoint_to_pytorch( ...@@ -48,14 +48,14 @@ def convert_transfo_xl_checkpoint_to_pytorch(
corpus = pickle.load(fp, encoding="latin1") corpus = pickle.load(fp, encoding="latin1")
# Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"]
print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) print(f"Save vocabulary to {pytorch_vocab_dump_path}")
corpus_vocab_dict = corpus.vocab.__dict__ corpus_vocab_dict = corpus.vocab.__dict__
torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
corpus_dict_no_vocab = corpus.__dict__ corpus_dict_no_vocab = corpus.__dict__
corpus_dict_no_vocab.pop("vocab", None) corpus_dict_no_vocab.pop("vocab", None)
pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME
print("Save dataset to {}".format(pytorch_dataset_dump_path)) print(f"Save dataset to {pytorch_dataset_dump_path}")
torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
if tf_checkpoint_path: if tf_checkpoint_path:
...@@ -63,22 +63,22 @@ def convert_transfo_xl_checkpoint_to_pytorch( ...@@ -63,22 +63,22 @@ def convert_transfo_xl_checkpoint_to_pytorch(
config_path = os.path.abspath(transfo_xl_config_file) config_path = os.path.abspath(transfo_xl_config_file)
tf_path = os.path.abspath(tf_checkpoint_path) tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.")
# Initialise PyTorch model # Initialise PyTorch model
if transfo_xl_config_file == "": if transfo_xl_config_file == "":
config = TransfoXLConfig() config = TransfoXLConfig()
else: else:
config = TransfoXLConfig.from_json_file(transfo_xl_config_file) config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
print("Building PyTorch model from configuration: {}".format(str(config))) print(f"Building PyTorch model from configuration: {config}")
model = TransfoXLLMHeadModel(config) model = TransfoXLLMHeadModel(config)
model = load_tf_weights_in_transfo_xl(model, config, tf_path) model = load_tf_weights_in_transfo_xl(model, config, tf_path)
# Save pytorch-model # Save pytorch-model
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
torch.save(model.state_dict(), pytorch_weights_dump_path) torch.save(model.state_dict(), pytorch_weights_dump_path)
print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string()) f.write(config.to_json_string())
......
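The conversion-script hunk above swaps str.format() calls for f-strings; because f-strings accept arbitrary expressions inside the braces, calls such as os.path.abspath() move directly into the message. A short standalone sketch under that assumption, with a hypothetical dump path rather than a real checkpoint location:
import os
pytorch_dump_folder_path = "/tmp/transfo-xl-dump"  # hypothetical path for illustration
weights_path = os.path.join(pytorch_dump_folder_path, "pytorch_model.bin")
# Old: print("Save PyTorch model to {}".format(os.path.abspath(weights_path)))
print(f"Save PyTorch model to {os.path.abspath(weights_path)}")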
...@@ -368,7 +368,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): ...@@ -368,7 +368,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
r_idx - l_idx, r_idx - l_idx,
d_emb_i, d_emb_i,
init_std, init_std,
name="emb_layers_._{}".format(i), name=f"emb_layers_._{i}",
) )
) )
...@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): ...@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
shape=(d_emb_i, self.d_proj), shape=(d_emb_i, self.d_proj),
initializer=get_initializer(self.init_std), initializer=get_initializer(self.init_std),
trainable=True, trainable=True,
name="emb_projs_._{}".format(i), name=f"emb_projs_._{i}",
) )
) )
...@@ -467,7 +467,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -467,7 +467,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
layer_norm_epsilon=config.layer_norm_epsilon, layer_norm_epsilon=config.layer_norm_epsilon,
init_std=config.init_std, init_std=config.init_std,
output_attentions=self.output_attentions, output_attentions=self.output_attentions,
name="layers_._{}".format(i), name=f"layers_._{i}",
) )
) )
else: # learnable embeddings and absolute embeddings else: # learnable embeddings and absolute embeddings
......
...@@ -59,25 +59,22 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): ...@@ -59,25 +59,22 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
shape=(self.d_embed, self.d_proj), shape=(self.d_embed, self.d_proj),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_projs_._{}".format(i), name=f"out_projs_._{i}",
) )
self.out_projs.append(weight) self.out_projs.append(weight)
else: else:
self.out_projs.append(None) self.out_projs.append(None)
weight = self.add_weight( weight = self.add_weight(
shape=( shape=(self.vocab_size, self.d_embed),
self.vocab_size,
self.d_embed,
),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._weight".format(i), name=f"out_layers_._{i}_._weight",
) )
bias = self.add_weight( bias = self.add_weight(
shape=(self.vocab_size,), shape=(self.vocab_size,),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._bias".format(i), name=f"out_layers_._{i}_._bias",
) )
self.out_layers.append((weight, bias)) self.out_layers.append((weight, bias))
else: else:
...@@ -86,23 +83,20 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): ...@@ -86,23 +83,20 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
d_emb_i = self.d_embed // (self.div_val ** i) d_emb_i = self.d_embed // (self.div_val ** i)
weight = self.add_weight( weight = self.add_weight(
shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}"
) )
self.out_projs.append(weight) self.out_projs.append(weight)
weight = self.add_weight( weight = self.add_weight(
shape=( shape=(r_idx - l_idx, d_emb_i),
r_idx - l_idx,
d_emb_i,
),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._weight".format(i), name=f"out_layers_._{i}_._weight",
) )
bias = self.add_weight( bias = self.add_weight(
shape=(r_idx - l_idx,), shape=(r_idx - l_idx,),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._bias".format(i), name=f"out_layers_._{i}_._bias",
) )
self.out_layers.append((weight, bias)) self.out_layers.append((weight, bias))
super().build(input_shape) super().build(input_shape)
......