Unverified Commit acc3bd9d authored by Sylvain Gugger and committed by GitHub

Enforce string-formatting with f-strings (#10980)



* First third

* Styling and fix mistake

* Quality

* All the rest

* Treat %s and %d

* typo

* Missing )

* Apply suggestions from code review
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent d0b3797a
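The change below is mechanical throughout: %-style interpolation and str.format() calls are rewritten as equivalent f-strings. A minimal sketch of the pattern, with an illustrative value only (not taken from any file in this diff):

tf_path = "/tmp/model.ckpt"  # illustrative value
# before: old-style formatting
print("Converting TensorFlow checkpoint from {}".format(tf_path))
print("Converting TensorFlow checkpoint from %s" % tf_path)
# after: the f-string form enforced by this commit
print(f"Converting TensorFlow checkpoint from {tf_path}")

Both forms build the string eagerly, so converting .format() logger calls to f-strings changes readability, not behavior.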
@@ -205,13 +205,13 @@ def load_tf_weights_in_lxmert(model, config, tf_checkpoint_path):
         )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -231,7 +231,7 @@ def load_tf_weights_in_lxmert(model, config, tf_checkpoint_path):
             ]
             for n in name
         ):
-            logger.info("Skipping {}".format("/".join(name)))
+            logger.info(f"Skipping {'/'.join(name)}")
             continue
         pointer = model
         for m_name in name:
@@ -251,7 +251,7 @@ def load_tf_weights_in_lxmert(model, config, tf_checkpoint_path):
                 try:
                     pointer = getattr(pointer, scope_names[0])
                 except AttributeError:
-                    logger.info("Skipping {}".format("/".join(name)))
+                    logger.info(f"Skipping {'/'.join(name)}")
                     continue
             if len(scope_names) >= 2:
                 num = int(scope_names[1])
@@ -265,7 +265,7 @@ def load_tf_weights_in_lxmert(model, config, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        logger.info("Initialize PyTorch weight {}".format(name))
+        logger.info(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
@@ -315,8 +315,8 @@ class LxmertAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
             )
         self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
...
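A side note on the Skipping lines above: once the join moves inside the f-string, the inner literal has to use a different quote character than the enclosing one (reusing the same quote inside an f-string only became legal in Python 3.12). A small, self-contained sketch with a made-up name list:

name = ["bert", "adam_v"]  # made-up value for illustration
# single quotes inside the double-quoted f-string keep this valid on Python < 3.12
print(f"Skipping {'/'.join(name)}")  # prints: Skipping bert/adam_v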
@@ -249,8 +249,8 @@ class TFLxmertAttention(tf.keras.layers.Layer):
         super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
             )
         self.num_attention_heads = config.num_attention_heads
@@ -547,9 +547,9 @@ class TFLxmertEncoder(tf.keras.layers.Layer):
         # Layers
         # Using self.layer instead of self.l_layer to support loading BERT weights.
-        self.layer = [TFLxmertLayer(config, name="layer_._{}".format(i)) for i in range(self.num_l_layers)]
-        self.x_layers = [TFLxmertXLayer(config, name="x_layers_._{}".format(i)) for i in range(self.num_x_layers)]
-        self.r_layers = [TFLxmertLayer(config, name="r_layers_._{}".format(i)) for i in range(self.num_r_layers)]
+        self.layer = [TFLxmertLayer(config, name=f"layer_._{i}") for i in range(self.num_l_layers)]
+        self.x_layers = [TFLxmertXLayer(config, name=f"x_layers_._{i}") for i in range(self.num_x_layers)]
+        self.r_layers = [TFLxmertLayer(config, name=f"r_layers_._{i}") for i in range(self.num_r_layers)]
         self.config = config

     def call(
...
@@ -210,7 +210,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -275,7 +275,7 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -26,12 +26,12 @@ logging.set_verbosity_info()
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, mobilebert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
     config = MobileBertConfig.from_json_file(mobilebert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
+    print(f"Building PyTorch model from configuration: {config}")
     model = MobileBertForPreTraining(config)
     # Load weights from tf checkpoint
     model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path)
     # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    print(f"Save PyTorch model to {pytorch_dump_path}")
     torch.save(model.state_dict(), pytorch_dump_path)
...
@@ -77,13 +77,13 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
         )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -100,7 +100,7 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
             n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
             for n in name
         ):
-            logger.info("Skipping {}".format("/".join(name)))
+            logger.info(f"Skipping {'/'.join(name)}")
             continue
         pointer = model
         for m_name in name:
@@ -120,7 +120,7 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
                 try:
                     pointer = getattr(pointer, scope_names[0])
                 except AttributeError:
-                    logger.info("Skipping {}".format("/".join(name)))
+                    logger.info(f"Skipping {'/'.join(name)}")
                     continue
             if len(scope_names) >= 2:
                 num = int(scope_names[1])
@@ -136,7 +136,7 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        logger.info("Initialize PyTorch weight {}".format(name))
+        logger.info(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
...
@@ -210,8 +210,8 @@ class TFMobileBertSelfAttention(tf.keras.layers.Layer):
         super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
             )
         self.num_attention_heads = config.num_attention_heads
@@ -463,9 +463,7 @@ class TFMobileBertLayer(tf.keras.layers.Layer):
         if self.use_bottleneck:
             self.bottleneck = TFBottleneck(config, name="bottleneck")
         if config.num_feedforward_networks > 1:
-            self.ffn = [
-                TFFFNLayer(config, name="ffn.{}".format(i)) for i in range(config.num_feedforward_networks - 1)
-            ]
+            self.ffn = [TFFFNLayer(config, name=f"ffn.{i}") for i in range(config.num_feedforward_networks - 1)]

     def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
         if self.use_bottleneck:
@@ -518,7 +516,7 @@ class TFMobileBertEncoder(tf.keras.layers.Layer):
         super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
+        self.layer = [TFMobileBertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

     def call(
         self,
...
@@ -134,8 +134,8 @@ class MPNetSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
             )
         self.num_attention_heads = config.num_attention_heads
...
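The two-line error messages in these hunks rely on implicit concatenation of adjacent string literals, so every fragment that interpolates a value needs its own f prefix, while purely literal fragments can stay plain strings. A quick sketch with made-up sizes:

hidden_size, num_attention_heads = 768, 12  # made-up values for illustration
message = (
    f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
    f"heads ({num_attention_heads})"
)
print(message)

A dropped closing parenthesis inside one of these fragments is an easy slip during this kind of conversion, which is presumably what the "Missing )" entry in the commit message refers to.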
@@ -192,8 +192,8 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
             )
         self.num_attention_heads = config.num_attention_heads
@@ -352,7 +352,7 @@ class TFMPNetEncoder(tf.keras.layers.Layer):
         self.relative_attention_num_buckets = config.relative_attention_num_buckets
         self.initializer_range = config.initializer_range
-        self.layer = [TFMPNetLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
+        self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
         self.relative_attention_num_buckets = config.relative_attention_num_buckets

     def build(self, input_shape):
...
@@ -169,8 +169,8 @@ class MPNetTokenizer(PreTrainedTokenizer):
         if not os.path.isfile(vocab_file):
             raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
@@ -312,8 +312,8 @@ class MPNetTokenizer(PreTrainedTokenizer):
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning(
-                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
                     )
                     index = token_index
                 writer.write(token + "\n")
...
@@ -41,9 +41,9 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
     pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    print(f"Save PyTorch model to {pytorch_weights_dump_path}")
     torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    print(f"Save configuration file to {pytorch_config_dump_path}")
     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
         f.write(config.to_json_string())
...
@@ -67,14 +67,14 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     if ".ckpt" in openai_checkpoint_folder_path:
         openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
-    logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
+    logger.info(f"Loading weights from {openai_checkpoint_folder_path}")
     with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle:
         names = json.load(names_handle)
     with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle:
         shapes = json.load(shapes_handle)
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)]
+    init_params = [np.load(openai_checkpoint_folder_path + f"/params_{n}.npy") for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
@@ -134,7 +134,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        logger.info("Initialize PyTorch weight {}".format(name))
+        logger.info(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
...
@@ -210,7 +210,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
         )
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
+        self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]

     def build(self, input_shape):
         with tf.name_scope("positions_embed"):
...
@@ -205,7 +205,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
@@ -223,8 +223,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
                     )
                     index = token_index
                 writer.write(" ".join(bpe_tokens) + "\n")
...
@@ -250,7 +250,7 @@ class PegasusTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -191,7 +191,7 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -312,7 +312,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
@@ -346,7 +346,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
             except FileNotFoundError as fnfe:
                 raise fnfe
             except UnicodeError:
-                raise Exception("Incorrect encoding detected in {}, please " "rebuild the dataset".format(f))
+                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
             return

         lines = f.readlines()
...
@@ -135,8 +135,8 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
         if not os.path.isfile(vocab_file):
             raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = ProphetNetTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = ProphetNetTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
@@ -255,8 +255,8 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning(
-                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
                     )
                     index = token_index
                 writer.write(token + "\n")
...
@@ -494,9 +494,7 @@ class RagModel(RagPreTrainedModel):
                 question_encoder.config, generator.config, **kwargs
             )
         else:
-            assert isinstance(config, self.config_class), "config: {} has to be of type {}".format(
-                config, self.config_class
-            )
+            assert isinstance(config, self.config_class), f"config: {config} has to be of type {self.config_class}"
         super().__init__(config)
         if question_encoder is None:
             from ..auto.modeling_auto import AutoModel
...
@@ -496,9 +496,7 @@ class TFRagModel(TFRagPreTrainedModel):
                 question_encoder.config, generator.config, **kwargs
             )
         else:
-            assert isinstance(config, self.config_class), "config: {} has to be of type {}".format(
-                config, self.config_class
-            )
+            assert isinstance(config, self.config_class), f"config: {config} has to be of type {self.config_class}"
         super().__init__(config, **kwargs)
         if question_encoder is None:
...