Unverified Commit acc3bd9d authored by Sylvain Gugger, committed by GitHub

Enforce string-formatting with f-strings (#10980)



* First third

* Styling and fix mistake

* Quality

* All the rest

* Treat %s and %d

* typo

* Missing )

* Apply suggestions from code review
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
parent d0b3797a
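
For illustration, a minimal sketch of the conversion pattern this commit enforces, covering both str.format() and the %s/%d operator mentioned in the bullets above (the names `path` and `idx` are hypothetical, not taken from the diff; f-strings require Python 3.6+):

    # Before: %-style and str.format() interpolation (hypothetical example)
    logger.info("Loading %s at line %d" % (path, idx))
    logger.info("Loading {} at line {}".format(path, idx))

    # After: one equivalent f-string; expressions nested inside a
    # double-quoted f-string switch to single quotes, as in
    # f"keys: {', '.join(d.keys())}", and raw regex strings become rf"..."
    logger.info(f"Loading {path} at line {idx}")
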
@@ -67,7 +67,7 @@ def build_tf_to_pytorch_map(model, config):
         for i, (out_l, proj_l, tie_proj) in enumerate(
             zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
         ):
-            layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
+            layer_str = f"transformer/adaptive_softmax/cutoff_{i}/"
             if config.tie_word_embeddings:
                 tf_to_pt_map.update({layer_str + "b": out_l.bias})
             else:
@@ -81,12 +81,12 @@ def build_tf_to_pytorch_map(model, config):
     # Embeddings
     for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
-        layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
+        layer_str = f"transformer/adaptive_embed/cutoff_{i}/"
         tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})

     # Transformer blocks
     for i, b in enumerate(model.layers):
-        layer_str = "transformer/layer_%d/" % i
+        layer_str = f"transformer/layer_{i}/"
         tf_to_pt_map.update(
             {
                 layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
@@ -135,7 +135,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
     for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
@@ -156,7 +156,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
                 except AssertionError as e:
                     e.args += (p_i.shape, arr_i.shape)
                     raise
-                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
+                logger.info(f"Initialize PyTorch weight {name} for layer {i}")
                 p_i.data = torch.from_numpy(arr_i)
         else:
             try:
@@ -166,13 +166,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
-            logger.info("Initialize PyTorch weight {}".format(name))
+            logger.info(f"Initialize PyTorch weight {name}")
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
         tf_weights.pop(name + "/Adam", None)
         tf_weights.pop(name + "/Adam_1", None)

-    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
+    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
     return model
...
@@ -198,7 +198,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         self.vocab_file = vocab_file
         self.never_split = never_split
         self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
-        self.punction_without_space_before_pattern = re.compile(r"[^\s][{}]".format(self.punctuation_symbols))
+        self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]")
         self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
         self.language = language
         self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
@@ -235,9 +235,9 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             except Exception as e:
                 raise ValueError(
-                    "Unable to parse file {}. Unknown format. "
+                    f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
                     "If you tried to load a model saved through TransfoXLTokenizerFast,"
-                    "please note they are not compatible.".format(pretrained_vocab_file)
+                    "please note they are not compatible."
                 ) from e

         if vocab_file is not None:
@@ -248,20 +248,20 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.lower_case

     def _compile_space_around_punctuation_pattern(self):
-        look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols)
+        look_ahead_for_special_token = f"(?=[{self.punctuation_symbols}])"
         look_ahead_to_match_all_except_space = r"(?=[^\s])"
         return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space)

     def count_file(self, path, verbose=False, add_eos=False):
         if verbose:
-            logger.info("counting file {} ...".format(path))
+            logger.info(f"counting file {path} ...")
         assert os.path.exists(path), f"Input file {path} not found"

         sents = []
         with open(path, "r", encoding="utf-8") as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    logger.info("    line {}".format(idx))
+                    logger.info(f"    line {idx}")
                 symbols = self.tokenize(line, add_eos=add_eos)
                 self.counter.update(symbols)
                 sents.append(symbols)
@@ -273,10 +273,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             sents : a list of sentences, each a list of tokenized symbols
         """
         if verbose:
-            logger.info("counting {} sents ...".format(len(sents)))
+            logger.info(f"counting {len(sents)} sents ...")
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                logger.info("    line {}".format(idx))
+                logger.info(f"    line {idx}")
             self.counter.update(symbols)

     def _build_from_file(self, vocab_file):
@@ -308,11 +308,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
     def build_vocab(self):
         if self.vocab_file:
-            logger.info("building vocab from {}".format(self.vocab_file))
+            logger.info(f"building vocab from {self.vocab_file}")
             self._build_from_file(self.vocab_file)
-            logger.info("final vocab size {}".format(len(self)))
+            logger.info(f"final vocab size {len(self)}")
         else:
-            logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size))
+            logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
             self.idx2sym = []
             self.sym2idx = OrderedDict()
@@ -324,18 +324,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
                     break
                 self.add_symbol(sym)

-            logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter)))
+            logger.info(f"final vocab size {len(self)} from {len(self.counter)} unique tokens")

     @torch_only_method
     def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
         if verbose:
-            logger.info("encoding file {} ...".format(path))
+            logger.info(f"encoding file {path} ...")
         assert os.path.exists(path), f"Output file {path} not found"
         encoded = []
         with open(path, "r", encoding="utf-8") as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    logger.info("    line {}".format(idx))
+                    logger.info(f"    line {idx}")
                 symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
                 encoded.append(self.convert_to_tensor(symbols))
@@ -347,11 +347,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
     @torch_only_method
     def encode_sents(self, sents, ordered=False, verbose=False):
         if verbose:
-            logger.info("encoding {} sents ...".format(len(sents)))
+            logger.info(f"encoding {len(sents)} sents ...")
         encoded = []
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                logger.info("    line {}".format(idx))
+                logger.info(f"    line {idx}")
             encoded.append(self.convert_to_tensor(symbols))

         if ordered:
@@ -363,7 +363,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         if sym not in self.sym2idx:
             self.idx2sym.append(sym)
             self.sym2idx[sym] = len(self.idx2sym) - 1
-            setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym])
+            setattr(self, f"{sym.strip('<>')}_idx", self.sym2idx[sym])

     def add_symbol(self, sym):
         if sym not in self.sym2idx:
@@ -430,7 +430,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
     def _convert_id_to_token(self, idx):
         """Converts an id in a token (BPE) using the vocab."""
-        assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx)
+        assert 0 <= idx < len(self), f"Index {idx} out of vocabulary range"
         return self.idx2sym[idx]

     def _convert_token_to_id(self, sym):
@@ -438,7 +438,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
-            # logger.info('encounter unk {}'.format(sym))
+            # logger.info(f'encounter unk {sym}')
             # assert '<eos>' not in sym
             if hasattr(self, "unk_idx"):
                 return self.sym2idx.get(sym, self.unk_idx)
@@ -675,20 +675,16 @@ class TransfoXLCorpus(object):
             resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
         except EnvironmentError:
             logger.error(
-                "Corpus '{}' was not found in corpus list ({}). "
-                "We assumed '{}' was a path or url but couldn't find files {} "
-                "at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
-                    pretrained_model_name_or_path,
-                    corpus_file,
-                )
+                f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list "
+                f"({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}). "
+                f"We assumed '{pretrained_model_name_or_path}' was a path or url but couldn't find files {corpus_file} "
+                "at this path or url."
             )
             return None
         if resolved_corpus_file == corpus_file:
-            logger.info("loading corpus file {}".format(corpus_file))
+            logger.info(f"loading corpus file {corpus_file}")
         else:
-            logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file))
+            logger.info(f"loading corpus file {corpus_file} from cache at {resolved_corpus_file}")

         # Instantiate tokenizer.
         corpus = cls(*inputs, **kwargs)
@@ -777,7 +773,7 @@ def get_lm_corpus(datadir, dataset):
         with open(fn, "rb") as fp:
             corpus = pickle.load(fp)
     else:
-        logger.info("Producing dataset {}...".format(dataset))
+        logger.info(f"Producing dataset {dataset}...")
         kwargs = {}
         if dataset in ["wt103", "wt2"]:
             kwargs["special"] = ["<eos>"]
...
@@ -260,7 +260,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
@@ -514,7 +514,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -54,14 +54,14 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
     pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
     pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]

-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    print(f"Save PyTorch model to {pytorch_weights_dump_path}")
     torch.save(two_levels_state_dict, pytorch_weights_dump_path)

-    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    print(f"Save configuration file to {pytorch_config_dump_path}")
     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
         f.write(json.dumps(config, indent=2) + "\n")

-    print("Save vocab file to {}".format(pytorch_config_dump_path))
+    print(f"Save vocab file to {pytorch_config_dump_path}")
     with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
         f.write(json.dumps(vocab, indent=2) + "\n")
...
@@ -146,7 +146,7 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
         else:
             klen = shape_list(kv)[1]
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
         dim_per_head = self.dim // self.n_heads
         mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
@@ -289,19 +289,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         for i in range(self.n_layers):
             self.attentions.append(
-                TFXLMMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i))
+                TFXLMMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}")
             )
             self.layer_norm1.append(
-                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i))
+                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}")
             )
             # if self.is_decoder:
             #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
             #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
             self.ffns.append(
-                TFXLMTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i))
+                TFXLMTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}")
             )
             self.layer_norm2.append(
-                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i))
+                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}")
             )

         if hasattr(config, "pruned_heads"):
...
@@ -153,7 +153,7 @@ class MultiHeadAttention(nn.Module):
             klen = qlen if cache is None else cache["slen"] + qlen
         else:
             klen = kv.size(1)
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
         n_heads = self.n_heads
         dim_per_head = self.dim // n_heads
         mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
...
@@ -682,7 +682,7 @@ class XLMTokenizer(PreTrainedTokenizer):
                 import Mykytea

                 self.ja_word_tokenizer = Mykytea.Mykytea(
-                    "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~")
+                    f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
                 )
             except (AttributeError, ImportError):
                 logger.error(
@@ -954,7 +954,7 @@ class XLMTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
@@ -971,8 +971,8 @@ class XLMTokenizer(PreTrainedTokenizer):
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
                     )
                     index = token_index
                 writer.write(" ".join(bpe_tokens) + "\n")
...
@@ -153,7 +153,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}

         for i in range(10):
-            tok = "[unused{}]".format(i)
+            tok = f"[unused{i}]"
             self.fairseq_tokens_to_ids[tok] = 5 + i

         # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
@@ -269,7 +269,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -276,7 +276,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -230,7 +230,7 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -55,7 +55,7 @@ def convert_xlnet_checkpoint_to_pytorch(
     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
     if finetuning_task in GLUE_TASKS_NUM_LABELS:
-        print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
+        print(f"Building PyTorch XLNetForSequenceClassification model from configuration: {config}")
         config.finetuning_task = finetuning_task
         config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
         model = XLNetForSequenceClassification(config)
@@ -71,9 +71,9 @@ def convert_xlnet_checkpoint_to_pytorch(
     # Save pytorch-model
     pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
     pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
-    print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
+    print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
     torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
+    print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
         f.write(config.to_json_string())
...
@@ -69,8 +69,8 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         if config.d_model % config.n_head != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.n_head)
+                f"The hidden size ({config.d_model}) is not a multiple of the number of attention "
+                f"heads ({config.n_head})"
             )

         self.n_head = config.n_head
@@ -455,7 +455,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.word_embedding = TFSharedEmbeddings(
             config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding"
         )
-        self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)]
+        self.layer = [TFXLNetLayer(config, name=f"layer_._{i}") for i in range(config.n_layer)]
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.use_mems_eval = config.use_mems_eval
@@ -550,7 +550,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             # beg, end = klen - 1, -1
             beg, end = klen, -1
         else:
-            raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))
+            raise ValueError(f"Unknown `attn_type` {self.attn_type}.")

         if self.bi_data:
             fwd_pos_seq = tf.range(beg, end, -1.0)
elif self.attn_type == "bi": elif self.attn_type == "bi":
attn_mask = None attn_mask = None
else: else:
raise ValueError("Unsupported attention type: {}".format(self.attn_type)) raise ValueError(f"Unsupported attention type: {self.attn_type}")
# data mask: input mask & perm mask # data mask: input mask & perm mask
assert inputs["input_mask"] is None or inputs["attention_mask"] is None, ( assert inputs["input_mask"] is None or inputs["attention_mask"] is None, (
......
@@ -77,10 +77,10 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
         if (
             hasattr(model, "logits_proj")
             and config.finetuning_task is not None
-            and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights
+            and f"model/regression_{config.finetuning_task}/logit/kernel" in tf_weights
         ):
-            tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight
-            tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias
+            tf_to_pt_map[f"model/regression_{config.finetuning_task}/logit/kernel"] = model.logits_proj.weight
+            tf_to_pt_map[f"model/regression_{config.finetuning_task}/logit/bias"] = model.logits_proj.bias

         # Now load the rest of the transformer
         model = model.transformer
@@ -95,7 +95,7 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
     # Transformer blocks
     for i, b in enumerate(model.layer):
-        layer_str = "model/transformer/layer_%d/" % i
+        layer_str = f"model/transformer/layer_{i}/"
         tf_to_pt_map.update(
             {
                 layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
@@ -156,7 +156,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
     for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
@@ -164,9 +164,9 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
     tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)

     for name, pointer in tf_to_pt_map.items():
-        logger.info("Importing {}".format(name))
+        logger.info(f"Importing {name}")
         if name not in tf_weights:
-            logger.info("{} not in tf pre-trained weights, skipping".format(name))
+            logger.info(f"{name} not in tf pre-trained weights, skipping")
             continue
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
@@ -188,7 +188,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
                 except AssertionError as e:
                     e.args += (p_i.shape, arr_i.shape)
                     raise
-                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
+                logger.info(f"Initialize PyTorch weight {name} for layer {i}")
                 p_i.data = torch.from_numpy(arr_i)
         else:
             try:
@@ -198,13 +198,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
            except AssertionError as e:
                e.args += (pointer.shape, array.shape)
                raise
-            logger.info("Initialize PyTorch weight {}".format(name))
+            logger.info(f"Initialize PyTorch weight {name}")
            pointer.data = torch.from_numpy(array)
        tf_weights.pop(name, None)
        tf_weights.pop(name + "/Adam", None)
        tf_weights.pop(name + "/Adam_1", None)

-    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
+    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
     return model
@@ -214,8 +214,8 @@ class XLNetRelativeAttention(nn.Module):
         if config.d_model % config.n_head != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.n_head)
+                f"The hidden size ({config.d_model}) is not a multiple of the number of attention "
+                f"heads ({config.n_head})"
             )

         self.n_head = config.n_head
@@ -1041,7 +1041,7 @@ class XLNetModel(XLNetPreTrainedModel):
             # beg, end = klen - 1, -1
             beg, end = klen, -1
         else:
-            raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))
+            raise ValueError(f"Unknown `attn_type` {self.attn_type}.")

         if self.bi_data:
             fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
@@ -1145,7 +1145,7 @@ class XLNetModel(XLNetPreTrainedModel):
         elif self.attn_type == "bi":
             attn_mask = None
         else:
-            raise ValueError("Unsupported attention type: {}".format(self.attn_type))
+            raise ValueError(f"Unsupported attention type: {self.attn_type}")

         # data mask: input mask & perm mask
         assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
...
@@ -314,7 +314,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -254,7 +254,7 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
...
@@ -296,13 +296,13 @@ class AdamW(Optimizer):
         correct_bias: bool = True,
     ):
         if lr < 0.0:
-            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
         if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
+            raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0[")
         if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
+            raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0[")
         if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
+            raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
         super().__init__(params, defaults)
...
@@ -333,7 +333,7 @@ class GradientAccumulator(object):
                 ]
             )
         if len(gradients) != len(self._gradients):
-            raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
+            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

         for accum_gradient, gradient in zip(self._gradients, gradients):
             if accum_gradient is not None and gradient is not None:
...
@@ -231,10 +231,10 @@ def check_task(task: str) -> Tuple[Dict, Any]:
         if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
             targeted_task = SUPPORTED_TASKS["translation"]
             return targeted_task, (tokens[1], tokens[3])
-        raise KeyError("Invalid translation task {}, use 'translation_XX_to_YY' format".format(task))
+        raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")

     raise KeyError(
-        "Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()) + ["translation_XX_to_YY"])
+        f"Unknown task {task}, available tasks are {list(SUPPORTED_TASKS.keys()) + ['translation_XX_to_YY']}"
     )
...
@@ -159,7 +159,7 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_options
     defaults = targeted_task["default"]
     if task_options:
         if task_options not in defaults:
-            raise ValueError("The task does not provide any default models for options {}".format(task_options))
+            raise ValueError(f"The task does not provide any default models for options {task_options}")
         default_models = defaults[task_options]["model"]
     elif "model" in defaults:
         default_models = targeted_task["default"]["model"]
@@ -240,11 +240,11 @@ class PipelineDataFormat:
         if output_path is not None and not overwrite:
             if exists(abspath(self.output_path)):
-                raise OSError("{} already exists on disk".format(self.output_path))
+                raise OSError(f"{self.output_path} already exists on disk")

         if input_path is not None:
             if not exists(abspath(self.input_path)):
-                raise OSError("{} doesnt exist on disk".format(self.input_path))
+                raise OSError(f"{self.input_path} doesnt exist on disk")

     @abstractmethod
     def __iter__(self):
@@ -313,7 +313,7 @@ class PipelineDataFormat:
         elif format == "pipe":
             return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
         else:
-            raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format))
+            raise KeyError(f"Unknown reader {format} (Available reader are json/csv/pipe)")


 class CsvPipelineDataFormat(PipelineDataFormat):
@@ -537,7 +537,7 @@ class Pipeline(_ScikitCompat):
         self.tokenizer = tokenizer
         self.modelcard = modelcard
         self.framework = framework
-        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
+        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else f"cuda:{device}")
         self.binary_output = binary_output

         # Special handling
@@ -558,7 +558,7 @@ class Pipeline(_ScikitCompat):
                 A path to the directory where to saved. It will be created if it doesn't exist.
         """
         if os.path.isfile(save_directory):
-            logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
         os.makedirs(save_directory, exist_ok=True)
@@ -596,7 +596,7 @@ class Pipeline(_ScikitCompat):
                 output = pipe(...)
         """
         if self.framework == "tf":
-            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
+            with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
                 yield
         else:
             if self.device.type == "cuda":
...
@@ -94,15 +94,14 @@ class Conversation:
         if self.new_user_input:
             if overwrite:
                 logger.warning(
-                    'User input added while unprocessed input was existing: "{}" was overwritten with: "{}".'.format(
-                        self.new_user_input, text
-                    )
+                    f'User input added while unprocessed input was existing: "{self.new_user_input}" was overwritten '
+                    f'with: "{text}".'
                 )
                 self.new_user_input = text
             else:
                 logger.warning(
-                    'User input added while unprocessed input was existing: "{}" new input ignored: "{}". '
-                    "Set `overwrite` to True to overwrite unprocessed user input".format(self.new_user_input, text)
+                    f'User input added while unprocessed input was existing: "{self.new_user_input}" new input '
+                    f'ignored: "{text}". Set `overwrite` to True to overwrite unprocessed user input'
                 )
         else:
             self.new_user_input = text
@@ -148,10 +147,10 @@ class Conversation:
         Example: Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user >> Going to the movies tonight - any
         suggestions? bot >> The Big Lebowski
         """
-        output = "Conversation id: {} \n".format(self.uuid)
+        output = f"Conversation id: {self.uuid} \n"
         for is_user, text in self.iter_texts():
             name = "user" if is_user else "bot"
-            output += "{} >> {} \n".format(name, text)
+            output += f"{name} >> {text} \n"
         return output
@@ -232,10 +231,8 @@ class ConversationalPipeline(Pipeline):
             ), "ConversationalPipeline expects a Conversation or list of Conversations as an input"
             if conversation.new_user_input is None:
                 raise ValueError(
-                    "Conversation with UUID {} does not contain new user input to process. "
-                    "Add user inputs with the conversation's `add_user_input` method".format(
-                        type(conversation.uuid)
-                    )
+                    f"Conversation with UUID {type(conversation.uuid)} does not contain new user input to process. "
+                    "Add user inputs with the conversation's `add_user_input` method"
                 )
             assert (
                 self.tokenizer.pad_token_id is not None or self.tokenizer.eos_token_id is not None
...