Commit aebd8323 authored by LysandreJik

Update naming + remove f string in run_lm_finetuning example

parent 651bfb7a
@@ -59,7 +59,7 @@ class TextDataset(Dataset):
     def __init__(self, tokenizer, file_path='train', block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
+        cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename)
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args):
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
     probability_matrix *= torch.tensor(
-        [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()],
+        [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
         dtype=torch.float
     )
     masked_indices = torch.bernoulli(probability_matrix).bool()
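For context on the `mask_tokens` hunk above: a minimal standalone sketch (not part of the diff) of how the renamed mask feeds the masking step, using made-up token ids and this commit's convention of 1 for sequence tokens and 0 for special tokens, so special-token positions are never selected for masking.

    # Standalone sketch of the masking step, assuming this commit's 0/1 convention.
    import torch

    mlm_probability = 0.15                                            # 0.15 is the default noted in the example script
    labels = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])       # made-up BERT-style ids: [CLS] ... [SEP]
    special_tokens_mask = torch.tensor([[0, 1, 1, 1, 1, 0]], dtype=torch.float)

    probability_matrix = torch.full(labels.shape, mlm_probability)
    probability_matrix *= special_tokens_mask                         # special-token positions get probability 0.0
    masked_indices = torch.bernoulli(probability_matrix).bool()       # positions chosen for masked-LM training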
@@ -276,7 +276,7 @@ class CommonTestCases:
         assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
         assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input

-    def test_sequence_ids(self):
+    def test_special_tokens_mask(self):
         tokenizer = self.get_tokenizer()

         sequence_0 = "Encode this."
@@ -286,10 +286,10 @@ class CommonTestCases:
         encoded_sequence = tokenizer.encode(sequence_0)
         encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
         encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-        sequence_ids = encoded_sequence_dict["sequence_ids"]
-        assert len(sequence_ids) == len(encoded_sequence_w_special)
-        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+        assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+        filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
         filtered_sequence = [x for x in filtered_sequence if x is not None]
         assert encoded_sequence == filtered_sequence
@@ -297,10 +297,10 @@ class CommonTestCases:
         encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
         encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
         encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-        sequence_ids = encoded_sequence_dict["sequence_ids"]
-        assert len(sequence_ids) == len(encoded_sequence_w_special)
-        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+        assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+        filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
         filtered_sequence = [x for x in filtered_sequence if x is not None]
         assert encoded_sequence == filtered_sequence
@@ -309,10 +309,10 @@ class CommonTestCases:
         tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
         encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
         encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-        sequence_ids_orig = encoded_sequence_dict["sequence_ids"]
-        sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True)
-        assert len(sequence_ids) == len(encoded_sequence_w_special)
-        assert sequence_ids_orig == sequence_ids
+        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
+        special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
+        assert len(special_tokens_mask) == len(encoded_sequence_w_special)
+        assert special_tokens_mask_orig == special_tokens_mask
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
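As a rough illustration (a hypothetical helper, not the library code) of what the renamed `get_special_tokens_mask` is documented to return for the BERT layout shown above (`cls + token_ids_0 + sep + token_ids_1 + sep`), with 0 marking added special tokens and 1 marking sequence tokens:

    def bert_style_special_tokens_mask(token_ids_0, token_ids_1=None):
        # Hypothetical re-implementation for illustration only.
        if token_ids_1 is None:
            return [0] + [1] * len(token_ids_0) + [0]                             # [CLS] A [SEP]
        return [0] + [1] * len(token_ids_0) + [0] + [1] * len(token_ids_1) + [0]  # [CLS] A [SEP] B [SEP]

    assert bert_style_special_tokens_mask([5, 6, 7], [8, 9]) == [0, 1, 1, 1, 0, 1, 1, 0]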
@@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -820,7 +820,7 @@ class PreTrainedTokenizer(object):
                 {
                     input_ids: list[int],
                     overflowing_tokens: list[int] if a ``max_length`` is specified, else None
-                    sequence_ids: list[int] if ``add_special_tokens`` if set to ``True``
+                    special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
                 }

             With the fields:
@@ -828,7 +828,7 @@ class PreTrainedTokenizer(object):
             ``overflowing_tokens``: list of overflowing tokens if a max length is specified.

-            ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+            ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
             tokens and 1 specifying sequence tokens.
         """
         pair = bool(pair_ids is not None)
@@ -857,7 +857,7 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
-            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
+            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -877,6 +877,7 @@ class PreTrainedTokenizer(object):
         if max_length and len(encoded_inputs["input_ids"]) > max_length:
             encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
+            encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

         return encoded_inputs
@@ -892,7 +893,7 @@ class PreTrainedTokenizer(object):
             logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
             return token_ids_0 + token_ids_1

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
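A hedged usage sketch of the renamed output field, assuming the `pytorch_transformers` package name and the API of this era (the sentence and checkpoint are arbitrary):

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoded = tokenizer.encode_plus("Encode this.", add_special_tokens=True)

    input_ids = encoded["input_ids"]
    special_tokens_mask = encoded["special_tokens_mask"]      # 0 = added special token, 1 = sequence token

    # Dropping the special-token positions recovers the plain encoding, as the updated test asserts.
    without_special = [tok for tok, keep in zip(input_ids, special_tokens_mask) if keep]
    assert without_special == tokenizer.encode("Encode this.")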
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls

-    def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.