"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "17450397a7ab5ed21dce0a8bb7694f9f523a24e8"
Unverified commit e84470ef, authored by Lysandre Debut, committed by GitHub

Merge pull request #1384 from huggingface/encoding-qol

Quality of life enhancements in encoding + patch MLM masking
parents 69629c4f 78ef1a99
@@ -9,7 +9,7 @@ similar API between the different models.
 | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. |
-| [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
+| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
 ## Language model fine-tuning
@@ -283,17 +283,17 @@ The results are the following:
 loss = 0.04755385363816904
 ```
-##Multiple Choice
+## Multiple Choice
 Based on the script [`run_multiple_choice.py`]().
 #### Fine-tuning on SWAG
 Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
-```
+```bash
 #training on 4 tesla V100(16GB) GPUS
 export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/single_model_scripts/run_multiple_choice.py \
+python ./examples/run_multiple_choice.py \
 --model_type roberta \
 --task_name swag \
 --model_name_or_path roberta-base \
...
@@ -271,7 +271,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
 list(filter(None, args.model_name_or_path.split('/'))).pop(),
 str(args.max_seq_length),
 str(task)))
-if os.path.exists(cached_features_file):
+if os.path.exists(cached_features_file) and not args.overwrite_cache:
 logger.info("Loading features from cached file %s", cached_features_file)
 features = torch.load(cached_features_file)
 else:
...
@@ -61,7 +61,7 @@ class TextDataset(Dataset):
 def __init__(self, tokenizer, file_path='train', block_size=512):
 assert os.path.isfile(file_path)
 directory, filename = os.path.split(file_path)
-cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
+cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename)
 if os.path.exists(cached_features_file):
 logger.info("Loading features from cached file %s", cached_features_file)
@@ -77,7 +77,7 @@ class TextDataset(Dataset):
 tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
+self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
 # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
 # If your dataset is small, first you should loook for a bigger one :-) and second you
 # can change this behavior by adding (model specific) padding.
@@ -139,7 +139,10 @@ def mask_tokens(inputs, tokenizer, args):
 """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
 labels = inputs.clone()
 # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool()
+probability_matrix = torch.full(labels.shape, args.mlm_probability)
+special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
+probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+masked_indices = torch.bernoulli(probability_matrix).bool()
 labels[~masked_indices] = -1 # We only compute loss on masked tokens
 # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
...
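The hunk above is the "patch MLM masking" part of the PR: positions holding special tokens are now excluded from the masking lottery before sampling. A minimal PyTorch sketch of just that step, with made-up token ids (101/102 standing in for [CLS]/[SEP] and 103 for [MASK]; the 80/10/10 replacement split is left out):

```python
import torch

def mask_tokens_sketch(inputs, special_tokens_mask, mlm_probability=0.15, mask_token_id=103):
    # inputs: LongTensor [batch, seq_len]; special_tokens_mask: same shape,
    # 1 where the position holds a special token, 0 elsewhere.
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    # Give special-token positions probability 0 so they are never selected.
    probability_matrix.masked_fill_(special_tokens_mask.bool(), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1            # loss is only computed on masked positions
    inputs[masked_indices] = mask_token_id  # simplified: always replace with [MASK]
    return inputs, labels

# toy batch: [CLS] w w w [SEP]
inputs = torch.tensor([[101, 200, 201, 202, 102]])
special = torch.tensor([[1, 0, 0, 0, 1]])
mask_tokens_sketch(inputs, special)
```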
@@ -293,7 +293,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
 list(filter(None, args.model_name_or_path.split('/'))).pop(),
 str(args.max_seq_length),
 str(task)))
-if os.path.exists(cached_features_file):
+if os.path.exists(cached_features_file) and not args.overwrite_cache:
 logger.info("Loading features from cached file %s", cached_features_file)
 features = torch.load(cached_features_file)
 else:
@@ -306,14 +306,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
 else:
 examples = processor.get_train_examples(args.data_dir)
 logger.info("Training number: %s", str(len(examples)))
-features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
-cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end
-cls_token=tokenizer.cls_token,
-sep_token=tokenizer.sep_token,
-sep_token_extra=bool(args.model_type in ['roberta']),
-cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
-pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
-pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+features = convert_examples_to_features(
+examples,
+label_list,
+args.max_seq_length,
+tokenizer,
+pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
+pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+)
 if args.local_rank in [-1, 0]:
 logger.info("Saving features into cached file %s", cached_features_file)
 torch.save(features, cached_features_file)
@@ -362,7 +362,7 @@ def main():
 help="Whether to run eval on the dev set.")
 parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
 parser.add_argument("--evaluate_during_training", action='store_true',
-help="Rul evaluation during training at each logging step.")
+help="Run evaluation during training at each logging step.")
 parser.add_argument("--do_lower_case", action='store_true',
 help="Set this flag if you are using an uncased model.")
...
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BERT multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
+""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
 from __future__ import absolute_import, division, print_function
@@ -26,6 +26,8 @@ import json
 import csv
 import glob
 import tqdm
+from typing import List
+from transformers import PreTrainedTokenizer
 logger = logging.getLogger(__name__)
@@ -34,13 +36,13 @@ logger = logging.getLogger(__name__)
 class InputExample(object):
 """A single training/test example for multiple choice"""
 def __init__(self, example_id, question, contexts, endings, label=None):
 """Constructs a InputExample.
 Args:
 example_id: Unique id for the example.
 contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-question: string. The untokenized text of the second sequence (qustion).
+question: string. The untokenized text of the second sequence (question).
 endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
 label: (Optional) string. The label of the example. This should be
 specified for train and dev examples, but not for test examples.
@@ -66,7 +68,7 @@ class InputFeatures(object):
 'input_mask': input_mask,
 'segment_ids': segment_ids
 }
-for _, input_ids, input_mask, segment_ids in choices_features
+for input_ids, input_mask, segment_ids in choices_features
 ]
 self.label = label
@@ -192,7 +194,7 @@ class SwagProcessor(DataProcessor):
 return lines
-def _create_examples(self, lines, type):
+def _create_examples(self, lines: List[List[str]], type: str):
 """Creates examples for the training and dev sets."""
 if type == "train" and lines[0][-1] != 'label':
 raise ValueError(
@@ -300,24 +302,18 @@ class ArcProcessor(DataProcessor):
 return examples
-def convert_examples_to_features(examples, label_list, max_seq_length,
-tokenizer,
-cls_token_at_end=False,
-cls_token='[CLS]',
-cls_token_segment_id=1,
-sep_token='[SEP]',
-sequence_a_segment_id=0,
-sequence_b_segment_id=1,
-sep_token_extra=False,
-pad_token_segment_id=0,
-pad_on_left=False,
-pad_token=0,
-mask_padding_with_zero=True):
-""" Loads a data file into a list of `InputBatch`s
-`cls_token_at_end` define the location of the CLS token:
-- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+def convert_examples_to_features(
+examples: List[InputExample],
+label_list: List[str],
+max_length: int,
+tokenizer: PreTrainedTokenizer,
+pad_token_segment_id=0,
+pad_on_left=False,
+pad_token=0,
+mask_padding_with_zero=True,
+) -> List[InputFeatures]:
+"""
+Loads a data file into a list of `InputFeatures`
 """
 label_map = {label : i for i, label in enumerate(label_list)}
@@ -328,125 +324,70 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
 logger.info("Writing example %d of %d" % (ex_index, len(examples)))
 choices_features = []
 for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
-tokens_a = tokenizer.tokenize(context)
-tokens_b = None
+text_a = context
 if example.question.find("_") != -1:
-#this is for cloze question
-tokens_b = tokenizer.tokenize(example.question.replace("_", ending))
-else:
-tokens_b = tokenizer.tokenize(example.question + " " + ending)
-# you can add seq token between quesiotn and ending. This does not make too much difference.
-# tokens_b = tokenizer.tokenize(example.question)
-# tokens_b += [sep_token]
-# if sep_token_extra:
-# tokens_b += [sep_token]
-# tokens_b += tokenizer.tokenize(ending)
-special_tokens_count = 4 if sep_token_extra else 3
-_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-# The convention in BERT is:
-# (a) For sequence pairs:
-# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
-# (b) For single sequences:
-# tokens: [CLS] the dog is hairy . [SEP]
-# type_ids: 0 0 0 0 0 0 0
-#
-# Where "type_ids" are used to indicate whether this is the first
-# sequence or the second sequence. The embedding vectors for `type=0` and
-# `type=1` were learned during pre-training and are added to the wordpiece
-# embedding vector (and position vector). This is not *strictly* necessary
-# since the [SEP] token unambiguously separates the sequences, but it makes
-# it easier for the model to learn the concept of sequences.
-#
-# For classification tasks, the first vector (corresponding to [CLS]) is
-# used as as the "sentence vector". Note that this only makes sense because
-# the entire model is fine-tuned.
-tokens = tokens_a + [sep_token]
-if sep_token_extra:
-# roberta uses an extra separator b/w pairs of sentences
-tokens += [sep_token]
-segment_ids = [sequence_a_segment_id] * len(tokens)
-if tokens_b:
-tokens += tokens_b + [sep_token]
-segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-if cls_token_at_end:
-tokens = tokens + [cls_token]
-segment_ids = segment_ids + [cls_token_segment_id]
+# this is for cloze question
+text_b = example.question.replace("_", ending)
 else:
-tokens = [cls_token] + tokens
-segment_ids = [cls_token_segment_id] + segment_ids
-input_ids = tokenizer.convert_tokens_to_ids(tokens)
+text_b = example.question + " " + ending
+inputs = tokenizer.encode_plus(
+text_a,
+text_b,
+add_special_tokens=True,
+max_length=max_length,
+)
+if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
+logger.info('Attention! you are cropping tokens (swag task is ok). '
+'If you are training ARC and RACE and you are poping question + options,'
+'you need to try to use a bigger max seq length!')
+input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
 # The mask has 1 for real tokens and 0 for padding tokens. Only real
 # tokens are attended to.
-input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
 # Zero-pad up to the sequence length.
-padding_length = max_seq_length - len(input_ids)
+padding_length = max_length - len(input_ids)
 if pad_on_left:
 input_ids = ([pad_token] * padding_length) + input_ids
-input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
 else:
 input_ids = input_ids + ([pad_token] * padding_length)
-input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
-assert len(input_ids) == max_seq_length
-assert len(input_mask) == max_seq_length
-assert len(segment_ids) == max_seq_length
-choices_features.append((tokens, input_ids, input_mask, segment_ids))
+attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+assert len(input_ids) == max_length
+assert len(attention_mask) == max_length
+assert len(token_type_ids) == max_length
+choices_features.append((input_ids, attention_mask, token_type_ids))
 label = label_map[example.label]
 if ex_index < 2:
 logger.info("*** Example ***")
 logger.info("race_id: {}".format(example.example_id))
-for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
+for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
 logger.info("choice: {}".format(choice_idx))
-logger.info("tokens: {}".format(' '.join(tokens)))
 logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
+logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
 logger.info("label: {}".format(label))
 features.append(
 InputFeatures(
-example_id = example.example_id,
-choices_features = choices_features,
-label = label
+example_id=example.example_id,
+choices_features=choices_features,
+label=label,
 )
 )
 return features
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-"""Truncates a sequence pair in place to the maximum length."""
-# This is a simple heuristic which will always truncate the longer sequence
-# one token at a time. This makes more sense than truncating an equal percent
-# of tokens from each, since if one sequence is very short then each token
-# that's truncated likely contains more information than a longer sequence.
-# However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
-# length or only pop from context
-while True:
-total_length = len(tokens_a) + len(tokens_b)
-if total_length <= max_length:
-break
-if len(tokens_a) > len(tokens_b):
-tokens_a.pop()
-else:
-logger.info('Attention! you are removing from token_b (swag task is ok). '
-'If you are training ARC and RACE (you are poping question + options), '
-'you need to try to use a bigger max seq length!')
-tokens_b.pop()
 processors = {
@@ -456,7 +397,7 @@ processors = {
 }
-GLUE_TASKS_NUM_LABELS = {
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
 "race", 4,
 "swag", 4,
 "arc", 4
...
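The rewritten converter above boils down to building one (text_a, text_b) pair per candidate ending and letting `encode_plus` handle special tokens and truncation. A small standalone sketch of the pairing logic only (the SWAG-style inputs are invented for illustration):

```python
def build_choice_pairs(contexts, question, endings):
    # One (text_a, text_b) pair per ending; cloze-style questions that contain "_"
    # get the ending substituted in place of the blank, as in the diff above.
    pairs = []
    for context, ending in zip(contexts, endings):
        if "_" in question:
            text_b = question.replace("_", ending)
        else:
            text_b = question + " " + ending
        pairs.append((context, text_b))
    return pairs

pairs = build_choice_pairs(
    contexts=["A man is playing a piano."] * 2,
    question="He",
    endings=["continues to play.", "starts to dance."],
)
# [('A man is playing a piano.', 'He continues to play.'),
#  ('A man is playing a piano.', 'He starts to dance.')]
```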
@@ -86,7 +86,6 @@ def glue_convert_examples_to_features(examples, tokenizer,
 example.text_b,
 add_special_tokens=True,
 max_length=max_length,
-truncate_first_sequence=True # We're truncating the first sequence in priority
 )
 input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
...
@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
 text = tokenizer.encode("sequence builders")
 text_2 = tokenizer.encode("multi-sequence build")
-encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 assert encoded_sentence == [101] + text + [102]
 assert encoded_pair == [101] + text + [102] + text_2 + [102]
...
@@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
 text = tokenizer.encode("sequence builders")
 text_2 = tokenizer.encode("multi-sequence build")
-encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
 assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
...
@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
 encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
 encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
-encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 assert encoded_sentence == encoded_text_from_decode
 assert encoded_pair == encoded_pair_from_decode
...
@@ -193,12 +193,12 @@ class CommonTestCases:
 tokenizer = self.get_tokenizer()
-if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
 seq_0 = "Test this method."
 seq_1 = "With these inputs."
 information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
 sequences, mask = information["input_ids"], information["token_type_ids"]
-assert len(sequences) == len(mask)
+self.assertEqual(len(sequences), len(mask))
 def test_number_of_added_tokens(self):
 tokenizer = self.get_tokenizer()
@@ -211,7 +211,7 @@ class CommonTestCases:
 # Method is implemented (e.g. not GPT-2)
 if len(attached_sequences) != 2:
-assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)
+self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
 def test_maximum_encoding_length_single_input(self):
 tokenizer = self.get_tokenizer()
@@ -227,10 +227,10 @@ class CommonTestCases:
 truncated_sequence = information["input_ids"]
 overflowing_tokens = information["overflowing_tokens"]
-assert len(overflowing_tokens) == 2 + stride
-assert overflowing_tokens == sequence[-(2 + stride):]
-assert len(truncated_sequence) == total_length - 2
-assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
+self.assertEqual(len(overflowing_tokens), 2 + stride)
+self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
+self.assertEqual(len(truncated_sequence), total_length - 2)
+self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
 def test_maximum_encoding_length_pair_input(self):
 tokenizer = self.get_tokenizer()
@@ -243,26 +243,26 @@ class CommonTestCases:
 sequence_1_no_special_tokens = tokenizer.encode(seq_1)
 sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
-truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
+truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
 tokenizer.encode(seq_0),
 tokenizer.encode(seq_1)[:-2]
 )
 information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-stride=stride, truncate_first_sequence=False)
+stride=stride, truncation_strategy='only_second')
 information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
 add_special_tokens=True, stride=stride,
-truncate_first_sequence=True)
+truncation_strategy='only_first')
 truncated_sequence = information["input_ids"]
 overflowing_tokens = information["overflowing_tokens"]
 overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
-assert len(overflowing_tokens) == 2 + stride
-assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
-assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
-assert len(truncated_sequence) == len(sequence) - 2
-assert truncated_sequence == truncated_second_sequence
+self.assertEqual(len(overflowing_tokens), 2 + stride)
+self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
+self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
+self.assertEqual(len(truncated_sequence), len(sequence) - 2)
+self.assertEqual(truncated_sequence, truncated_second_sequence)
 def test_encode_input_type(self):
 tokenizer = self.get_tokenizer()
@@ -273,5 +273,43 @@ class CommonTestCases:
 input_ids = tokenizer.convert_tokens_to_ids(tokens)
 formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
-assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
-assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
+self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
+self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
+def test_special_tokens_mask(self):
+tokenizer = self.get_tokenizer()
+sequence_0 = "Encode this."
+sequence_1 = "This one too please."
+# Testing single inputs
+encoded_sequence = tokenizer.encode(sequence_0)
+encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
+encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+filtered_sequence = [x for x in filtered_sequence if x is not None]
+self.assertEqual(encoded_sequence, filtered_sequence)
+# Testing inputs pairs
+encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
+encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
+encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+filtered_sequence = [x for x in filtered_sequence if x is not None]
+self.assertEqual(encoded_sequence, filtered_sequence)
+# Testing with already existing special tokens
+if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
+tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
+encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
+encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
+special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
+self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
 text = tokenizer.encode("sequence builders")
 text_2 = tokenizer.encode("multi-sequence build")
-encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 assert encoded_sentence == [1] + text + [1]
 assert encoded_pair == [1] + text + [1] + text_2 + [1]
...
@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
 text = tokenizer.encode("sequence builders")
 text_2 = tokenizer.encode("multi-sequence build")
-encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 assert encoded_sentence == text + [4, 3]
 assert encoded_pair == text + [4] + text_2 + [4, 3]
...
@@ -187,33 +187,59 @@ class BertTokenizer(PreTrainedTokenizer):
 out_string = ' '.join(tokens).replace(' ##', '').strip()
 return out_string
-def add_special_tokens_single_sequence(self, token_ids):
+def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
 """
-Adds special tokens to the a sequence for sequence classification tasks.
-A BERT sequence has the following format: [CLS] X [SEP]
+Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+by concatenating and adding special tokens.
+A BERT sequence has the following format:
+single sequence: [CLS] X [SEP]
+pair of sequences: [CLS] A [SEP] B [SEP]
 """
-return [self.cls_token_id] + token_ids + [self.sep_token_id]
-def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
+if token_ids_1 is None:
+return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+cls = [self.cls_token_id]
+sep = [self.sep_token_id]
+return cls + token_ids_0 + sep + token_ids_1 + sep
+def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
 """
-Adds special tokens to a sequence pair for sequence classification tasks.
-A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
+Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+Args:
+token_ids_0: list of ids (must not contain special tokens)
+token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+for sequence pairs
+already_has_special_tokens: (default False) Set to True if the token list is already formated with
+special tokens for the model
+Returns:
+A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
 """
-sep = [self.sep_token_id]
-cls = [self.cls_token_id]
-return cls + token_ids_0 + sep + token_ids_1 + sep
-def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+if already_has_special_tokens:
+if token_ids_1 is not None:
+raise ValueError("You should not supply a second sequence if the provided sequence of "
+"ids is already formated with special tokens for the model.")
+return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+if token_ids_1 is not None:
+return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+return [1] + ([0] * len(token_ids_0)) + [1]
+def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
 """
 Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
 A BERT sequence pair mask has the following format:
 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 | first sequence | second sequence
+if token_ids_1 is None, only returns the first portion of the mask (0's).
 """
 sep = [self.sep_token_id]
 cls = [self.cls_token_id]
+if token_ids_1 is None:
+return len(cls + token_ids_0 + sep) * [0]
 return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 def save_vocabulary(self, vocab_path):
...
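To make the new BERT builders concrete, a small standalone sketch of the three outputs for a pair of sequences (101/102 are the [CLS]/[SEP] ids of bert-base-uncased; the content ids are made up):

```python
CLS, SEP = 101, 102
ids_a, ids_b = [7592, 2088], [2129, 2024, 2017]

# build_inputs_with_special_tokens -> [CLS] A [SEP] B [SEP]
pair = [CLS] + ids_a + [SEP] + ids_b + [SEP]
# [101, 7592, 2088, 102, 2129, 2024, 2017, 102]

# get_special_tokens_mask, as implemented above (1 marks an added special token)
mask = [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]
# [1, 0, 0, 1, 0, 0, 0, 1]

# create_token_type_ids_from_sequences -> 0 for "[CLS] A [SEP]", 1 for "B [SEP]"
token_type_ids = [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)
# [0, 0, 0, 0, 1, 1, 1, 1]
```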
@@ -84,30 +84,57 @@ class RobertaTokenizer(GPT2Tokenizer):
 self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
 self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
-def add_special_tokens_single_sequence(self, token_ids):
+def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
 """
-Adds special tokens to a sequence for sequence classification tasks.
-A RoBERTa sequence has the following format: <s> X </s>
+Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+by concatenating and adding special tokens.
+A RoBERTa sequence has the following format:
+single sequence: <s> X </s>
+pair of sequences: <s> A </s></s> B </s>
 """
-return [self.cls_token_id] + token_ids + [self.sep_token_id]
-def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
+if token_ids_1 is None:
+return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+cls = [self.cls_token_id]
+sep = [self.sep_token_id]
+return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
 """
-Adds special tokens to a sequence pair for sequence classification tasks.
-A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
+Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+Args:
+token_ids_0: list of ids (must not contain special tokens)
+token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+for sequence pairs
+already_has_special_tokens: (default False) Set to True if the token list is already formated with
+special tokens for the model
+Returns:
+A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
 """
-sep = [self.sep_token_id]
-cls = [self.cls_token_id]
-return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+if already_has_special_tokens:
+if token_ids_1 is not None:
+raise ValueError("You should not supply a second sequence if the provided sequence of "
+"ids is already formated with special tokens for the model.")
+return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+if token_ids_1 is None:
+return [1] + ([0] * len(token_ids_0)) + [1]
+return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
 """
 Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
 A RoBERTa sequence pair mask has the following format:
 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 | first sequence | second sequence
+if token_ids_1 is None, only returns the first portion of the mask (0's).
 """
 sep = [self.sep_token_id]
 cls = [self.cls_token_id]
-return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
\ No newline at end of file
+if token_ids_1 is None:
+return len(cls + token_ids_0 + sep) * [0]
+return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
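Same exercise for the RoBERTa builder above, which doubles the separator between the two segments (0 and 2 are the <s> and </s> ids of the roberta-base vocab, used here only for illustration; the content ids are made up):

```python
CLS, SEP = 0, 2          # <s>, </s>
ids_a, ids_b = [713, 16, 10], [24, 16, 45]

single = [CLS] + ids_a + [SEP]                        # <s> A </s>
pair = [CLS] + ids_a + [SEP] + [SEP] + ids_b + [SEP]  # <s> A </s></s> B </s>
```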
@@ -539,15 +539,9 @@ class PreTrainedTokenizer(object):
 Returns:
 Number of tokens added to sequences
 """
-if pair:
-initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
-final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
-else:
-initial_tokens_len = len(self.encode("This is a sequence"))
-final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
-return final_tokens_len - initial_tokens_len
+token_ids_0 = []
+token_ids_1 = []
+return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
 def add_special_tokens(self, special_tokens_dict):
 """
@@ -699,7 +693,7 @@ class PreTrainedTokenizer(object):
 add_special_tokens=False,
 max_length=None,
 stride=0,
-truncate_first_sequence=True,
+truncation_strategy='longest_first',
 return_tensors=None,
 **kwargs):
 """
@@ -719,9 +713,13 @@ class PreTrainedTokenizer(object):
 max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
 If there are overflowing tokens, those will be added to the returned dictionary
 stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-from the main sequence returned. The value of this argument defined the number of additional tokens.
-truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
-will be truncated.
+from the main sequence returned. The value of this argument defines the number of additional tokens.
+truncation_strategy: string selected in the following options:
+- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+starting from the longest one at each token (when there is a pair of input sequences)
+- 'only_first': Only truncate the first sequence
+- 'only_second': Only truncate the second sequence
+- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
 return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
 or PyTorch torch.Tensor instead of a list of python integers.
 **kwargs: passed to the `self.tokenize()` method
@@ -731,7 +729,7 @@ class PreTrainedTokenizer(object):
 max_length=max_length,
 add_special_tokens=add_special_tokens,
 stride=stride,
-truncate_first_sequence=truncate_first_sequence,
+truncation_strategy=truncation_strategy,
 return_tensors=return_tensors,
 **kwargs)
@@ -743,7 +741,7 @@ class PreTrainedTokenizer(object):
 add_special_tokens=False,
 max_length=None,
 stride=0,
-truncate_first_sequence=True,
+truncation_strategy='longest_first',
 return_tensors=None,
 **kwargs):
@@ -762,9 +760,13 @@ class PreTrainedTokenizer(object):
 max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
 If there are overflowing tokens, those will be added to the returned dictionary
 stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-from the main sequence returned. The value of this argument defined the number of additional tokens.
-truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
-will be truncated.
+from the main sequence returned. The value of this argument defines the number of additional tokens.
+truncation_strategy: string selected in the following options:
+- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+starting from the longest one at each token (when there is a pair of input sequences)
+- 'only_first': Only truncate the first sequence
+- 'only_second': Only truncate the second sequence
+- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
 return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
 or PyTorch torch.Tensor instead of a list of python integers.
 **kwargs: passed to the `self.tokenize()` method
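A hypothetical call exercising the `truncation_strategy` option documented above; the tokenizer is a pretrained BertTokenizer from this version of the library, so the snippet downloads a vocabulary on first run and the exact token counts depend on it:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
enc = tokenizer.encode_plus(
    "a very long first sentence " * 20,
    "a short second sentence",
    add_special_tokens=True,
    max_length=32,
    stride=2,
    truncation_strategy="only_first",  # drop tokens from the first sequence only
)
# enc["input_ids"]           -> truncated to max_length, special tokens included
# enc["overflowing_tokens"]  -> ids removed from the first sequence plus a 2-token stride window
# enc["special_tokens_mask"] -> marks the positions occupied by [CLS]/[SEP]
```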
@@ -788,12 +790,11 @@ class PreTrainedTokenizer(object):
 max_length=max_length,
 add_special_tokens=add_special_tokens,
 stride=stride,
-truncate_first_sequence=truncate_first_sequence,
+truncation_strategy=truncation_strategy,
 return_tensors=return_tensors)
 def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
-truncate_first_sequence=True, return_tensors=None):
+truncation_strategy='longest_first', return_tensors=None):
 """
 Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
 It adds special tokens, truncates
@@ -810,41 +811,50 @@ class PreTrainedTokenizer(object):
 to their model.
 stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
 list of inputs.
-truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
-alongside a specified `max_length`, will truncate the first sequence if the total size is superior
-than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
+truncation_strategy: string selected in the following options:
+- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+starting from the longest one at each token (when there is a pair of input sequences)
+- 'only_first': Only truncate the first sequence
+- 'only_second': Only truncate the second sequence
+- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
 return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
 or PyTorch torch.Tensor instead of a list of python integers.
 Return:
-a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+A Dictionary of shape::
+{
+input_ids: list[int],
+overflowing_tokens: list[int] if a ``max_length`` is specified, else None
+special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
+}
+With the fields:
+``input_ids``: list of tokens to be fed to a model
+``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+tokens and 1 specifying sequence tokens.
 """
 pair = bool(pair_ids is not None)
 len_ids = len(ids)
 len_pair_ids = len(pair_ids) if pair else 0
 encoded_inputs = {}
-if max_length:
-n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
-if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
-logger.warning(
-"You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
-"This pair of sequences will not be truncated.")
-else:
-if n_added_tokens + len_ids + len_pair_ids > max_length:
-if truncate_first_sequence or not pair:
-encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
-ids = ids[:max_length - len_pair_ids - n_added_tokens]
-elif not truncate_first_sequence and pair:
-encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
-pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
-else:
-logger.warning(
-"Cannot truncate second sequence as it is not provided. No truncation.")
+total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
+if max_length and total_len > max_length:
+ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
+num_tokens_to_remove=total_len-max_length,
+truncation_strategy=truncation_strategy,
+stride=stride)
+encoded_inputs["overflowing_tokens"] = overflowing_tokens
+encoded_inputs["num_truncated_tokens"] = total_len - max_length
 if add_special_tokens:
-sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
-token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
+sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
 else:
 sequence = ids + pair_ids if pair else ids
 token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -861,20 +871,89 @@ class PreTrainedTokenizer(object):
 encoded_inputs["input_ids"] = sequence
 encoded_inputs["token_type_ids"] = token_type_ids
+if max_length and len(encoded_inputs["input_ids"]) > max_length:
+encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
+encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
+encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
 return encoded_inputs
-def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
+"""Truncates a sequence pair in place to the maximum length.
+truncation_strategy: string selected in the following options:
+- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+starting from the longest one at each token (when there is a pair of input sequences).
+Overflowing tokens only contains overflow from the first sequence.
+- 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
+- 'only_second': Only truncate the second sequence
+- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+"""
+if num_tokens_to_remove <= 0:
+return ids, pair_ids, []
+if truncation_strategy == 'longest_first':
+overflowing_tokens = []
+for _ in range(num_tokens_to_remove):
+if pair_ids is None or len(ids) > len(pair_ids):
+overflowing_tokens = [ids[-1]] + overflowing_tokens
+ids = ids[:-1]
+else:
+pair_ids = pair_ids[:-1]
+window_len = min(len(ids), stride)
+if window_len > 0:
+overflowing_tokens = ids[-window_len:] + overflowing_tokens
+elif truncation_strategy == 'only_first':
+assert len(ids) > num_tokens_to_remove
+window_len = min(len(ids), stride + num_tokens_to_remove)
+overflowing_tokens = ids[-window_len:]
+ids = ids[:-num_tokens_to_remove]
+elif truncation_strategy == 'only_second':
+assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
+window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+overflowing_tokens = pair_ids[-window_len:]
+pair_ids = pair_ids[:-num_tokens_to_remove]
+elif truncation_strategy == 'do_not_truncate':
+raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
+else:
+raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
+return (ids, pair_ids, overflowing_tokens)
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
logger.warning("This tokenizer does not make use of special tokens.") logger.warning("This tokenizer does not make use of special tokens.")
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1) return [0] * len(token_ids_0) + [1] * len(token_ids_1)
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens. This base class adds no special tokens and simply
        concatenates the inputs; model-specific tokenizers override this method.
        """
        logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves the special tokens mask for a sequence (or a pair of sequences) that has no special tokens
        added yet. This method is called when adding special tokens using the tokenizer ``prepare_for_model``
        or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted
                with special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
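Taken together, the three base-class methods above add nothing model-specific: inputs are concatenated as-is, token type ids just mark which segment a position came from, and the special-tokens mask is all zeros. A toy illustration with made-up ids:

```python
token_ids_0 = [11, 12, 13]   # first sequence, already converted to ids
token_ids_1 = [21, 22]       # second sequence

# build_inputs_with_special_tokens: plain concatenation, no special tokens added.
inputs = token_ids_0 + token_ids_1
# create_token_type_ids_from_sequences: 0 for the first segment, 1 for the second.
token_type_ids = [0] * len(token_ids_0) + [1] * len(token_ids_1)
# get_special_tokens_mask: no position is a special token.
special_tokens_mask = [0] * (len(token_ids_0) + len(token_ids_1))

assert inputs == [11, 12, 13, 21, 22]
assert token_type_ids == [0, 0, 0, 1, 1]
assert special_tokens_mask == [0, 0, 0, 0, 0]
```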
    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) to a token
            (resp. a sequence of tokens) (str/unicode), using the vocabulary and added tokens.
...
...@@ -754,32 +754,59 @@ class XLMTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        An XLM sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
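A sketch of the resulting layout, with `cls_token_id` and `sep_token_id` replaced by assumed placeholder values (0 and 1) purely for illustration:

```python
cls, sep = [0], [1]          # placeholder ids standing in for [CLS] and [SEP]
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]

single = cls + token_ids_0 + sep                    # [CLS] X [SEP]
pair = cls + token_ids_0 + sep + token_ids_1 + sep  # [CLS] A [SEP] B [SEP]

assert single == [0, 11, 12, 13, 1]
assert pair == [0, 11, 12, 13, 1, 21, 22, 1]
```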
    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves the special tokens mask for a sequence (or a pair of sequences) that has no special tokens
        added yet. This method is called when adding special tokens using the tokenizer ``prepare_for_model``
        or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted
                with special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formatted with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
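For the same toy pair as above (3 tokens + 2 tokens), the mask marks exactly the [CLS]/[SEP] positions of the `[CLS] A [SEP] B [SEP]` layout:

```python
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]

mask = [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
assert mask == [1, 0, 0, 0, 1, 0, 0, 1]   # 1 = special token, 0 = sequence token
```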
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        An XLM sequence pair mask has the following format:
            0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
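Continuing the toy pair, the segment ids cover `[CLS] A [SEP]` with 0's and `B [SEP]` with 1's:

```python
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]
cls, sep = [0], [1]          # placeholder special-token ids, illustration only

token_type_ids = len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
assert token_type_ids == [0, 0, 0, 0, 0, 1, 1, 1]
```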
    def save_vocabulary(self, save_directory):
...
...@@ -181,36 +181,61 @@ class XLNetTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        An XLNet sequence has the following format:
            single sequence: X [SEP][CLS]
            pair of sequences: A [SEP] B [SEP][CLS]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return token_ids_0 + sep + cls
        return token_ids_0 + sep + token_ids_1 + sep + cls
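Unlike XLM, XLNet appends its special tokens at the end. With the same placeholder ids (sep = 1, cls = 0, used purely for illustration) this gives:

```python
sep, cls = [1], [0]          # placeholder ids standing in for [SEP] and [CLS]
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]

single = token_ids_0 + sep + cls                    # X [SEP][CLS]
pair = token_ids_0 + sep + token_ids_1 + sep + cls  # A [SEP] B [SEP][CLS]

assert single == [11, 12, 13, 1, 0]
assert pair == [11, 12, 13, 1, 21, 22, 1, 0]
```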
    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves the special tokens mask for a sequence (or a pair of sequences) that has no special tokens
        added yet. This method is called when adding special tokens using the tokenizer ``prepare_for_model``
        or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted
                with special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formatted with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
        return ([0] * len(token_ids_0)) + [1, 1]
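For the toy pair, the mask flags the [SEP]/[SEP]/[CLS] positions of `A [SEP] B [SEP][CLS]`:

```python
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]

mask = [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1, 1]
assert mask == [0, 0, 0, 1, 0, 0, 1, 1]   # 1 = special token, 0 = sequence token
```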
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        An XLNet sequence pair mask has the following format:
            0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
            | first sequence    | second sequence    | CLS segment ID
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        cls_segment_id = [2]
        if token_ids_1 is None:
            return len(token_ids_0 + sep + cls) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
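And the matching segment ids for the toy pair, where the final [CLS] gets its own segment id of 2:

```python
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]
sep, cls = [1], [0]          # placeholder special-token ids, illustration only

token_type_ids = len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + [2]
assert token_type_ids == [0, 0, 0, 0, 1, 1, 1, 2]
```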
    def save_vocabulary(self, save_directory):
...