Commit 0f091062 authored by thomwolf's avatar thomwolf

Merge branch 'glue-example' into tf2

parents c4acc3a8 e4022d96
......@@ -321,11 +321,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score = attn_score.float().masked_fill(
attn_mask[None,:,:,None], -1e30).type_as(attn_score)
if next(self.parameters()).dtype == torch.float16:
attn_score = attn_score.float().masked_fill(
attn_mask[None,:,:,None], -65000).type_as(attn_score)
else:
attn_score = attn_score.float().masked_fill(
attn_mask[None,:,:,None], -1e30).type_as(attn_score)
elif attn_mask.dim() == 3:
attn_score = attn_score.float().masked_fill(
attn_mask[:,:,:,None], -1e30).type_as(attn_score)
if next(self.parameters()).dtype == torch.float16:
attn_score = attn_score.float().masked_fill(
attn_mask[:,:,:,None], -65000).type_as(attn_score)
else:
attn_score = attn_score.float().masked_fill(
attn_mask[:,:,:,None], -1e30).type_as(attn_score)
# [qlen x klen x bsz x n_head]
attn_prob = F.softmax(attn_score, dim=1)
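For context, a minimal standalone sketch (not part of this diff) of the overflow the new fp16 branch guards against: float16 cannot represent magnitudes beyond roughly 65504, so a -1e30 fill saturates to -inf and a fully masked attention row turns into NaNs after the softmax, whereas -65000 stays finite.

import torch
import torch.nn.functional as F

# -1e30 overflows the float16 range (~±65504); -65000 does not.
print(torch.tensor(-1e30).half())    # tensor(-inf, dtype=torch.float16)
print(torch.tensor(-65000.).half())  # tensor(-65000., dtype=torch.float16)

# A row filled entirely with -inf produces NaNs after softmax;
# the finite -65000 fill keeps the row well defined (uniform here).
row_inf = torch.full((4,), float('-inf'))
row_big = torch.full((4,), -65000.)
print(F.softmax(row_inf, dim=-1))  # tensor([nan, nan, nan, nan])
print(F.softmax(row_big, dim=-1))  # tensor([0.2500, 0.2500, 0.2500, 0.2500])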
......@@ -383,7 +391,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
sample_softmax=False):
super(AdaptiveEmbedding, self).__init__()
......@@ -421,7 +429,7 @@ class AdaptiveEmbedding(nn.Module):
else:
param = next(self.parameters())
inp_flat = inp.view(-1)
emb_flat = torch.zeros([inp_flat.size(0), self.d_proj],
dtype=param.dtype, device=param.device)
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
......@@ -547,7 +555,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TransfoXLModel(TransfoXLPreTrainedModel):
r"""
......@@ -587,7 +595,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
self.n_head = config.n_head
self.d_head = config.d_head
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
div_val=config.div_val)
self.drop = nn.Dropout(config.dropout)
......@@ -727,7 +735,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
hids = []
attentions = []
if self.attn_type == 0: # default
pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device,
dtype=word_emb.dtype)
if self.clamp_len > 0:
pos_seq.clamp_(max=self.clamp_len)
......@@ -815,7 +823,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
# use adaptive softmax (including standard softmax)
else:
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
config.cutoffs, div_val=config.div_val)
self.init_weights()
self.tie_weights()
......
......@@ -140,7 +140,7 @@ class PreTrainedModel(nn.Module):
Arguments:
new_num_tokens: (`optional`) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
Return: ``torch.nn.Embeddings``
......@@ -457,7 +457,10 @@ class PoolerStartLogits(nn.Module):
x = self.dense(hidden_states).squeeze(-1)
if p_mask is not None:
x = x * (1 - p_mask) - 1e30 * p_mask
if next(self.parameters()).dtype == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
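A toy sketch (arbitrary values, independent of the model) of the masking arithmetic used above: positions with p_mask == 1 are zeroed and pushed to a large negative logit, so a subsequent softmax effectively ignores them.

import torch

x = torch.tensor([2.0, 1.5, 0.3])
p_mask = torch.tensor([0.0, 0.0, 1.0])        # mask the last position
masked = x * (1 - p_mask) - 65500 * p_mask    # fp16-safe fill value
print(masked)                                 # tensor([ 2.0000, 1.5000, -65500.0000])
print(torch.softmax(masked, dim=-1))          # probability of the last position ~0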
......
......@@ -313,7 +313,7 @@ XLM_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMModel(XLMPreTrainedModel):
r"""
......
......@@ -502,6 +502,12 @@ XLNET_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
the important thing is that they should be different for tokens which belong to different segments.
The model will compute relative segment differences from the given type indices:
0 if the segment id of two tokens are the same, 1 if not.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
......@@ -542,7 +548,7 @@ XLNET_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetModel(XLNetPreTrainedModel):
r"""
......@@ -739,8 +745,9 @@ class XLNetModel(XLNetPreTrainedModel):
if data_mask is not None:
# all mems can be attended to
mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
data_mask = torch.cat([mems_mask, data_mask], dim=1)
if mlen > 0:
mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
data_mask = torch.cat([mems_mask, data_mask], dim=1)
if attn_mask is None:
attn_mask = data_mask[:, :, :, None]
else:
......@@ -751,7 +758,8 @@ class XLNetModel(XLNetPreTrainedModel):
if attn_mask is not None:
non_tgt_mask = -torch.eye(qlen).to(attn_mask)
non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
if mlen > 0:
non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask)
else:
non_tgt_mask = None
......@@ -771,8 +779,11 @@ class XLNetModel(XLNetPreTrainedModel):
##### Segment embedding
if token_type_ids is not None:
# Convert `token_type_ids` to one-hot `seg_mat`
mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
if mlen > 0:
mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
else:
cat_ids = token_type_ids
# `1` indicates not in the same segment [qlen x klen x bsz]
seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
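A toy sketch (hypothetical shapes) of the relative segment encoding described in the docstring above: absolute segment ids only matter through whether two positions share a segment, and memory positions are padded as segment 0 when mlen > 0.

import torch

qlen, mlen, bsz = 3, 2, 1
token_type_ids = torch.tensor([[0], [0], [1]])         # [qlen, bsz]
mem_pad = torch.zeros(mlen, bsz, dtype=torch.long)     # mems attend as segment 0
cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)  # [qlen + mlen, bsz]

# 1 where the two positions belong to different segments, 0 otherwise
seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
print(seg_mat.shape)  # torch.Size([3, 5, 1])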
......@@ -1002,6 +1013,97 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
return outputs # return (loss), logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetForMultipleChoice(XLNetPreTrainedModel):
r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
def __init__(self, config):
super(XLNetForMultipleChoice, self).__init__(config)
self.transformer = XLNetModel(config)
self.sequence_summary = SequenceSummary(config)
self.logits_proj = nn.Linear(config.d_model, 1)
self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None
transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask, attention_mask=flat_attention_mask,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
head_mask=head_mask)
output = transformer_outputs[0]
output = self.sequence_summary(output)
logits = self.logits_proj(output)
reshaped_logits = logits.view(-1, num_choices)
outputs = (reshaped_logits,) + transformer_outputs[1:]  # Keep mems, hidden states and attentions if they are present
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, mems, (hidden states), (attentions)
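The forward pass above folds the choices dimension into the batch so the encoder sees one sequence per (example, choice) pair, then reshapes the per-choice scores back; a toy sketch of that reshape with random tensors (no model involved):

import torch

batch_size, num_choices, seq_len = 2, 4, 7
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 7): choices folded into the batch
# ... the flattened batch would go through the transformer here ...
logits = torch.randn(flat_input_ids.size(0), 1)          # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)           # (2, 4): scores per example
print(flat_input_ids.shape, reshaped_logits.shape)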
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
......@@ -1152,7 +1254,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
Examples::
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
......
......@@ -179,8 +179,9 @@ class CommonTestCases:
if not self.test_head_masking:
return
torch.manual_seed(42)
global_rng.seed(42)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
global_rng.seed()
config.output_attentions = True
config.output_hidden_states = True
......@@ -190,7 +191,7 @@ class CommonTestCases:
model.eval()
# Prepare head_mask
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
head_mask[0, 0] = 0
head_mask[-1, :-1] = 0
......@@ -684,12 +685,13 @@ class ConfigTester(object):
self.create_and_test_config_to_json_file()
global_rng = random.Random()
def ids_tensor(shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
rng = global_rng
total_dims = 1
for dim in shape:
......
......@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
assert encoded_sentence == [101] + text + [102]
assert encoded_pair == [101] + text + [102] + text_2 + [102]
......
......@@ -36,11 +36,13 @@ class DistilBertTokenizationTest(BertTokenizationTest):
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
text_2 + [tokenizer.sep_token_id]
assert encoded_sentence == [101] + text + [102]
assert encoded_pair == [101] + text + [102] + text_2 + [102]
if __name__ == '__main__':
unittest.main()
......@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
assert encoded_sentence == encoded_text_from_decode
assert encoded_pair == encoded_pair_from_decode
......
......@@ -55,6 +55,22 @@ class CommonTestCases:
def get_input_output_texts(self):
raise NotImplementedError
def test_tokenizers_common_properties(self):
tokenizer = self.get_tokenizer()
attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token",
"pad_token", "cls_token", "mask_token"]
for attr in attributes_list:
self.assertTrue(hasattr(tokenizer, attr))
self.assertTrue(hasattr(tokenizer, attr + "_id"))
self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids'))
attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder",
"added_tokens_decoder"]
for attr in attributes_list:
self.assertTrue(hasattr(tokenizer, attr))
def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works
tokenizer = self.get_tokenizer()
......@@ -170,3 +186,92 @@ class CommonTestCases:
for weights_list_2 in weights_lists_2:
self.assertListEqual(weights_list, weights_list_2)
def test_mask_output(self):
if sys.version_info <= (3, 0):
return
tokenizer = self.get_tokenizer()
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
seq_0 = "Test this method."
seq_1 = "With these inputs."
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
sequences, mask = information["input_ids"], information["token_type_ids"]
assert len(sequences) == len(mask)
def test_number_of_added_tokens(self):
tokenizer = self.get_tokenizer()
seq_0 = "Test this method."
seq_1 = "With these inputs."
sequences = tokenizer.encode(seq_0, seq_1)
attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
# Method is implemented (e.g. not GPT-2)
if len(attached_sequences) != 2:
assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)
def test_maximum_encoding_length_single_input(self):
tokenizer = self.get_tokenizer()
seq_0 = "This is a sentence to be encoded."
stride = 2
sequence = tokenizer.encode(seq_0)
num_added_tokens = tokenizer.num_added_tokens()
total_length = len(sequence) + num_added_tokens
information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
assert len(overflowing_tokens) == 2 + stride
assert overflowing_tokens == sequence[-(2 + stride):]
assert len(truncated_sequence) == total_length - 2
assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
def test_maximum_encoding_length_pair_input(self):
tokenizer = self.get_tokenizer()
seq_0 = "This is a sentence to be encoded."
seq_1 = "This is another sentence to be encoded."
stride = 2
sequence_0_no_special_tokens = tokenizer.encode(seq_0)
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
tokenizer.encode(seq_0),
tokenizer.encode(seq_1)[:-2]
)
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
stride=stride, truncate_first_sequence=False)
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
add_special_tokens=True, stride=stride,
truncate_first_sequence=True)
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
assert len(overflowing_tokens) == 2 + stride
assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
assert len(truncated_sequence) == len(sequence) - 2
assert truncated_sequence == truncated_second_sequence
def test_encode_input_type(self):
tokenizer = self.get_tokenizer()
sequence = "Let's encode this sequence"
tokens = tokenizer.tokenize(sequence)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
......@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
assert encoded_sentence == [1] + text + [1]
assert encoded_pair == [1] + text + [1] + text_2 + [1]
......
......@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
assert encoded_sentence == text + [4, 3]
assert encoded_pair == text + [4] + text_2 + [4, 3]
......
......@@ -187,22 +187,35 @@ class BertTokenizer(PreTrainedTokenizer):
out_string = ' '.join(tokens).replace(' ##', '').strip()
return out_string
def add_special_tokens_single_sentence(self, token_ids):
def add_special_tokens_single_sequence(self, token_ids):
"""
Adds special tokens to a sequence for sequence classification tasks.
A BERT sequence has the following format: [CLS] X [SEP]
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
"""
Adds special tokens to a sequence pair for sequence classification tasks.
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
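A toy illustration of what the three BERT helpers above return, using 101/102 for [CLS]/[SEP] (the ids asserted in the tokenizer tests earlier in this diff) and placeholder word-piece ids:

cls_id, sep_id = 101, 102
ids_a = [7, 8, 9]
ids_b = [10, 11]

single = [cls_id] + ids_a + [sep_id]                    # [CLS] A [SEP]
pair = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]   # [CLS] A [SEP] B [SEP]
type_ids = [0] * (1 + len(ids_a) + 1) + [1] * (len(ids_b) + 1)

print(pair)      # [101, 7, 8, 9, 102, 10, 11, 102]
print(type_ids)  # [0, 0, 0, 0, 0, 1, 1, 1]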
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
index = 0
......
......@@ -81,14 +81,14 @@ class RobertaTokenizer(GPT2Tokenizer):
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token, **kwargs)
def add_special_tokens_single_sentence(self, token_ids):
def add_special_tokens_single_sequence(self, token_ids):
"""
Adds special tokens to a sequence for sequence classification tasks.
A RoBERTa sequence has the following format: <s> X </s>
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
"""
Adds special tokens to a sequence pair for sequence classification tasks.
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
......@@ -96,3 +96,15 @@ class RobertaTokenizer(GPT2Tokenizer):
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A RoBERTa sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
\ No newline at end of file
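For comparison with BERT, a toy illustration of the RoBERTa pair layout built above, which places a double separator between the two sequences (placeholder ids standing in for <s> and </s>):

cls_id, sep_id = 0, 2
ids_a, ids_b = [7, 8], [9]

pair = [cls_id] + ids_a + [sep_id] + [sep_id] + ids_b + [sep_id]   # <s> A </s></s> B </s>
type_ids = [0] * (len(ids_a) + 3) + [1] * (len(ids_b) + 1)
print(pair)      # [0, 7, 8, 2, 2, 9, 2]
print(type_ids)  # [0, 0, 0, 0, 0, 1, 1]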
......@@ -165,58 +165,42 @@ class PreTrainedTokenizer(object):
@property
def bos_token_id(self):
""" Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """
if self._bos_token is None:
logger.error("Using bos_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._bos_token)
return self.convert_tokens_to_ids(self.bos_token)
@property
def eos_token_id(self):
""" Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """
if self._eos_token is None:
logger.error("Using eos_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._eos_token)
return self.convert_tokens_to_ids(self.eos_token)
@property
def unk_token_is(self):
def unk_token_id(self):
""" Id of the unknown token in the vocabulary. Log an error if used while not having been set. """
if self._unk_token is None:
logger.error("Using unk_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._unk_token)
return self.convert_tokens_to_ids(self.unk_token)
@property
def sep_token_id(self):
""" Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
if self._sep_token is None:
logger.error("Using sep_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._sep_token)
return self.convert_tokens_to_ids(self.sep_token)
@property
def pad_token_id(self):
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
if self._pad_token is None:
logger.error("Using pad_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._pad_token)
return self.convert_tokens_to_ids(self.pad_token)
@property
def cls_token_id(self):
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
if self._cls_token is None:
logger.error("Using cls_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._cls_token)
return self.convert_tokens_to_ids(self.cls_token)
@property
def mask_token_id(self):
""" Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
if self._mask_token is None:
logger.error("Using mask_token, but it is not set yet.")
return self.convert_tokens_to_ids(self._mask_token)
return self.convert_tokens_to_ids(self.mask_token)
@property
def additional_special_tokens_ids(self):
""" Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
if self._additional_special_tokens is None:
logger.error("Using additional_special_tokens, but it is not set yet.")
return self.convert_tokens_to_ids(self._additional_special_tokens)
return self.convert_tokens_to_ids(self.additional_special_tokens)
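The refactor above routes each *_token_id property through its string counterpart, so the "not set yet" warning is logged in one place; a minimal sketch of the pattern with a hypothetical toy class:

import logging

logger = logging.getLogger(__name__)

class TinyTokenizer:
    """Hypothetical minimal class illustrating the property delegation above."""
    def __init__(self, vocab):
        self.vocab = vocab
        self._bos_token = None

    @property
    def bos_token(self):
        # The string property owns the "not set yet" warning ...
        if self._bos_token is None:
            logger.error("Using bos_token, but it is not set yet.")
        return self._bos_token

    @property
    def bos_token_id(self):
        # ... so the id property can simply delegate to it.
        return self.vocab.get(self.bos_token)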
def __init__(self, max_len=None, **kwargs):
self._bos_token = None
......@@ -537,6 +521,30 @@ class PreTrainedTokenizer(object):
return len(to_add_tokens)
def num_added_tokens(self, pair=False):
"""
Returns the number of added tokens when encoding a sequence with special tokens.
Note:
This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
inside your training loop.
Args:
pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
number of added tokens in the case of a single sequence if set to False.
Returns:
Number of tokens added to sequences
"""
if pair:
initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
else:
initial_tokens_len = len(self.encode("This is a sequence"))
final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
return final_tokens_len - initial_tokens_len
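A rough usage sketch of num_added_tokens, assuming pytorch_transformers and the pretrained 'bert-base-uncased' vocabulary are available: BERT adds [CLS] and [SEP] to a single sequence, and [CLS] plus two [SEP] to a pair.

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.num_added_tokens())            # 2  -> [CLS] ... [SEP]
print(tokenizer.num_added_tokens(pair=True))   # 3  -> [CLS] ... [SEP] ... [SEP]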
def add_special_tokens(self, special_tokens_dict):
"""
......@@ -656,6 +664,9 @@ class PreTrainedTokenizer(object):
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
(resp. a sequence of ids), using the vocabulary.
"""
if tokens is None:
return None
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
return self._convert_token_to_id_with_added_voc(tokens)
......@@ -669,6 +680,9 @@ class PreTrainedTokenizer(object):
return ids
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self.added_tokens_encoder:
return self.added_tokens_encoder[token]
return self._convert_token_to_id(token)
......@@ -679,48 +693,143 @@ class PreTrainedTokenizer(object):
def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
"""
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args:
text: The first sequence to be encoded.
text_pair: Optional second sequence to be encoded.
text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
**kwargs: passed to the `self.tokenize()` method
"""
if is_tf_available():
is_tf_tensor = False
if isinstance(text, tf.Tensor):
text = text.numpy()
is_tf_tensor = True
if isinstance(text, bytes):
text = text.decode('utf-8')
if text_pair is None:
if add_special_tokens:
output = self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs)))
encoded_inputs = self.encode_plus(text, text_pair=text_pair, add_special_tokens=add_special_tokens, **kwargs)
return encoded_inputs["input_ids"]
def encode_plus(self,
text,
text_pair=None,
add_special_tokens=False,
max_length=None,
stride=0,
truncate_first_sequence=True,
**kwargs):
"""
Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args:
text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
will be truncated.
**kwargs: passed to the `self.tokenize()` method
"""
def get_input_ids(text):
if isinstance(text, six.string_types):
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
return self.convert_tokens_to_ids(text)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
output = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
else:
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
if add_special_tokens:
output = self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None
return self.prepare_for_model(first_ids,
pair_ids=second_ids,
max_length=max_length,
add_special_tokens=add_special_tokens,
stride=stride,
truncate_first_sequence=truncate_first_sequence)
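A short usage sketch of encode_plus with the arguments documented above, again assuming a BERT tokenizer is available; the exact ids depend on the vocabulary, but the returned keys are as shown:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
info = tokenizer.encode_plus("Test this method.", "With these inputs.",
                             add_special_tokens=True, max_length=10, stride=2)

print(info["input_ids"])               # pair encoding truncated to max_length
print(info["token_type_ids"])          # 0s for the first sequence, 1s for the second
print(info.get("overflowing_tokens"))  # ids cut from the first sequence plus `stride` ids of context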
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, truncate_first_sequence=True):
"""
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
It adds special tokens, truncates sequences if they overflow while taking the special tokens into account,
and manages a window stride for overflowing tokens.
Args:
ids: list of tokenized input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
list of inputs.
truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
alongside a specified `max_length`, will truncate the first sequence if the total size is greater
than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
Return:
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
"""
pair = bool(pair_ids is not None)
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
encoded_inputs = {}
if max_length:
n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
logger.warning(
"You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
"This pair of sequences will not be truncated.")
else:
output = first_sentence_tokens, second_sentence_tokens
if n_added_tokens + len_ids + len_pair_ids > max_length:
if truncate_first_sequence or not pair:
encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
ids = ids[:max_length - len_pair_ids - n_added_tokens]
elif not truncate_first_sequence and pair:
encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
else:
logger.warning(
"Cannot truncate second sequence as it is not provided. No truncation.")
if add_special_tokens:
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
else:
sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
encoded_inputs["input_ids"] = sequence
encoded_inputs["token_type_ids"] = token_type_ids
if is_tf_available() and is_tf_tensor:
output = tf.constant(output)
return encoded_inputs
return output
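A standalone sketch of the truncation bookkeeping in prepare_for_model, using plain integer ids so no vocabulary is needed (special tokens disabled, so n_added_tokens is 0):

ids = list(range(10))            # first sequence, 10 ids
pair_ids = list(range(20, 25))   # second sequence, 5 ids
max_length, stride, n_added_tokens = 12, 2, 0

# 10 + 5 > 12, so the first sequence is cut down to max_length - len(pair_ids)
overflowing = ids[max_length - len(pair_ids) - n_added_tokens - stride:]
ids = ids[:max_length - len(pair_ids) - n_added_tokens]

print(ids)          # [0, 1, 2, 3, 4, 5, 6]
print(overflowing)  # [5, 6, 7, 8, 9] -> the 3 cut ids plus `stride` ids of left context
print(pair_ids)     # second sequence left untouched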
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
logger.warning("This tokenizer does not make use of special tokens.")
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
def add_special_tokens_single_sentence(self, token_ids):
def add_special_tokens_single_sequence(self, token_ids):
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
return token_ids
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
return token_ids_0 + token_ids_1
......
......@@ -754,14 +754,14 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string = ''.join(tokens).replace('</w>', ' ').strip()
return out_string
def add_special_tokens_single_sentence(self, token_ids):
def add_special_tokens_single_sequence(self, token_ids):
"""
Adds special tokens to a sequence for sequence classification tasks.
An XLM sequence has the following format: [CLS] X [SEP]
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
"""
Adds special tokens to a sequence pair for sequence classification tasks.
An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
......@@ -770,6 +770,18 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLM sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(save_directory):
......
......@@ -181,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
return out_string
def add_special_tokens_single_sentence(self, token_ids):
def add_special_tokens_single_sequence(self, token_ids):
"""
Adds special tokens to a sequence for sequence classification tasks.
An XLNet sequence has the following format: X [SEP][CLS]
......@@ -190,15 +190,29 @@ class XLNetTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id]
return token_ids + sep + cls
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
"""
Adds special tokens to a sequence pair for sequence classification tasks.
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return token_ids_0 + sep + token_ids_1 + sep + cls
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
| first sequence | second sequence | CLS segment ID
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
cls_segment_id = [2]
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
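A toy illustration of the XLNet pair layout (A [SEP] B [SEP][CLS]) and the segment ids built above, where the trailing [CLS] receives its own segment id 2; 4 and 3 are used for [SEP]/[CLS] to match the ids asserted in the XLNet tokenizer tests earlier in this diff:

sep_id, cls_id = 4, 3
ids_a, ids_b = [7, 8], [9, 10]

pair = ids_a + [sep_id] + ids_b + [sep_id] + [cls_id]               # A [SEP] B [SEP][CLS]
segment_ids = [0] * (len(ids_a) + 1) + [1] * (len(ids_b) + 1) + [2]

print(pair)         # [7, 8, 4, 9, 10, 4, 3]
print(segment_ids)  # [0, 0, 0, 1, 1, 1, 2]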
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
......