"...resnet50_tensorflow.git" did not exist on "18d627910061a815f68a266c31bc3c21b361e4bb"
Commit bac332fe authored by LysandreJik

Updated the GLUE data processor. Corrections to RoBERTa and XLNet.

parent c3df2136
@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index % 10000 == 0:
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        tokens_a = tokenizer.tokenize(example.text_a)
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-            special_tokens_count = 3 if sep_token_extra else 2
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1  1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = tokens_a + [sep_token]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-        if tokens_b:
-            tokens += tokens_b + [sep_token]
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-        if cls_token_at_end:
-            tokens = tokens + [cls_token]
-            segment_ids = segment_ids + [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            segment_ids = [cls_token_segment_id] + segment_ids
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
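For reference, the `_truncate_seq_pair` helper called in the removed block above trims the longer of the two token lists one token at a time until the pair fits the length budget, which keeps more of the shorter sequence than cutting both by an equal fraction. A sketch of that helper as it appears in the BERT-style GLUE utilities (not part of this diff):

```python
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place so the combined length fits max_length."""
    while len(tokens_a) + len(tokens_b) > max_length:
        # Drop a token from whichever sequence is currently longer.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
```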
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index < 5:
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join(
-                    [str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
             logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
......
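The padding logic that follows the new `tokenizer.encode(...)` call is collapsed in the hunks above. Assuming it follows the usual GLUE pattern of padding every feature out to `max_seq_length`, it would look roughly like the sketch below; the names `pad_on_left`, `pad_token`, and `pad_token_segment_id` are assumptions for illustration, not taken from this diff.

```python
# Rough sketch of the collapsed padding step; variable names are assumed.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
    input_ids = [pad_token] * padding_length + input_ids
    input_mask = [0] * padding_length + input_mask            # 0 marks padding positions
    segment_ids = [pad_token_segment_id] * padding_length + segment_ids
else:
    input_ids = input_ids + [pad_token] * padding_length
    input_mask = input_mask + [0] * padding_length
    segment_ids = segment_ids + [pad_token_segment_id] * padding_length

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
```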
@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         if output_mask:
             return (
                 cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-                [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep)
+                [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)
             )
         else:
             return cls + token_ids_0 + sep + sep + token_ids_1 + sep
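To see what the corrected line above changes: RoBERTa encodes a sentence pair as `<s> A </s> </s> B </s>`, and after this fix both separators that follow sentence A are counted in the 0 segment, so only sentence B and its trailing separator fall in segment 1. A small sketch with made-up token ids:

```python
# Hypothetical ids, for illustration only.
cls, sep = [0], [2]             # stand-ins for RoBERTa's <s> and </s>
token_ids_0 = [31, 32, 33]      # "sentence A"
token_ids_1 = [41, 42]          # "sentence B"

ids = cls + token_ids_0 + sep + sep + token_ids_1 + sep
mask = [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)

print(ids)   # [0, 31, 32, 33, 2, 2, 41, 42, 2]
print(mask)  # [0, 0, 0, 0, 0, 0, 1, 1, 1]  -- both separators after A stay in segment 0
```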
@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
+        cls_segment_ids = [2]
         if output_mask:
             return (
                 token_ids_0 + sep + token_ids_1 + sep + cls,
-                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls)
+                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids
             )
         else:
             return token_ids_0 + sep + token_ids_1 + sep + cls
......
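Similarly for the XLNet change above: XLNet appends its special tokens at the end (`A <sep> B <sep> <cls>`), and the corrected second return value now gives the trailing classification token its own segment id of 2 instead of folding it into segment 1. A small sketch with made-up token ids:

```python
# Hypothetical ids, for illustration only.
sep, cls = [4], [3]             # stand-ins for XLNet's <sep> and <cls>
cls_segment_ids = [2]
token_ids_0 = [11, 12]
token_ids_1 = [21, 22, 23]

ids = token_ids_0 + sep + token_ids_1 + sep + cls
segs = [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids

print(ids)   # [11, 12, 4, 21, 22, 23, 4, 3]
print(segs)  # [0, 0, 0, 1, 1, 1, 1, 2]  -- the trailing <cls> token gets segment id 2
```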