chenpangpang / transformers · Commits

Commit bac332fe, authored Sep 02, 2019 by LysandreJik

Updated the GLUE data processor. Corrections to RoBERTa and XLNet.

Parent: c3df2136
Showing 3 changed files with 4 additions and 56 deletions:

examples/utils_glue.py                        +1 -54
pytorch_transformers/tokenization_roberta.py  +1 -1
pytorch_transformers/tokenization_xlnet.py    +2 -1
examples/utils_glue.py (view file @ bac332fe)

@@ -415,58 +415,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
-            special_tokens_count = 4 if sep_token_extra else 3
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-            special_tokens_count = 3 if sep_token_extra else 2
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = tokens_a + [sep_token]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if tokens_b:
-            tokens += tokens_b + [sep_token]
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
-
-        if cls_token_at_end:
-            tokens = tokens + [cls_token]
-            segment_ids = segment_ids + [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_ids, input_mask = tokenizer.encode(example.text_a, example.text_b, add_special_tokens=True, output_mask=True)
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
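For reference, the removed branch budgeted special-token slots before truncating the pair with `_truncate_seq_pair`. A minimal sketch of that accounting, assuming the usual pop-the-longer-sequence helper; the values below are illustrative and not taken from the file:

```python
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    # Trim the longer list one token at a time until the pair fits the budget.
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

# [CLS] A [SEP] B [SEP] needs 3 slots; RoBERTa's extra separator needs 4.
max_seq_length = 16
sep_token_extra = True                 # True for RoBERTa-style inputs
tokens_a = list("abcdefghij")          # stand-ins for wordpiece tokens
tokens_b = list("klmnopqrst")

special_tokens_count = 4 if sep_token_extra else 3
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
assert len(tokens_a) + len(tokens_b) <= max_seq_length - special_tokens_count
```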
@@ -497,8 +446,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         if ex_index < 5:
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
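With this commit, the fifty-odd removed lines in the first hunk collapse into a single `tokenizer.encode(...)` call that returns the ids with special tokens already inserted plus a parallel 0/1 sequence mask. The sketch below only illustrates that contract; `ToyPairTokenizer` is a stand-in written for this note, not the pytorch_transformers API:

```python
class ToyPairTokenizer:
    """Illustrative stand-in for a pair-aware tokenizer's encode() contract."""
    def __init__(self):
        self.vocab = {}

    def tokenize(self, text):
        return text.lower().split()

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.setdefault(t, len(self.vocab)) for t in tokens]

    def encode(self, text_a, text_b=None, add_special_tokens=True, output_mask=False):
        cls_id, sep_id = -1, -2  # placeholders for real special-token ids
        ids_a = self.convert_tokens_to_ids(self.tokenize(text_a))
        ids_b = self.convert_tokens_to_ids(self.tokenize(text_b)) if text_b else []
        if add_special_tokens:
            ids = [cls_id] + ids_a + [sep_id] + (ids_b + [sep_id] if ids_b else [])
            mask = [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1 if ids_b else 0)
        else:
            ids = ids_a + ids_b
            mask = [0] * len(ids_a) + [1] * len(ids_b)
        return (ids, mask) if output_mask else ids

tok = ToyPairTokenizer()
input_ids, input_mask = tok.encode("is this jacksonville ?", "no it is not .",
                                   add_special_tokens=True, output_mask=True)
assert len(input_ids) == len(input_mask)
```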
pytorch_transformers/tokenization_roberta.py (view file @ bac332fe)

@@ -98,7 +98,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         if output_mask:
             return (cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-                    [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep))
+                    [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep))
         else:
             return cls + token_ids_0 + sep + sep + token_ids_1 + sep
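The RoBERTa correction only moves the boundary of the 0/1 mask: the pair layout `<s> A </s></s> B </s>` is unchanged, but the doubled separator now counts toward the first segment. A quick check with placeholder id lists (only the lengths matter):

```python
cls, sep = [0], [2]          # placeholder special-token ids
token_ids_0 = [11, 12, 13]   # placeholder ids for sequence A
token_ids_1 = [21, 22]       # placeholder ids for sequence B

ids = cls + token_ids_0 + sep + sep + token_ids_1 + sep

old_mask = [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep)
new_mask = [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)

assert len(old_mask) == len(new_mask) == len(ids)
print(new_mask)  # [0, 0, 0, 0, 0, 0, 1, 1, 1]; the second </s> now falls in segment 0
```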
pytorch_transformers/tokenization_xlnet.py (view file @ bac332fe)

@@ -198,10 +198,11 @@ class XLNetTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
+        cls_segment_ids = [2]
         if output_mask:
             return (token_ids_0 + sep + token_ids_1 + sep + cls,
-                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls))
+                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids)
         else:
             return token_ids_0 + sep + token_ids_1 + sep + cls
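XLNet places `<cls>` at the end of the pair, and the fix gives that final position its own segment id (2) through the new `cls_segment_ids` list instead of folding it into segment 1. A toy check of the resulting shapes (placeholder ids):

```python
sep, cls = [4], [3]          # placeholder special-token ids
cls_segment_ids = [2]        # XLNet's dedicated segment id for <cls>
token_ids_0 = [11, 12, 13]   # placeholder ids for sequence A
token_ids_1 = [21, 22]       # placeholder ids for sequence B

ids = token_ids_0 + sep + token_ids_1 + sep + cls
segment_ids = [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids

assert len(ids) == len(segment_ids)
print(segment_ids)  # [0, 0, 0, 0, 1, 1, 1, 2]; the trailing <cls> gets segment id 2
```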