XLM tokenizer should encode with bos token (#3791)

* XLM tokenizer should encode with bos token * Update tests

XLM tokenizer should encode with bos token (#3791)
* XLM tokenizer should encode with bos token * Update tests
8b63a01d · Lysandre Debut · GitHub · 1d4a35b3 · 8b63a01d · 8b63a01d
Unverified Commit 8b63a01d authored Apr 17, 2020 by Lysandre Debut Committed by GitHub Apr 17, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 6 deletions

src/transformers/tokenization_xlm.py src/transformers/tokenization_xlm.py +5 -4

tests/test_tokenization_xlm.py tests/test_tokenization_xlm.py +2 -2

No files found.
--- a/src/transformers/tokenization_xlm.py
+++ b/src/transformers/tokenization_xlm.py
@@ -873,11 +873,12 @@ class XLMTokenizer(PreTrainedTokenizer):
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.

        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        bos = [self.bos_token_id]
        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+        if token_ids_1 is None:
+            return bos + token_ids_0 + sep
+        return bos + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False

--- a/tests/test_tokenization_xlm.py
+++ b/tests/test_tokenization_xlm.py
@@ -96,5 +96,5 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

-        assert encoded_sentence == [1] + text + [1]
-        assert encoded_pair == [1] + text + [1] + text_2 + [1]
+        assert encoded_sentence == [0] + text + [1]
+        assert encoded_pair == [0] + text + [1] + text_2 + [1]