Overwrite get_clean_sequence as this was causing a bottleneck (#13183)

588e6caa · NielsRogge · GitHub · 14373821 · 588e6caa
Unverified commit 588e6caa, authored Aug 23, 2021 by NielsRogge; committed by GitHub on Aug 23, 2021.
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 0 deletions

tests/test_tokenization_luke.py +6 -0

No files found.
--- a/tests/test_tokenization_luke.py
+++ b/tests/test_tokenization_luke.py
@@ -15,6 +15,7 @@


 import unittest
+from typing import Tuple

 from transformers import AddedToken, LukeTokenizer
 from transformers.testing_utils import require_torch, slow
@@ -81,6 +82,11 @@ class Luke(TokenizerTesterMixin, unittest.TestCase):
        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode

+    def get_clean_sequence(self, tokenizer, max_length=20) -> Tuple[str, list]:
+        txt = "Beyonce lives in Los Angeles"  # fixed sample text; max_length is not used in this body
+        ids = tokenizer.encode(txt, add_special_tokens=False)  # ids for txt, without special tokens
+        return txt, ids  # the raw text paired with its encoded ids
+
    def test_space_encoding(self):
        tokenizer = self.get_tokenizer()