chenpangpang/transformers · Commits
"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e9c23fa056f401a586a1691edf773d1b9b60f96d"
Commit 99b9affa (unverified)
Authored Jan 29, 2021 by Ethan Chau; committed by GitHub on Jan 29, 2021
Clarify use of unk_token in tokenizer docstrings (#9875)
Parent: c2d0ffec
Showing 2 changed files with 1 addition and 11 deletions (+1 -11):

  src/transformers/tokenization_utils.py       +0 -3
  src/transformers/tokenization_utils_base.py  +1 -8
src/transformers/tokenization_utils.py

@@ -230,9 +230,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         """
         Converts a string in a sequence of tokens, using the tokenizer.
 
-        Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method won't replace the unknown
-        tokens with the `unk_token` yet (this is done in the `encode()` method)
-
         Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
         (BPE/SentencePieces/WordPieces). Takes care of added tokens.
...
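The note removed above claimed that slow tokenizers defer unk_token substitution to encode(); the commit clarifies that tokenize() itself already performs it. A minimal sketch of that behavior, assuming the bert-base-uncased checkpoint (not part of this commit; the exact split depends on the checkpoint's vocabulary):

from transformers import BertTokenizer

# Sketch only: "bert-base-uncased" is an example checkpoint; any slow
# WordPiece-based tokenizer should behave the same way.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# The emoji is not in the WordPiece vocabulary, so the slow tokenizer
# substitutes unk_token during tokenize() itself, not later in encode().
print(tokenizer.unk_token)             # '[UNK]'
print(tokenizer.tokenize("hello 😃"))  # e.g. ['hello', '[UNK]']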
src/transformers/tokenization_utils_base.py

@@ -2043,14 +2043,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, using the backend Rust tokenizer.
-
-        Note that this method behave differently between fast and slow tokenizers:
-
-            - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will
-              replace the unknown tokens with the :obj:`unk_token`,
-            - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keep unknown
-              tokens unchanged.
+        Converts a string in a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`.
 
         Args:
             text (:obj:`str`):
...
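For the signature in the hunk above, a hypothetical usage sketch; the checkpoint name is an assumption, and any model that ships a fast tokenizer should behave similarly:

from transformers import AutoTokenizer

# Sketch only: load a fast tokenizer so that pair= and add_special_tokens=
# both take effect in tokenize().
tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# pair= tokenizes a second sequence jointly with the first, and
# add_special_tokens=True also inserts markers such as [CLS] and [SEP].
print(tok.tokenize("How are you?", pair="Fine, thanks.", add_special_tokens=True))
# e.g. ['[CLS]', 'how', 'are', 'you', '?', '[SEP]', 'fine', ',', 'thanks', '.', '[SEP]']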