Unverified Commit c52b515e authored by Mike Salvatore, committed by GitHub

Fix a typo in tokenizer documentation (#28118)

parent a52e180a
@@ -185,7 +185,7 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     def _tokenize(self, lyrics):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
         Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
@@ -281,7 +281,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer.
+        Converts a string into a sequence of tokens (string), using the tokenizer.
         """
         if self.do_lower_case:
             text = text.upper()
@@ -247,7 +247,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer.
+        Converts a string into a sequence of tokens (string), using the tokenizer.
         """
         # make sure whitespace is stripped to prevent <unk>
@@ -540,7 +540,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, using the tokenizer.
+        Converts a string into a sequence of tokens, using the tokenizer.
         Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
         (BPE/SentencePieces/WordPieces). Takes care of added tokens.
@@ -620,7 +620,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
         Do NOT take care of added tokens.
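The two hunks above document two closely related APIs: the public `tokenize`, which takes care of added tokens, and the internal `_tokenize`, which does not. A minimal sketch of that contrast, assuming a standard `bert-base-uncased` checkpoint (the added token name below is made up for illustration):

from transformers import AutoTokenizer

# use_fast=False selects the pure-Python PreTrainedTokenizer path edited in this commit
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
tokenizer.add_tokens(["<night-city>"])  # hypothetical added token

# tokenize() takes care of added tokens: the new token survives intact
print(tokenizer.tokenize("welcome to <night-city> !"))
# expected output (may vary by version): ['welcome', 'to', '<night-city>', '!']

# _tokenize() does NOT handle added tokens: the raw string is split by the
# underlying WordPiece vocabulary alone
print(tokenizer._tokenize("welcome to <night-city> !"))
# expected output (may vary by version): ['welcome', 'to', '<', 'night', '-', 'city', '>', '!']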
@@ -2515,7 +2515,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
+        Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.
         Args:
             text (`str`):
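The `unk_token` replacement documented in this last hunk can be checked the same way, under the same `bert-base-uncased` assumption:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

# A character missing from the WordPiece vocabulary is replaced with unk_token
print(tokenizer.tokenize("hello \u2603"))  # U+2603 (snowman) is not in BERT's vocab
# expected output: ['hello', '[UNK]']
print(tokenizer.unk_token)  # '[UNK]'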