Unverified Commit c52b515e authored by Mike Salvatore, committed by GitHub

Fix a typo in tokenizer documentation (#28118)

parent a52e180a
@@ -185,7 +185,7 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     def _tokenize(self, lyrics):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
         Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
@@ -281,7 +281,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer.
+        Converts a string into a sequence of tokens (string), using the tokenizer.
         """
         if self.do_lower_case:
             text = text.upper()
@@ -247,7 +247,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer.
+        Converts a string into a sequence of tokens (string), using the tokenizer.
         """
         # make sure whitespace is stripped to prevent <unk>
@@ -540,7 +540,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, using the tokenizer.
+        Converts a string into a sequence of tokens, using the tokenizer.
         Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
         (BPE/SentencePieces/WordPieces). Takes care of added tokens.
@@ -620,7 +620,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
         Do NOT take care of added tokens.
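The two hunks above document two closely related APIs: the public `tokenize`, which takes care of added tokens, and the internal `_tokenize`, which does not. A minimal sketch of that contrast, assuming a standard `bert-base-uncased` checkpoint (the added token name below is made up for illustration):

from transformers import AutoTokenizer

# use_fast=False selects the pure-Python PreTrainedTokenizer path edited in this commit
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
tokenizer.add_tokens(["<night-city>"])  # hypothetical added token

# tokenize() takes care of added tokens: the new token survives intact
print(tokenizer.tokenize("welcome to <night-city> !"))
# expected output (may vary by version): ['welcome', 'to', '<night-city>', '!']

# _tokenize() does NOT handle added tokens: the raw string is split by the
# underlying WordPiece vocabulary alone
print(tokenizer._tokenize("welcome to <night-city> !"))
# expected output (may vary by version): ['welcome', 'to', '<', 'night', '-', 'city', '>', '!']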
@@ -2515,7 +2515,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
+        Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.
         Args:
             text (`str`):
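The `unk_token` replacement documented in this last hunk can be checked the same way, under the same `bert-base-uncased` assumption:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

# A character missing from the WordPiece vocabulary is replaced with unk_token
print(tokenizer.tokenize("hello \u2603"))  # U+2603 (snowman) is not in BERT's vocab
# expected output: ['hello', '[UNK]']
print(tokenizer.unk_token)  # '[UNK]'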