chenpangpang / transformers

Commit c52b515e (unverified)
Fix a typo in tokenizer documentation (#28118)

Authored Dec 18, 2023 by Mike Salvatore; committed via GitHub on Dec 18, 2023
Parent: a52e180a

Showing 5 changed files with 6 additions and 6 deletions.
- src/transformers/models/jukebox/tokenization_jukebox.py (+1 / -1)
- src/transformers/models/wav2vec2/tokenization_wav2vec2.py (+1 / -1)
- src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py (+1 / -1)
- src/transformers/tokenization_utils.py (+2 / -2)
- src/transformers/tokenization_utils_base.py (+1 / -1)
src/transformers/models/jukebox/tokenization_jukebox.py

```diff
@@ -185,7 +185,7 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     def _tokenize(self, lyrics):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

         Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
```
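For context on the docstring being fixed: `JukeboxTokenizer`'s lyrics vocabulary is character-based, so `_tokenize` splits the string into single characters. A minimal sketch of that behavior, not part of this commit, assuming the `openai/jukebox-1b-lyrics` checkpoint can be downloaded:

```python
# Minimal sketch, not part of this commit; assumes the
# "openai/jukebox-1b-lyrics" checkpoint is available.
from transformers import JukeboxTokenizer

tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")

# The lyrics vocabulary is character-based, so the string is split
# into single characters rather than words or sub-words.
print(tokenizer._tokenize("Sing along"))
# e.g. ['S', 'i', 'n', 'g', ' ', 'a', 'l', 'o', 'n', 'g']
```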
src/transformers/models/wav2vec2/tokenization_wav2vec2.py

```diff
@@ -281,7 +281,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer.
+        Converts a string into a sequence of tokens (string), using the tokenizer.
         """
         if self.do_lower_case:
             text = text.upper()
```
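The context lines above show a quirk worth noting: Wav2Vec2 checkpoints ship upper-case vocabularies, so when `do_lower_case` is set the tokenizer normalizes case by upper-casing the input to match the vocabulary. A minimal sketch, not part of this commit, assuming the `facebook/wav2vec2-base-960h` checkpoint:

```python
# Minimal sketch, not part of this commit; assumes the
# "facebook/wav2vec2-base-960h" checkpoint (upper-case vocabulary).
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

# Individual characters are the tokens; spaces become the
# word-delimiter token "|".
print(tokenizer.tokenize("HELLO WORLD"))
# e.g. ['H', 'E', 'L', 'L', 'O', '|', 'W', 'O', 'R', 'L', 'D']
```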
src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py

```diff
@@ -247,7 +247,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer.
+        Converts a string into a sequence of tokens (string), using the tokenizer.
         """
         # make sure whitespace is stripped to prevent <unk>
```
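The same docstring fix applies here. For illustration, a sketch of what this tokenizer produces, not part of this commit; it assumes the `facebook/wav2vec2-lv-60-espeak-cv-ft` checkpoint plus an installed `phonemizer` backend, and the output shown is indicative (actual phonemes depend on the backend):

```python
# Minimal sketch, not part of this commit; assumes the
# "facebook/wav2vec2-lv-60-espeak-cv-ft" checkpoint and an installed
# `phonemizer` backend (with espeak), which this tokenizer requires.
from transformers import Wav2Vec2PhonemeCTCTokenizer

tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained(
    "facebook/wav2vec2-lv-60-espeak-cv-ft"
)

# Whitespace is stripped (per the comment in the diff) before the text
# is converted to phoneme tokens.
print(tokenizer.tokenize("hello"))
# e.g. ['h', 'ə', 'l', 'oʊ']
```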
src/transformers/tokenization_utils.py

```diff
@@ -540,7 +540,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, using the tokenizer.
+        Converts a string into a sequence of tokens, using the tokenizer.

         Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
         (BPE/SentencePieces/WordPieces). Takes care of added tokens.
@@ -620,7 +620,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     def _tokenize(self, text, **kwargs):
         """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

         Do NOT take care of added tokens.
```
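The two hunks document the public/private split: `tokenize` takes care of added tokens, while `_tokenize` does not. A minimal sketch of that contract, not part of this commit, assuming `bert-base-uncased`:

```python
# Minimal sketch, not part of this commit; assumes "bert-base-uncased".
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["<custom>"])

# tokenize() takes care of added tokens: "<custom>" survives intact,
# while ordinary words are split into sub-words by the underlying vocab.
print(tokenizer.tokenize("hello <custom> tokenization"))
# e.g. ['hello', '<custom>', 'token', '##ization']
```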
src/transformers/tokenization_utils_base.py

```diff
@@ -2515,7 +2515,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
+        Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.

         Args:
             text (`str`):
```
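A minimal sketch of the base-class behavior this docstring describes (special tokens are opt-in, unknown tokens map to the `unk_token`), not part of this commit, assuming `bert-base-uncased`:

```python
# Minimal sketch, not part of this commit; assumes "bert-base-uncased".
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# With add_special_tokens=True, the model's special tokens are included.
print(tokenizer.tokenize("Hello world", add_special_tokens=True))
# e.g. ['[CLS]', 'hello', 'world', '[SEP]']

# Unknown tokens are replaced with the unk_token, as the docstring says.
print(tokenizer.tokenize("☃"))
# e.g. ['[UNK]']
```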