chenpangpang/transformers, commit 7898fc03 (unverified)
Authored Feb 04, 2021 by Sylvain Gugger, committed by GitHub on Feb 04, 2021

Add `from_slow` in fast tokenizers build and fixes some bugs (#9987)
Parent: 6244727e

Showing 16 changed files with 50 additions and 38 deletions (+50 -38).
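The user-facing effect of the change: `from_pretrained` now accepts a `from_slow` flag that forces a fast tokenizer to be rebuilt by converting its slow (SentencePiece/Python) counterpart instead of loading a serialized tokenizer.json. A minimal usage sketch, not part of the commit itself (the checkpoint name is only an example; `use_fast=True` is the default for `AutoTokenizer`):

    from transformers import AutoTokenizer

    # Default path: if the checkpoint ships a tokenizer.json, the fast backend
    # is built directly from that serialized file.
    tok = AutoTokenizer.from_pretrained("albert-base-v2")

    # With this commit, from_slow=True forces the fast tokenizer to be rebuilt
    # by converting the slow tokenizer, even when a tokenizer.json is available.
    tok_rebuilt = AutoTokenizer.from_pretrained("albert-base-v2", from_slow=True)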
Changed files:

src/transformers/models/albert/tokenization_albert.py  +3 -2
src/transformers/models/albert/tokenization_albert_fast.py  +6 -5
src/transformers/models/auto/tokenization_auto.py  +1 -0
src/transformers/models/bart/tokenization_bart.py  +3 -5
src/transformers/models/bart/tokenization_bart_fast.py  +7 -0
src/transformers/models/barthez/tokenization_barthez.py  +3 -2
src/transformers/models/barthez/tokenization_barthez_fast.py  +0 -3
src/transformers/models/blenderbot/tokenization_blenderbot.py  +1 -1
src/transformers/models/camembert/tokenization_camembert.py  +3 -2
src/transformers/models/camembert/tokenization_camembert_fast.py  +0 -4
src/transformers/models/t5/tokenization_t5.py  +4 -0
src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py  +3 -2
src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py  +3 -2
src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py  +0 -3
src/transformers/tokenization_utils_base.py  +5 -6
src/transformers/tokenization_utils_fast.py  +8 -1
src/transformers/models/albert/tokenization_albert.py

@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
 
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/albert/tokenization_albert_fast.py

@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
             Whether or not to keep accents when tokenizing.
         bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-            .. note:: When building a sequence using special tokens, this is not the token that is used for the
-            beginning of sequence. The token used is the :obj:`cls_token`.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning of
+                sequence. The token used is the :obj:`cls_token`.
         eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
             The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
             that is used for the end of sequence. The token used is the :obj:`sep_token`.

@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
         mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict. Attributes:
-            sp_model (:obj:`SentencePieceProcessor`):
-                The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+            modeling. This is the token which the model will try to predict.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/auto/tokenization_auto.py

@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
     HerbertTokenizerFast,
     PhobertTokenizer,
     BarthezTokenizer,
+    BarthezTokenizerFast,
 ]
src/transformers/models/bart/tokenization_bart.py

@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
     r"""
     Construct a BART tokenizer.
 
-    :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
-    :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
-
-    Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
-    initialization parameters and other methods.
+    :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
+    :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
+    parameters and other methods.
     """
     # merges and vocab same as Roberta
     max_model_input_sizes = {m: 1024 for m in _all_bart_models}
src/transformers/models/bart/tokenization_bart_fast.py

@@ -37,6 +37,13 @@ _all_bart_models = [
 class BartTokenizerFast(RobertaTokenizerFast):
+    r"""
+    Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
+    superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
+    initialization parameters and other methods.
+    """
     # merges and vocab same as Roberta
     max_model_input_sizes = {m: 1024 for m in _all_bart_models}
     pretrained_vocab_files_map = {
src/transformers/models/barthez/tokenization_barthez.py

@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
 
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/barthez/tokenization_barthez_fast.py

@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
             modeling. This is the token which the model will try to predict.
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/blenderbot/tokenization_blenderbot.py

@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
     Construct a Blenderbot tokenizer.
 
     :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
-    end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
+    end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
     to the beginning of sequences.
 
     Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
src/transformers/models/camembert/tokenization_camembert.py

@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
 
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/camembert/tokenization_camembert_fast.py

@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
             modeling. This is the token which the model will try to predict.
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-
-    Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/t5/tokenization_t5.py

@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
             <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
         additional_special_tokens (:obj:`List[str]`, `optional`):
             Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py

@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
 
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py

@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
 
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py

@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
             modeling. This is the token which the model will try to predict.
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-
-    Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
-    conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
src/transformers/tokenization_utils_base.py

@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     def _from_pretrained(
         cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
     ):
-        # We instantiate fast tokenizers based on a slow tokenizer for now
-        # In the future we can also use a direct way based on saving/instantiating
-        # tokenizer's Tokenizer directly from it's serialization JSON
-        if (
-            "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
-        ) and cls.slow_tokenizer_class is not None:
+        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
+        # file or if `from_slow` is set to True.
+        from_slow = kwargs.get("from_slow", False)
+        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
+        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
             slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                 copy.deepcopy(resolved_vocab_files),
                 pretrained_model_name_or_path,
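The condition above can be read as a small predicate over three facts: whether the caller asked for `from_slow`, whether a serialized tokenizer.json was resolved, and whether a slow tokenizer class exists to convert from. A self-contained sketch of that decision (function name and structure are illustrative, not part of the commit):

    def should_build_from_slow(from_slow: bool, has_tokenizer_file: bool, has_slow_class: bool) -> bool:
        # Rebuild from the slow tokenizer when explicitly requested, or when no
        # serialized tokenizer.json is available, provided a slow class exists.
        return (from_slow or not has_tokenizer_file) and has_slow_class

    assert should_build_from_slow(True, True, True)        # from_slow forces conversion
    assert should_build_from_slow(False, False, True)      # no tokenizer.json, so convert
    assert not should_build_from_slow(False, True, True)   # tokenizer.json is reused
    assert not should_build_from_slow(True, True, False)   # nothing to convert from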
src/transformers/tokenization_utils_fast.py

@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     def __init__(self, *args, **kwargs):
         slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
         fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
+        from_slow = kwargs.pop("from_slow", False)
+
+        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+            raise ValueError(
+                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+                "have sentencepiece installed."
+            )
 
-        if fast_tokenizer_file is not None:
+        if fast_tokenizer_file is not None and not from_slow:
             # We have a serialization from tokenizers which let us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
         elif slow_tokenizer is not None:
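Taken together, the two new branches give `__init__` an explicit precedence: fail fast when a slow conversion is requested but no slow tokenizer (or slow tokenizer class) is available, load the serialized tokenizer.json only when `from_slow` is not set, and otherwise fall back to converting a slow tokenizer. A condensed, self-contained sketch of that precedence (names and return values are illustrative, not part of the commit):

    def resolve_backend(fast_tokenizer_file, slow_tokenizer, slow_tokenizer_class, from_slow=False):
        # Mirrors the branch order in PreTrainedTokenizerFast.__init__ after this commit.
        if from_slow and slow_tokenizer is None and slow_tokenizer_class is None:
            raise ValueError("Cannot instantiate this tokenizer from a slow version.")
        if fast_tokenizer_file is not None and not from_slow:
            return "loaded from tokenizer.json"      # TokenizerFast.from_file(...)
        if slow_tokenizer is not None:
            return "converted from slow tokenizer"
        return "other init paths"

    print(resolve_backend("tokenizer.json", None, None))                          # loaded from tokenizer.json
    print(resolve_backend("tokenizer.json", "slow tok", object, from_slow=True))  # converted from slow tokenizer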