Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
30ede899
"vscode:/vscode.git/clone" did not exist on "232c898f9fd1eec956e7d3f9fbc78999992c5b4a"
Unverified
Commit
30ede899
authored
Apr 30, 2021
by
Shubham Sanghavi
Committed by
GitHub
Apr 30, 2021
Browse files
Implement Fast Tokenization for Deberta (#11387)
parent
db9dd09c
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
271 additions
and
5 deletions
+271
-5
docs/source/index.rst
docs/source/index.rst
+1
-1
docs/source/model_doc/deberta.rst
docs/source/model_doc/deberta.rst
+6
-0
src/transformers/__init__.py
src/transformers/__init__.py
+2
-0
src/transformers/convert_slow_tokenizer.py
src/transformers/convert_slow_tokenizer.py
+32
-0
src/transformers/models/auto/tokenization_auto.py
src/transformers/models/auto/tokenization_auto.py
+4
-1
src/transformers/models/deberta/__init__.py
src/transformers/models/deberta/__init__.py
+7
-1
src/transformers/models/deberta/tokenization_deberta_fast.py
src/transformers/models/deberta/tokenization_deberta_fast.py
+207
-0
src/transformers/utils/dummy_tokenizers_objects.py
src/transformers/utils/dummy_tokenizers_objects.py
+9
-0
tests/test_tokenization_deberta.py
tests/test_tokenization_deberta.py
+3
-2
No files found.
docs/source/index.rst
View file @
30ede899
...
...
@@ -284,7 +284,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DeBERTa | ✅ |
❌
| ✅ | ❌ | ❌ |
| DeBERTa | ✅ |
✅
| ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DeBERTa-v2 | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
...
...
docs/source/model_doc/deberta.rst
View file @
30ede899
...
...
@@ -56,6 +56,12 @@ DebertaTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
DebertaTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaTokenizerFast
:members: build_inputs_with_special_tokens, create_token_type_ids_from_sequences
DebertaModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
...
...
src/transformers/__init__.py
View file @
30ede899
...
...
@@ -315,6 +315,7 @@ if is_tokenizers_available():
_import_structure
[
"models.barthez"
].
append
(
"BarthezTokenizerFast"
)
_import_structure
[
"models.bert"
].
append
(
"BertTokenizerFast"
)
_import_structure
[
"models.camembert"
].
append
(
"CamembertTokenizerFast"
)
_import_structure
[
"models.deberta"
].
append
(
"DebertaTokenizerFast"
)
_import_structure
[
"models.distilbert"
].
append
(
"DistilBertTokenizerFast"
)
_import_structure
[
"models.dpr"
].
extend
(
[
"DPRContextEncoderTokenizerFast"
,
"DPRQuestionEncoderTokenizerFast"
,
"DPRReaderTokenizerFast"
]
...
...
@@ -1661,6 +1662,7 @@ if TYPE_CHECKING:
from
.models.bert
import
BertTokenizerFast
from
.models.camembert
import
CamembertTokenizerFast
from
.models.convbert
import
ConvBertTokenizerFast
from
.models.deberta
import
DebertaTokenizerFast
from
.models.distilbert
import
DistilBertTokenizerFast
from
.models.dpr
import
DPRContextEncoderTokenizerFast
,
DPRQuestionEncoderTokenizerFast
,
DPRReaderTokenizerFast
from
.models.electra
import
ElectraTokenizerFast
...
...
src/transformers/convert_slow_tokenizer.py
View file @
30ede899
...
...
@@ -296,6 +296,37 @@ class RobertaConverter(Converter):
return
tokenizer
class DebertaConverter(Converter):
    """Rebuild a slow DeBERTa tokenizer as a ``tokenizers`` pipeline."""

    def converted(self) -> Tokenizer:
        """
        Assemble the fast tokenizer from the wrapped slow tokenizer.

        The pipeline is a BPE model (vocabulary from ``encoder``, merges from the keys of ``bpe_ranks``), a
        byte-level pre-tokenizer and decoder, and a template post-processor that wraps sequences as
        ``[CLS] A [SEP]`` / ``[CLS] A [SEP] B [SEP]`` with every position assigned segment id 0.

        Returns:
            :obj:`Tokenizer`: The assembled tokenizer.
        """
        slow = self.original_tokenizer

        bpe_model = BPE(
            vocab=slow.encoder,
            merges=list(slow.bpe_ranks.keys()),
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        )
        fast = Tokenizer(bpe_model)

        # The prefix-space setting is taken over from the slow tokenizer.
        fast.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=slow.add_prefix_space)
        fast.decoder = decoders.ByteLevel()

        # The template processor needs the id of every special token named in its templates.
        special_tokens = [(token, slow.convert_tokens_to_ids(token)) for token in ("[CLS]", "[SEP]")]
        fast.post_processor = processors.TemplateProcessing(
            single="[CLS]:0 $A:0 [SEP]:0",
            pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
            special_tokens=special_tokens,
        )

        return fast
class
SpmConverter
(
Converter
):
def
__init__
(
self
,
*
args
):
requires_backends
(
self
,
"protobuf"
)
...
...
@@ -654,6 +685,7 @@ SLOW_TO_FAST_CONVERTERS = {
"BertTokenizer"
:
BertConverter
,
"CamembertTokenizer"
:
CamembertConverter
,
"ConvBertTokenizer"
:
BertConverter
,
"DebertaTokenizer"
:
DebertaConverter
,
"DistilBertTokenizer"
:
BertConverter
,
"DPRReaderTokenizer"
:
BertConverter
,
"DPRQuestionEncoderTokenizer"
:
BertConverter
,
...
...
src/transformers/models/auto/tokenization_auto.py
View file @
30ede899
...
...
@@ -157,6 +157,7 @@ if is_tokenizers_available():
from
..bert.tokenization_bert_fast
import
BertTokenizerFast
from
..camembert.tokenization_camembert_fast
import
CamembertTokenizerFast
from
..convbert.tokenization_convbert_fast
import
ConvBertTokenizerFast
from
..deberta.tokenization_deberta_fast
import
DebertaTokenizerFast
from
..distilbert.tokenization_distilbert_fast
import
DistilBertTokenizerFast
from
..dpr.tokenization_dpr_fast
import
DPRQuestionEncoderTokenizerFast
from
..electra.tokenization_electra_fast
import
ElectraTokenizerFast
...
...
@@ -181,6 +182,7 @@ if is_tokenizers_available():
from
..t5.tokenization_t5_fast
import
T5TokenizerFast
from
..xlm_roberta.tokenization_xlm_roberta_fast
import
XLMRobertaTokenizerFast
from
..xlnet.tokenization_xlnet_fast
import
XLNetTokenizerFast
else
:
AlbertTokenizerFast
=
None
BartTokenizerFast
=
None
...
...
@@ -188,6 +190,7 @@ else:
BertTokenizerFast
=
None
CamembertTokenizerFast
=
None
ConvBertTokenizerFast
=
None
DebertaTokenizerFast
=
None
DistilBertTokenizerFast
=
None
DPRQuestionEncoderTokenizerFast
=
None
ElectraTokenizerFast
=
None
...
...
@@ -253,7 +256,7 @@ TOKENIZER_MAPPING = OrderedDict(
(
CTRLConfig
,
(
CTRLTokenizer
,
None
)),
(
FSMTConfig
,
(
FSMTTokenizer
,
None
)),
(
BertGenerationConfig
,
(
BertGenerationTokenizer
,
None
)),
(
DebertaConfig
,
(
DebertaTokenizer
,
None
)),
(
DebertaConfig
,
(
DebertaTokenizer
,
DebertaTokenizerFast
)),
(
DebertaV2Config
,
(
DebertaV2Tokenizer
,
None
)),
(
RagConfig
,
(
RagTokenizer
,
None
)),
(
XLMProphetNetConfig
,
(
XLMProphetNetTokenizer
,
None
)),
...
...
src/transformers/models/deberta/__init__.py
View file @
30ede899
...
...
@@ -18,7 +18,7 @@
from
typing
import
TYPE_CHECKING
from
...file_utils
import
_BaseLazyModule
,
is_torch_available
from
...file_utils
import
_BaseLazyModule
,
is_tokenizers_available
,
is_torch_available
_import_structure
=
{
...
...
@@ -26,6 +26,9 @@ _import_structure = {
"tokenization_deberta"
:
[
"DebertaTokenizer"
],
}
if
is_tokenizers_available
():
_import_structure
[
"tokenization_deberta_fast"
]
=
[
"DebertaTokenizerFast"
]
if
is_torch_available
():
_import_structure
[
"modeling_deberta"
]
=
[
"DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST"
,
...
...
@@ -42,6 +45,9 @@ if TYPE_CHECKING:
from
.configuration_deberta
import
DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
,
DebertaConfig
from
.tokenization_deberta
import
DebertaTokenizer
if
is_tokenizers_available
():
from
.tokenization_deberta_fast
import
DebertaTokenizerFast
if
is_torch_available
():
from
.modeling_deberta
import
(
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
,
...
...
src/transformers/models/deberta/tokenization_deberta_fast.py
0 → 100644
View file @
30ede899
# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast tokenization class for the DeBERTa model."""
from
typing
import
List
,
Optional
from
...tokenization_utils_base
import
AddedToken
from
...utils
import
logging
from
..gpt2.tokenization_gpt2_fast
import
GPT2TokenizerFast
from
.tokenization_deberta
import
DebertaTokenizer
logger
=
logging
.
get_logger
(
__name__
)
# File name under which each tokenizer resource is stored.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_file": "tokenizer.json",
}

# Checkpoint identifiers shared by the lookup tables below (order is significant:
# it determines the key order of every generated mapping).
_DEBERTA_CHECKPOINTS = (
    "microsoft/deberta-base",
    "microsoft/deberta-large",
    "microsoft/deberta-xlarge",
    "microsoft/deberta-base-mnli",
    "microsoft/deberta-large-mnli",
    "microsoft/deberta-xlarge-mnli",
)

# resource name -> checkpoint -> download URL. Only the vocabulary and merges files are listed.
PRETRAINED_VOCAB_FILES_MAP = {
    resource: {
        checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/{VOCAB_FILES_NAMES[resource]}"
        for checkpoint in _DEBERTA_CHECKPOINTS
    }
    for resource in ("vocab_file", "merges_file")
}

# checkpoint -> 512 for every listed checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(_DEBERTA_CHECKPOINTS, 512)

# checkpoint -> init kwargs; only the first two checkpoints (base and large) have an entry.
PRETRAINED_INIT_CONFIGURATION = {checkpoint: {"do_lower_case": False} for checkpoint in _DEBERTA_CHECKPOINTS[:2]}
class DebertaTokenizerFast(GPT2TokenizerFast):
    """
    Constructs a "fast" DeBERTa tokenizer, backed by HuggingFace's `tokenizers` library. It subclasses
    :class:`GPT2TokenizerFast`, forwarding the vocabulary and merges files to it, and defaults to the
    ``[CLS]`` / ``[SEP]`` / ``[UNK]`` / ``[PAD]`` / ``[MASK]`` special tokens.

    Args:
        vocab_file (:obj:`str`):
            File containing the vocabulary.
        merges_file (:obj:`str`):
            File containing the merges.
        tokenizer_file (:obj:`str`, `optional`):
            Forwarded unchanged to :class:`GPT2TokenizerFast`.
        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
            Forwarded unchanged to :class:`GPT2TokenizerFast`.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The beginning of sequence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The end of sequence token.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Forwarded unchanged to :class:`GPT2TokenizerFast`.
        **kwargs:
            Additional keyword arguments, forwarded unchanged to :class:`GPT2TokenizerFast`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
    slow_tokenizer_class = DebertaTokenizer

    def __init__(
        self,
        vocab_file,
        merges_file,
        tokenizer_file=None,
        errors="replace",
        bos_token="[CLS]",
        eos_token="[SEP]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        unk_token="[UNK]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        add_prefix_space=False,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    @property
    def mask_token(self) -> Optional[str]:
        """
        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Returns :obj:`None` if the
        token has not been set, logging an error first when :obj:`verbose` is enabled.

        Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
        comprise the space before the `[MASK]`.
        """
        if self._mask_token is None:
            # Verbosity only controls the log message. An unset token must yield None either way,
            # otherwise str(None) would leak the literal string "None" to callers.
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @mask_token.setter
    def mask_token(self, value):
        """
        Overriding the default behavior of the mask token to have it eat the space before it.

        Args:
            value: The new mask token. A plain :obj:`str` is wrapped in an :obj:`AddedToken` with ``lstrip=True``;
                any other value is stored as given.
        """
        # Mask token behave like a normal word, i.e. include the space before it
        # So we set lstrip to True
        if isinstance(value, str):
            value = AddedToken(value, lstrip=True, rstrip=False)
        self._mask_token = value

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
            | first sequence    | second sequence |

        Every position, including those of the second sequence, is assigned 0. If :obj:`token_ids_1` is
        :obj:`None`, this method only returns the first portion of the mask.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        # Single sequence: [CLS] + A + [SEP] -> two special tokens.
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)
        # Pair: [CLS] + A + [SEP] + B + [SEP] -> three special tokens.
        return [0] * (len(token_ids_0) + len(token_ids_1) + 3)
src/transformers/utils/dummy_tokenizers_objects.py
View file @
30ede899
...
...
@@ -56,6 +56,15 @@ class ConvBertTokenizerFast:
requires_backends
(
self
,
[
"tokenizers"
])
class DebertaTokenizerFast:
    """
    Stand-in for the fast DeBERTa tokenizer. Both entry points only delegate to ``requires_backends`` with the
    ``"tokenizers"`` backend (presumably raising when that backend is unavailable -- confirm against its
    definition).
    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; only perform the backend check."""
        requires_backends(self, ["tokenizers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Accept and ignore any arguments; only perform the backend check on the class."""
        requires_backends(cls, ["tokenizers"])
class
DistilBertTokenizerFast
:
def
__init__
(
self
,
*
args
,
**
kwargs
):
requires_backends
(
self
,
[
"tokenizers"
])
...
...
tests/test_tokenization_deberta.py
View file @
30ede899
...
...
@@ -18,7 +18,7 @@ import json
import
os
import
unittest
from
transformers
import
DebertaTokenizer
from
transformers
import
DebertaTokenizer
,
DebertaTokenizerFast
from
transformers.models.deberta.tokenization_deberta
import
VOCAB_FILES_NAMES
from
transformers.testing_utils
import
slow
...
...
@@ -28,7 +28,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class
DebertaTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
DebertaTokenizer
test_rust_tokenizer
=
False
test_rust_tokenizer
=
True
rust_tokenizer_class
=
DebertaTokenizerFast
def
setUp
(
self
):
super
().
setUp
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment