chenpangpang / transformers · Commit 1e82cd84

Authored Jan 31, 2020 by Lysandre
Parent: d18d47be

Flaubert auto tokenizer + tests

cc @julien-c
Showing 4 changed files with 53 additions and 2 deletions (+53 −2):

src/transformers/configuration_flaubert.py   (+2 −2)
src/transformers/tokenization_auto.py        (+3 −0)
tests/test_modeling_auto.py                  (+31 −0)
tests/test_tokenization_auto.py              (+17 −0)
src/transformers/configuration_flaubert.py

@@ -50,8 +50,8 @@ class FlaubertConfig(XLMConfig):
             Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
             with Structured Dropout. ICLR 2020)
         vocab_size (:obj:`int`, optional, defaults to 30145):
-            Vocabulary size of the XLM model. Defines the different tokens that
-            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
+            Vocabulary size of the Flaubert model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
         emb_dim (:obj:`int`, optional, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
         n_layer (:obj:`int`, optional, defaults to 12):
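The docstring fix above only swaps "XLM" for "Flaubert"; the documented hyperparameters are unchanged. As a minimal sketch of what those documented defaults imply (not part of this commit), the config can be instantiated directly:

from transformers.configuration_flaubert import FlaubertConfig

# Per the docstring above, vocab_size defaults to 30145 and emb_dim to 2048.
config = FlaubertConfig()
print(config.vocab_size)   # 30145
print(config.emb_dim)      # 2048

# FlaubertConfig subclasses XLMConfig (see the hunk header), so XLMConfig keywords
# can be overridden as well, e.g. a smaller embedding dimension:
small_config = FlaubertConfig(emb_dim=512)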
src/transformers/tokenization_auto.py

@@ -25,6 +25,7 @@ from .configuration_auto import (
     CamembertConfig,
     CTRLConfig,
     DistilBertConfig,
+    FlaubertConfig,
     GPT2Config,
     OpenAIGPTConfig,
     RobertaConfig,
@@ -41,6 +42,7 @@ from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_ctrl import CTRLTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_flaubert import FlaubertTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_roberta import RobertaTokenizer
@@ -67,6 +69,7 @@ TOKENIZER_MAPPING = OrderedDict(
         (GPT2Config, GPT2Tokenizer),
         (TransfoXLConfig, TransfoXLTokenizer),
         (XLNetConfig, XLNetTokenizer),
+        (FlaubertConfig, FlaubertTokenizer),
         (XLMConfig, XLMTokenizer),
         (CTRLConfig, CTRLTokenizer),
     ]
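Taken together, the three hunks register Flaubert with the auto classes: the config is imported, the tokenizer is imported, and a (FlaubertConfig, FlaubertTokenizer) entry is added to TOKENIZER_MAPPING. The entry is placed before (XLMConfig, XLMTokenizer) on purpose: FlaubertConfig subclasses XLMConfig and the auto lookup returns the first isinstance match, so putting the child after the parent would make Flaubert checkpoints resolve to XLMTokenizer. A minimal usage sketch (the checkpoint name below is illustrative, not taken from this commit):

from transformers import AutoTokenizer
from transformers.tokenization_flaubert import FlaubertTokenizer

# Hypothetical Flaubert checkpoint identifier; substitute a real one from the model hub.
tokenizer = AutoTokenizer.from_pretrained("flaubert-base-cased")

# The checkpoint's config is a FlaubertConfig, so the mapping should pick FlaubertTokenizer.
assert isinstance(tokenizer, FlaubertTokenizer)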
tests/test_modeling_auto.py

@@ -39,6 +39,14 @@ if is_torch_available():
         BertForQuestionAnswering,
     )
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers.modeling_auto import (
+        MODEL_MAPPING,
+        MODEL_FOR_PRETRAINING_MAPPING,
+        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_WITH_LM_HEAD_MAPPING,
+    )


 @require_torch
@@ -127,3 +135,26 @@ class AutoModelTest(unittest.TestCase):
         self.assertIsInstance(model, RobertaForMaskedLM)
         self.assertEqual(model.num_parameters(), 14830)
         self.assertEqual(model.num_parameters(only_trainable=True), 14830)
+
+    def test_parents_and_children_in_mappings(self):
+        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
+        # by the parents and will return the wrong configuration type when using auto models
+
+        mappings = (
+            MODEL_MAPPING,
+            MODEL_FOR_PRETRAINING_MAPPING,
+            MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+            MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            MODEL_WITH_LM_HEAD_MAPPING,
+        )
+
+        for mapping in mappings:
+            mapping = tuple(mapping.items())
+            for index, (child_config, child_model) in enumerate(mapping[1:]):
+                for parent_config, parent_model in mapping[: index + 1]:
+                    with self.subTest(
+                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
+                    ):
+                        self.assertFalse(issubclass(child_config, parent_config))
+                        self.assertFalse(issubclass(child_model, parent_model))
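The new test encodes the ordering invariant spelled out in its comment: every child class must appear before its parent in the mappings. A minimal, self-contained sketch (not the library's actual lookup code) of why the order matters for an isinstance-based walk:

from collections import OrderedDict


class ParentConfig:
    pass


class ChildConfig(ParentConfig):
    pass


def resolve(mapping, config):
    # Auto classes walk an ordered mapping and return the first isinstance match.
    for config_class, target in mapping.items():
        if isinstance(config, config_class):
            return target
    raise ValueError("unknown config")


good = OrderedDict([(ChildConfig, "child-model"), (ParentConfig, "parent-model")])
bad = OrderedDict([(ParentConfig, "parent-model"), (ChildConfig, "child-model")])

assert resolve(good, ChildConfig()) == "child-model"
assert resolve(bad, ChildConfig()) == "parent-model"   # the parent entry shadows the child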
tests/test_tokenization_auto.py

@@ -25,6 +25,7 @@ from transformers import (
     GPT2Tokenizer,
     RobertaTokenizer,
 )
+from transformers.tokenization_auto import TOKENIZER_MAPPING

 from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow  # noqa: F401
@@ -70,3 +71,19 @@ class AutoTokenizerTest(unittest.TestCase):
         for tokenizer_class in [BertTokenizer, AutoTokenizer]:
             with self.assertRaises(EnvironmentError):
                 _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
+
+    def test_parents_and_children_in_mappings(self):
+        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
+        # by the parents and will return the wrong configuration type when using auto models
+
+        mappings = (TOKENIZER_MAPPING,)
+
+        for mapping in mappings:
+            mapping = tuple(mapping.items())
+            for index, (child_config, child_model) in enumerate(mapping[1:]):
+                for parent_config, parent_model in mapping[: index + 1]:
+                    with self.subTest(
+                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
+                    ):
+                        self.assertFalse(issubclass(child_config, parent_config))
+                        self.assertFalse(issubclass(child_model, parent_model))
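The same ordering check is applied to TOKENIZER_MAPPING, where the parent/child pair introduced by this commit is XLM/Flaubert. A quick sanity check of that relationship (a sketch, assuming the Flaubert and XLM modules are importable at this commit):

from transformers.configuration_flaubert import FlaubertConfig
from transformers.configuration_xlm import XLMConfig
from transformers.tokenization_flaubert import FlaubertTokenizer
from transformers.tokenization_xlm import XLMTokenizer

# FlaubertConfig subclasses XLMConfig (shown in configuration_flaubert.py above),
# which is exactly why (FlaubertConfig, FlaubertTokenizer) must precede
# (XLMConfig, XLMTokenizer) in TOKENIZER_MAPPING.
assert issubclass(FlaubertConfig, XLMConfig)

# The tokenizer is assumed to follow the same pattern (FlaubertTokenizer built on XLMTokenizer).
assert issubclass(FlaubertTokenizer, XLMTokenizer)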