chenpangpang / transformers · Commits · 5340d1f2

Unverified commit 5340d1f2, authored Nov 27, 2019 by Thomas Wolf, committed by GitHub on Nov 27, 2019.

    Merge branch 'master' into resumable_http

Parents: 0e4cc050, 10bd1ddb
Changes: 131. Showing 20 changed files with 684 additions and 66 deletions (+684 -66).
transformers/tests/modeling_auto_test.py            +4   -0
transformers/tests/modeling_bert_test.py            +69  -20
transformers/tests/modeling_common_test.py          +50  -4
transformers/tests/modeling_distilbert_test.py      +20  -0
transformers/tests/modeling_encoder_decoder_test.py +53  -0
transformers/tests/modeling_roberta_test.py         +18  -1
transformers/tests/modeling_tf_albert_test.py       +231 -0
transformers/tests/modeling_tf_bert_test.py         +0   -4
transformers/tests/modeling_tf_common_test.py       +39  -0
transformers/tests/modeling_tf_roberta_test.py      +15  -0
transformers/tests/optimization_test.py             +17  -12
transformers/tests/tokenization_albert_test.py      +78  -0
transformers/tests/tokenization_auto_test.py        +1   -0
transformers/tests/tokenization_bert_test.py        +4   -2
transformers/tests/tokenization_distilbert_test.py  +4   -2
transformers/tests/tokenization_roberta_test.py     +6   -4
transformers/tests/tokenization_tests_commons.py    +65  -13
transformers/tests/tokenization_utils_test.py       +2   -0
transformers/tests/tokenization_xlm_test.py         +4   -2
transformers/tests/tokenization_xlnet_test.py       +4   -2
transformers/tests/modeling_auto_test.py (+4 -0)

@@ -38,6 +38,7 @@ else:
 class AutoModelTest(unittest.TestCase):

+    @pytest.mark.slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -52,6 +53,7 @@ class AutoModelTest(unittest.TestCase):
             for value in loading_info.values():
                 self.assertEqual(len(value), 0)

+    @pytest.mark.slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -64,6 +66,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForMaskedLM)

+    @pytest.mark.slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -76,6 +79,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForSequenceClassification)

+    @pytest.mark.slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
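Note: the @pytest.mark.slow marker added above (and the use_cuda fixture referenced in modeling_bert_test.py below) are provided by the test suite's pytest configuration, which is not part of this diff. The following conftest.py is only a hypothetical sketch of how such a marker and fixture could be wired up; the option names --runslow and --use_cuda are assumptions for illustration, not the repository's actual flags.

# conftest.py -- hypothetical sketch, not part of this commit
import pytest


def pytest_addoption(parser):
    # Assumed command-line flags; the real test suite may name these differently.
    parser.addoption("--runslow", action="store_true", default=False, help="run tests marked as slow")
    parser.addoption("--use_cuda", action="store_true", default=False, help="run model tests on GPU")


def pytest_configure(config):
    # Register the custom marker so pytest does not warn about it.
    config.addinivalue_line("markers", "slow: mark test as slow to run")


def pytest_collection_modifyitems(config, items):
    # Skip slow tests unless --runslow was passed on the command line.
    if config.getoption("--runslow"):
        return
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for item in items:
        if "slow" in item.keywords:
            item.add_marker(skip_slow)


@pytest.fixture
def use_cuda(request):
    # Exposes the --use_cuda flag to tests decorated with @pytest.mark.usefixtures("use_cuda").
    return request.config.getoption("--use_cuda")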
transformers/tests/modeling_bert_test.py (+69 -20)

@@ -22,24 +22,25 @@ import pytest
 from transformers import is_torch_available

-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester

 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
                               BertForNextSentencePrediction, BertForPreTraining,
                               BertForQuestionAnswering, BertForSequenceClassification,
                               BertForTokenClassification, BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")


+@pytest.mark.usefixtures("use_cuda")
 class BertModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
                          BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
                          BertForTokenClassification) if is_torch_available() else ()

     class BertModelTester(object):
@@ -66,7 +67,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     ):
+                     device='cpu',
+                     ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -89,25 +91,26 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
+            self.device = device

         def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)

             input_mask = None
             if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)

             token_type_ids = None
             if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)

             sequence_labels = None
             token_labels = None
             choice_labels = None
             if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)

             config = BertConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -120,10 +123,20 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
+                is_decoder=False,
                 initializer_range=self.initializer_range)

             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

+        def prepare_config_and_inputs_for_decoder(self):
+            config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs()
+            config.is_decoder = True
+            encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
+            encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask
+
         def check_loss_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss"].size()),
@@ -131,6 +144,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):

         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
+            model.to(input_ids.device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -145,6 +159,21 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.hidden_size])
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

+        def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
+            model = BertModel(config)
+            model.eval()
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+            result = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
@@ -159,6 +188,20 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.check_loss_output(result)

+        def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
+            model = BertForMaskedLM(config=config)
+            model.eval()
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
             model.eval()
@@ -172,7 +215,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, 2])
             self.check_loss_output(result)

-
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
             model.eval()
@@ -191,7 +233,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, 2])
             self.check_loss_output(result)

-
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
             model.eval()
@@ -210,7 +251,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length])
             self.check_loss_output(result)

-
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
@@ -225,7 +265,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_labels])
             self.check_loss_output(result)

-
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
@@ -240,7 +279,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.num_labels])
             self.check_loss_output(result)

-
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
@@ -261,7 +299,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_choices])
             self.check_loss_output(result)

-
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, token_type_ids, input_mask,
@@ -276,14 +313,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def test_config(self):
         self.config_tester.run_common_tests()

-    def test_bert_model(self):
+    def test_bert_model(self, use_cuda=False):
+        # ^^ This could be a real fixture
+        if use_cuda:
+            self.model_tester.device = "cuda"
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_model(*config_and_inputs)

+    def test_bert_model_as_decoder(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_bert_model_as_decoder(*config_and_inputs)
+
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)

+    def test_for_masked_lm_decoder(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_bert_model_for_masked_lm_as_decoder(*config_and_inputs)
+
     def test_for_multiple_choice(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
@@ -316,5 +364,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)

+
 if __name__ == "__main__":
     unittest.main()
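For context, the new decoder-mode helpers exercise the cross-attention path that BertModel gains when config.is_decoder is set: the forward pass then also accepts encoder_hidden_states and an optional encoder_attention_mask, exactly as create_and_check_bert_model_as_decoder calls it. The following is a minimal standalone sketch of that call pattern; the configuration sizes are illustrative and not taken from this diff.

import torch
from transformers import BertConfig, BertModel

# Illustrative toy sizes; any self-consistent values work.
config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=4,
                    intermediate_size=37, is_decoder=True)

model = BertModel(config)
model.eval()

batch_size, seq_length = 2, 7
input_ids = torch.randint(0, 99, (batch_size, seq_length))
# "Encoder" states the decoder cross-attends to, mirroring prepare_config_and_inputs_for_decoder.
encoder_hidden_states = torch.rand(batch_size, seq_length, config.hidden_size)
encoder_attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)

with torch.no_grad():
    sequence_output, pooled_output = model(input_ids,
                                           encoder_hidden_states=encoder_hidden_states,
                                           encoder_attention_mask=encoder_attention_mask)
print(sequence_output.shape)  # (batch_size, seq_length, hidden_size)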
transformers/tests/modeling_common_test.py (+50 -4)

@@ -35,7 +35,7 @@ if is_torch_available():
     import torch
     import numpy as np

-    from transformers import (PretrainedConfig, PreTrainedModel,
+    from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
                               BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -463,6 +463,21 @@ class CommonTestCases:
             self.assertTrue(models_equal)

+        def test_model_common_attributes(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
+                model.set_input_embeddings(torch.nn.Embedding(10, 10))
+                x = model.get_output_embeddings()
+                self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+
         def test_tie_model_weights(self):
             if not self.test_torchscript:
                 return
@@ -477,11 +492,11 @@ class CommonTestCases:
                 return equal

             for model_class in self.all_model_classes:
-                if not hasattr(model_class, 'tie_weights'):
-                    continue
-
                 config.torchscript = True
                 model_not_tied = model_class(config)
+                if model_not_tied.get_output_embeddings() is None:
+                    continue
+
                 params_not_tied = list(model_not_tied.parameters())

                 config_tied = copy.deepcopy(config)
@@ -516,6 +531,19 @@ class CommonTestCases:
             # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
             # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))

+        def test_inputs_embeds(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.eval()
+
+                wte = model.get_input_embeddings()
+                inputs_dict["inputs_embeds"] = wte(input_ids)
+                outputs = model(**inputs_dict)
+
     class GPTModelTester(CommonModelTester):
@@ -688,6 +716,7 @@ class CommonTestCases:
             config_and_inputs = self.prepare_config_and_inputs()
             self.create_and_check_presents(*config_and_inputs)

+        @pytest.mark.slow
         def run_slow_tests(self):
             self.create_and_check_model_from_pretrained()
@@ -744,7 +773,24 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()


+def floats_tensor(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor of the shape within the vocab size."""
+    if rng is None:
+        rng = global_rng
+
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.random() * scale)
+
+    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+
+
 class ModelUtilsTest(unittest.TestCase):
+    @pytest.mark.slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
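The new floats_tensor helper mirrors the existing ids_tensor (a flat list of rng.random() * scale values reshaped to the requested size), and test_model_common_attributes relies on the get_input_embeddings / set_input_embeddings / get_output_embeddings accessors that every PyTorch model exposes. A short sketch of those accessors in isolation; the configuration values are assumed for illustration, and torch.rand stands in for floats_tensor so the snippet stays self-contained.

import torch
from transformers import BertConfig, BertModel

# floats_tensor([2, 3, 4]) in the test helpers builds a random float32 tensor of that shape;
# torch.rand gives the same kind of tensor without importing the test module.
encoder_states = torch.rand(2, 3, 4)

config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=4, intermediate_size=37)
model = BertModel(config)

# The accessors exercised by test_model_common_attributes:
embeddings = model.get_input_embeddings()          # a torch.nn.Embedding (or AdaptiveEmbedding)
model.set_input_embeddings(torch.nn.Embedding(99, 32))
output_embeddings = model.get_output_embeddings()  # None for the base model, a Linear for LM heads
print(type(embeddings).__name__, output_embeddings)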
transformers/tests/modeling_distilbert_test.py (+20 -0)

@@ -23,6 +23,7 @@ from transformers import is_torch_available

 if is_torch_available():
     from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
+                              DistilBertForTokenClassification,
                               DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -180,6 +181,21 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_labels])
             self.check_loss_output(result)

+        def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = DistilBertForTokenClassification(config=config)
+            model.eval()
+
+            loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.seq_length, self.num_labels])
+            self.check_loss_output(result)
+
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
@@ -209,6 +225,10 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)

+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
+
     # @pytest.mark.slow
     # def test_model_from_pretrained(self):
     #     cache_dir = "/tmp/transformers_test/"
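As a usage note, the new token-classification head follows the calling convention the tester exercises: passing labels returns a (loss, logits) pair with logits shaped (batch, seq_length, num_labels). A minimal sketch with an assumed toy configuration (the DistilBERT size parameters below are illustrative):

import torch
from transformers import DistilBertConfig, DistilBertForTokenClassification

# Toy sizes chosen for illustration only.
config = DistilBertConfig(vocab_size_or_config_json_file=99, dim=32, n_layers=2,
                          n_heads=4, hidden_dim=37)
config.num_labels = 3
model = DistilBertForTokenClassification(config=config)
model.eval()

input_ids = torch.randint(0, 99, (2, 7))
labels = torch.randint(0, 3, (2, 7))

loss, logits = model(input_ids, labels=labels)
print(loss.item(), logits.shape)  # logits: (2, 7, num_labels)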
transformers/tests/modeling_encoder_decoder_test.py (new file, 0 → 100644, +53 -0)

# coding=utf-8
# Copyright 2018 The Hugging Face Inc. Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import unittest

import pytest

from transformers import is_torch_available

if is_torch_available():
    from transformers import BertModel, BertForMaskedLM, Model2Model
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")


class EncoderDecoderModelTest(unittest.TestCase):

    @pytest.mark.slow
    def test_model2model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = Model2Model.from_pretrained(model_name)
            self.assertIsInstance(model.encoder, BertModel)
            self.assertIsInstance(model.decoder, BertForMaskedLM)
            self.assertEqual(model.decoder.config.is_decoder, True)
            self.assertEqual(model.encoder.config.is_decoder, False)

    def test_model2model_from_pretrained_not_bert(self):
        logging.basicConfig(level=logging.INFO)
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained('roberta')

        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained('distilbert')

        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained('does-not-exist')


if __name__ == "__main__":
    unittest.main()
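This new test file exercises the Model2Model encoder-decoder wrapper: Model2Model.from_pretrained() loads a BERT checkpoint into both halves, with only the decoder flagged via config.is_decoder, and it rejects non-BERT names with a ValueError. A brief sketch of the behaviour the slow test checks; the checkpoint name is the first key of BERT_PRETRAINED_MODEL_ARCHIVE_MAP, and only the attributes asserted above are guaranteed by this diff.

from transformers import Model2Model

# 'bert-base-uncased' is the checkpoint the slow test above would actually download.
model = Model2Model.from_pretrained('bert-base-uncased')

# The wrapper pairs a BertModel encoder with a BertForMaskedLM decoder,
# and only the decoder has config.is_decoder set.
print(type(model.encoder).__name__)        # BertModel
print(type(model.decoder).__name__)        # BertForMaskedLM
print(model.encoder.config.is_decoder)     # False
print(model.decoder.config.is_decoder)     # True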
transformers/tests/modeling_roberta_test.py (+18 -1)

@@ -24,7 +24,8 @@ from transformers import is_torch_available

 if is_torch_available():
     import torch
-    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
+    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification,
+                              RobertaForTokenClassification)
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -156,6 +157,22 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.check_loss_output(result)

+        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = RobertaForTokenClassification(config=config)
+            model.eval()
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.seq_length, self.num_labels])
+            self.check_loss_output(result)
+
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, token_type_ids, input_mask,
transformers/tests/modeling_tf_albert_test.py (new file, 0 → 100644, +231 -0)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester

from transformers import AlbertConfig, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
                                                 TFAlbertForSequenceClassification,
                                                 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFAlbertModel, TFAlbertForMaskedLM,
                         TFAlbertForSequenceClassification) if is_tf_available() else ()

    class TFAlbertModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
                     embedding_size=16,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.embedding_size = embedding_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = TFAlbertModel(config=config)
            # inputs = {'input_ids': input_ids,
            #           'attention_mask': input_mask,
            #           'token_type_ids': token_type_ids}
            # sequence_output, pooled_output = model(**inputs)

            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            sequence_output, pooled_output = model(inputs)

            inputs = [input_ids, input_mask]
            sequence_output, pooled_output = model(inputs)

            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output.numpy(),
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = TFAlbertForMaskedLM(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            prediction_scores, = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = TFAlbertForSequenceClassification(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            logits, = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape),
                [self.batch_size, self.num_labels])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_albert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['albert-base-uncased']:
            model = TFAlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
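create_and_check_albert_model above also documents the three input formats the TF 2.0 models accept: a dict of named tensors, a list of positional tensors, or a bare input_ids tensor. A small sketch of the same pattern, assuming an ad-hoc AlbertConfig with toy sizes rather than a downloaded checkpoint:

import tensorflow as tf
from transformers import AlbertConfig
from transformers.modeling_tf_albert import TFAlbertModel

# Illustrative toy configuration (values are assumptions, not from the diff).
config = AlbertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                      num_hidden_layers=2, num_attention_heads=4, intermediate_size=37)
model = TFAlbertModel(config=config)

input_ids = tf.constant([[31, 51, 9], [15, 5, 0]])
attention_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.zeros_like(input_ids)

# 1) dict of named inputs
sequence_output, pooled_output = model({'input_ids': input_ids,
                                        'attention_mask': attention_mask,
                                        'token_type_ids': token_type_ids})
# 2) list of positional inputs
sequence_output, pooled_output = model([input_ids, attention_mask])
# 3) bare input_ids tensor
sequence_output, pooled_output = model(input_ids)
print(sequence_output.shape)  # (2, 3, hidden_size)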
transformers/tests/modeling_tf_bert_test.py (+0 -4)

@@ -131,10 +131,6 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = TFBertModel(config=config)
-            # inputs = {'input_ids': input_ids,
-            #           'attention_mask': input_mask,
-            #           'token_type_ids': token_type_ids}
-            # sequence_output, pooled_output = model(**inputs)
             inputs = {'input_ids': input_ids,
                       'attention_mask': input_mask,
                       'token_type_ids': token_type_ids}
transformers/tests/modeling_tf_common_test.py (+39 -0)

@@ -360,6 +360,16 @@ class TFCommonTestCases:
             #     self.assertTrue(models_equal)

+        def test_model_common_attributes(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+                x = model.get_output_embeddings()
+                assert x is None or isinstance(x, tf.keras.layers.Layer)
+
         def test_tie_model_weights(self):
             pass
             # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -401,6 +411,35 @@ class TFCommonTestCases:
             first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
             self.assertTrue(tf.math.equal(first, second).numpy().all())

+        def test_inputs_embeds(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+
+                wte = model.get_input_embeddings()
+                try:
+                    x = wte(input_ids, mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids], mode="embedding")
+                    except:
+                        try:
+                            x = wte([input_ids, None, None, None], mode="embedding")
+                        except:
+                            if hasattr(self.model_tester, "embedding_size"):
+                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                            else:
+                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+                # ^^ In our TF models, the input_embeddings can take slightly different forms,
+                # so we try a few of them.
+                # We used to fall back to just synthetically creating a dummy tensor of ones:
+                #
+                inputs_dict["inputs_embeds"] = x
+                outputs = model(inputs_dict)
+
 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     """Creates a random int32 tensor of the shape within the vocab size."""
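The new test_inputs_embeds checks that every TF model can consume pre-computed embeddings in place of input_ids by putting an "inputs_embeds" entry into the input dict. A condensed sketch of that pattern with TFBertModel, using a synthetic embedding tensor in the same spirit as the test's tf.ones fallback; the configuration values are illustrative assumptions.

import tensorflow as tf
from transformers import BertConfig
from transformers.modeling_tf_bert import TFBertModel

config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=4, intermediate_size=37)
model = TFBertModel(config=config)

batch_size, seq_length = 2, 3
attention_mask = tf.ones((batch_size, seq_length), dtype=tf.int32)
token_type_ids = tf.zeros((batch_size, seq_length), dtype=tf.int32)

# A synthetic embedding tensor of shape (batch, seq, hidden_size), as the test's fallback builds.
inputs_embeds = tf.ones((batch_size, seq_length, config.hidden_size), dtype=tf.dtypes.float32)

outputs = model({'inputs_embeds': inputs_embeds,
                 'attention_mask': attention_mask,
                 'token_type_ids': token_type_ids})
sequence_output = outputs[0]
print(sequence_output.shape)  # (batch_size, seq_length, hidden_size)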
transformers/tests/modeling_tf_roberta_test.py (+15 -0)

@@ -30,6 +30,7 @@ if is_tf_available():
     import numpy
     from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
                                                   TFRobertaForSequenceClassification,
+                                                  TFRobertaForTokenClassification,
                                                   TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     pytestmark = pytest.mark.skip("Require TensorFlow")
@@ -154,6 +155,20 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                 list(result["prediction_scores"].shape),
                 [self.batch_size, self.seq_length, self.vocab_size])

+        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = TFRobertaForTokenClassification(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.seq_length, self.num_labels])
+
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, token_type_ids, input_mask,
transformers/tests/optimization_test.py (+17 -12)

@@ -25,8 +25,12 @@ from transformers import is_torch_available

 if is_torch_available():
     import torch
-    from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
-                              WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+    from transformers import (AdamW,
+                              get_constant_schedule,
+                              get_constant_schedule_with_warmup,
+                              get_cosine_schedule_with_warmup,
+                              get_cosine_with_hard_restarts_schedule_with_warmup,
+                              get_linear_schedule_with_warmup)
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -87,59 +91,60 @@ class ScheduleInitTest(unittest.TestCase):
         self.assertAlmostEqual(a, b, delta=tol)

     def test_constant_scheduler(self):
-        scheduler = ConstantLRSchedule(self.optimizer)
+        scheduler = get_constant_schedule(self.optimizer)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [10.] * self.num_steps
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-        scheduler = ConstantLRSchedule(self.optimizer)
+        scheduler = get_constant_schedule(self.optimizer)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_constant_scheduler(self):
-        scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-        scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_linear_scheduler(self):
-        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_cosine_scheduler(self):
-        scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

-        scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_cosine_hard_restart_scheduler(self):
-        scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

-        scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])


 if __name__ == "__main__":
     unittest.main()
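This file tracks the renaming of the optimization schedules from classes (ConstantLRSchedule, WarmupLinearSchedule, ...) to get_*_schedule_with_warmup factory functions, which return standard PyTorch LambdaLR schedulers. Typical training-loop usage of the new API looks like the following sketch; the model and data are placeholders, not anything from this commit.

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

# Placeholder model and data; any torch.nn.Module and loss would do.
model = torch.nn.Linear(10, 2)
inputs = torch.randn(32, 10)
targets = torch.randint(0, 2, (32,))

num_training_steps = 100
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10,
                                            num_training_steps=num_training_steps)

loss_fn = torch.nn.CrossEntropyLoss()
for step in range(num_training_steps):
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()
    scheduler.step()  # the schedule is stepped once per optimizer update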
transformers/tests/tokenization_albert_test.py 0 → 100644
View file @ 5340d1f2
# coding=utf-8
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest

from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)

from .tokenization_tests_commons import CommonTestCases

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'fixtures/spiece.model')


class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = AlbertTokenizer

    def setUp(self):
        super(AlbertTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"this is a test"
        output_text = u"this is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and',
                                      u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and',
                                           '▁this', '▁is', '▁fal', 's', '<unk>', '.'])

    def test_sequence_builders(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
            text_2 + [tokenizer.sep_token_id]


if __name__ == '__main__':
    unittest.main()
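The new test builds an AlbertTokenizer directly from the shared SentencePiece fixture and checks tokenize/convert round-trips plus the [CLS] ... [SEP] sequence-pair layout. A minimal hedged sketch of the same round-trip outside the test harness; 'spiece.model' below is a placeholder path to any local SentencePiece vocabulary, not a file added by this commit:

from transformers.tokenization_albert import AlbertTokenizer

tokenizer = AlbertTokenizer("spiece.model", keep_accents=True)   # any SentencePiece vocab
tokens = tokenizer.tokenize(u"This is a test")
ids = tokenizer.convert_tokens_to_ids(tokens)
# Converting back reproduces the pieces as long as each one is in the vocabulary.
assert tokenizer.convert_ids_to_tokens(ids) == tokens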
transformers/tests/tokenization_auto_test.py
View file @ 5340d1f2
...
@@ -26,6 +26,7 @@ from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CON
 class AutoTokenizerTest(unittest.TestCase):
+    @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
...
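test_tokenizer_from_pretrained is now tagged @pytest.mark.slow because it resolves and downloads a real pretrained vocabulary rather than a local fixture. A hedged sketch of the call it exercises (the checkpoint name is used for illustration only):

from transformers import AutoTokenizer

# AutoTokenizer picks the concrete tokenizer class from the checkpoint name;
# this triggers a download, which is why the test is marked slow.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tokenizer).__name__)   # -> "BertTokenizer"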
transformers/tests/tokenization_bert_test.py
View file @ 5340d1f2
...
@@ -16,6 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
+import pytest

 from io import open

 from transformers.tokenization_bert import (BasicTokenizer,
...
@@ -125,11 +126,12 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertFalse(_is_punctuation(u"A"))
         self.assertFalse(_is_punctuation(u" "))

+    @pytest.mark.slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
...
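The add_special_tokens=False edits here (and in the other tokenizer tests below) are consistent with encode now inserting model-specific special tokens by default, so tests that feed build_inputs_with_special_tokens must request the raw ids explicitly. A hedged illustration of that relationship (checkpoint name for illustration only):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
raw = tokenizer.encode("sequence builders", add_special_tokens=False)
with_special = tokenizer.encode("sequence builders", add_special_tokens=True)
# [CLS] + raw + [SEP]: exactly what the sequence-builder tests reconstruct by hand.
assert with_special == tokenizer.build_inputs_with_special_tokens(raw)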
transformers/tests/tokenization_distilbert_test.py
View file @ 5340d1f2
...
@@ -16,6 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
+import pytest

 from io import open

 from transformers.tokenization_distilbert import (DistilBertTokenizer)
...
@@ -30,11 +31,12 @@ class DistilBertTokenizationTest(BertTokenizationTest):
     def get_tokenizer(self, **kwargs):
         return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

+    @pytest.mark.slow
     def test_sequence_builders(self):
         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
...
transformers/tests/tokenization_roberta_test.py
View file @ 5340d1f2
...
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import json
 import unittest
+import pytest

 from io import open

 from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
...
@@ -70,19 +71,20 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         tokenizer = self.get_tokenizer()

         self.assertListEqual(
-            tokenizer.encode('Hello world!'),
+            tokenizer.encode('Hello world!', add_special_tokens=False),
             [0, 31414, 232, 328, 2]
         )
         self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip 418'),
+            tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
         )

+    @pytest.mark.slow
     def test_sequence_builders(self):
         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
         encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
...
transformers/tests/tokenization_tests_commons.py
View file @ 5340d1f2
...
@@ -79,13 +79,13 @@ class CommonTestCases:
             # Now let's start the test
             tokenizer = self.get_tokenizer(max_len=42)

-            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)

             with TemporaryDirectory() as tmpdirname:
                 tokenizer.save_pretrained(tmpdirname)
                 tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

-                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
                 self.assertListEqual(before_tokens, after_tokens)

                 self.assertEqual(tokenizer.max_len, 42)
...
@@ -110,6 +110,36 @@ class CommonTestCases:
                 self.assertListEqual(subwords, subwords_loaded)

+        def test_added_tokens_do_lower_case(self):
+            tokenizer = self.get_tokenizer(do_lower_case=True)
+
+            text = "aaaaa bbbbbb low cccccccccdddddddd l"
+            text2 = "AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l"
+
+            toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
+
+            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD']
+            added = tokenizer.add_tokens(new_toks)
+            self.assertEqual(added, 2)
+
+            toks = tokenizer.tokenize(text)
+            toks2 = tokenizer.tokenize(text2)
+
+            self.assertEqual(len(toks), len(toks2))
+            self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
+            self.assertListEqual(toks, toks2)
+
+            tokenizer = self.get_tokenizer(do_lower_case=False)
+
+            added = tokenizer.add_tokens(new_toks)
+            self.assertEqual(added, 4)
+
+            toks = tokenizer.tokenize(text)
+            toks2 = tokenizer.tokenize(text2)
+
+            self.assertEqual(len(toks), len(toks2))  # Length should still be the same
+            self.assertNotEqual(len(toks), len(toks0))
+            self.assertNotEqual(toks[0], toks2[0])  # But at least the first tokens should differ
+
         def test_add_tokens_tokenizer(self):
             tokenizer = self.get_tokenizer()
...
@@ -130,7 +160,7 @@ class CommonTestCases:
             self.assertEqual(added_toks, len(new_toks))
             self.assertEqual(all_size_2, all_size + len(new_toks))

-            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
+            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
             out_string = tokenizer.decode(tokens)

             self.assertGreaterEqual(len(tokens), 4)
...
@@ -148,7 +178,8 @@ class CommonTestCases:
             self.assertEqual(added_toks_2, len(new_toks_2))
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
+                                      add_special_tokens=False)
             out_string = tokenizer.decode(tokens)

             self.assertGreaterEqual(len(tokens), 6)
...
@@ -159,6 +190,26 @@ class CommonTestCases:
             self.assertEqual(tokens[0], tokenizer.eos_token_id)
             self.assertEqual(tokens[-2], tokenizer.pad_token_id)

+        def test_add_special_tokens(self):
+            tokenizer = self.get_tokenizer()
+            input_text, output_text = self.get_input_output_texts()
+
+            special_token = "[SPECIAL TOKEN]"
+
+            tokenizer.add_special_tokens({"cls_token": special_token})
+            encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
+            assert len(encoded_special_token) == 1
+
+            text = " ".join([input_text, special_token, output_text])
+            encoded = tokenizer.encode(text, add_special_tokens=False)
+
+            input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
+            output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
+            special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
+            assert encoded == input_encoded + special_token_id + output_encoded
+
+            decoded = tokenizer.decode(encoded, skip_special_tokens=True)
+            assert special_token not in decoded
+
         def test_required_methods_tokenizer(self):
             tokenizer = self.get_tokenizer()
...
@@ -166,7 +217,7 @@ class CommonTestCases:
             tokens = tokenizer.tokenize(input_text)
             ids = tokenizer.convert_tokens_to_ids(tokens)
-            ids_2 = tokenizer.encode(input_text)
+            ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
             self.assertListEqual(ids, ids_2)

             tokens_2 = tokenizer.convert_ids_to_tokens(ids)
...
@@ -206,7 +257,7 @@ class CommonTestCases:
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
-            sequences = tokenizer.encode(seq_0, seq_1)
+            sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
             attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

             # Method is implemented (e.g. not GPT-2)
...
@@ -219,7 +270,7 @@ class CommonTestCases:
             seq_0 = "This is a sentence to be encoded."
             stride = 2

-            sequence = tokenizer.encode(seq_0)
+            sequence = tokenizer.encode(seq_0, add_special_tokens=False)
             num_added_tokens = tokenizer.num_added_tokens()
             total_length = len(sequence) + num_added_tokens
             information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
...
@@ -239,13 +290,13 @@ class CommonTestCases:
             seq_1 = "This is another sentence to be encoded."
             stride = 2

-            sequence_0_no_special_tokens = tokenizer.encode(seq_0)
-            sequence_1_no_special_tokens = tokenizer.encode(seq_1)
+            sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
+            sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

             sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
             truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
-                tokenizer.encode(seq_0),
-                tokenizer.encode(seq_1)[:-2]
+                tokenizer.encode(seq_0, add_special_tokens=False),
+                tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
             )

             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
...
@@ -283,7 +334,7 @@ class CommonTestCases:
             sequence_1 = "This one too please."

             # Testing single inputs
-            encoded_sequence = tokenizer.encode(sequence_0)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
...
@@ -294,7 +345,8 @@ class CommonTestCases:
             self.assertEqual(encoded_sequence, filtered_sequence)

             # Testing inputs pairs
-            encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + \
+                tokenizer.encode(sequence_1, add_special_tokens=False)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
...
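The two new common tests pin down how user-added tokens interact with lower-casing and how a token registered through add_special_tokens can be stripped at decode time. A hedged sketch of the behaviour they assert, using a pretrained checkpoint purely for illustration (the custom tokens below are made up):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["new_tok_1", "new_tok_2"])                  # kept as single units when tokenizing
tokenizer.add_special_tokens({"cls_token": "[MY_CLS]"})           # registered as a special token

ids = tokenizer.encode("new_tok_1 [MY_CLS] new_tok_2", add_special_tokens=False)
print(tokenizer.decode(ids, skip_special_tokens=True))            # "[MY_CLS]" is dropped from the output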
transformers/tests/tokenization_utils_test.py
View file @ 5340d1f2
...
@@ -18,11 +18,13 @@ from __future__ import print_function
 import unittest
 import six
+import pytest

 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer


 class TokenizerUtilsTest(unittest.TestCase):
+    @pytest.mark.slow
     def check_tokenizer_from_pretrained(self, tokenizer_class):
         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
         for model_name in s3_models[:1]:
...
transformers/tests/tokenization_xlm_test.py
View file @ 5340d1f2
...
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
+import pytest

 from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
...
@@ -66,11 +67,12 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

+    @pytest.mark.slow
     def test_sequence_builders(self):
         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
...
transformers/tests/tokenization_xlnet_test.py
View file @ 5340d1f2
...
@@ -16,6 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
+import pytest

 from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
...
@@ -89,11 +90,12 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])

+    @pytest.mark.slow
     def test_sequence_builders(self):
         tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
...