Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
fbd746bd
Commit
fbd746bd
authored
Aug 08, 2019
by
LysandreJik
Browse files
Updated test architecture
parent
6c41a8f5
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
91 additions
and
27 deletions
+91
-27
pytorch_transformers/tests/modeling_roberta_test.py
pytorch_transformers/tests/modeling_roberta_test.py
+40
-3
pytorch_transformers/tests/tokenization_roberta_test.py
pytorch_transformers/tests/tokenization_roberta_test.py
+48
-22
pytorch_transformers/tests/tokenization_tests_commons.py
pytorch_transformers/tests/tokenization_tests_commons.py
+3
-2
No files found.
pytorch_transformers/tests/modeling_roberta_test.py
View file @
fbd746bd
...
@@ -19,8 +19,9 @@ from __future__ import print_function
...
@@ -19,8 +19,9 @@ from __future__ import print_function
import
unittest
import
unittest
import
shutil
import
shutil
import
pytest
import
pytest
import
torch
from
pytorch_transformers
import
(
RobertaConfig
,
RobertaModel
,
RobertaForMaskedLM
)
from
pytorch_transformers
import
(
RobertaConfig
,
RobertaModel
,
RobertaForMaskedLM
,
RobertaForSequenceClassification
)
from
pytorch_transformers.modeling_roberta
import
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from
pytorch_transformers.modeling_roberta
import
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from
.modeling_common_test
import
(
CommonTestCases
,
ConfigTester
,
ids_tensor
)
from
.modeling_common_test
import
(
CommonTestCases
,
ConfigTester
,
ids_tensor
)
...
@@ -156,6 +157,42 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
...
@@ -156,6 +157,42 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
inputs_dict
=
{
'input_ids'
:
input_ids
,
'token_type_ids'
:
token_type_ids
,
'attention_mask'
:
input_mask
}
inputs_dict
=
{
'input_ids'
:
input_ids
,
'token_type_ids'
:
token_type_ids
,
'attention_mask'
:
input_mask
}
return
config
,
inputs_dict
return
config
,
inputs_dict
def
test_inference_masked_lm
(
self
):
model
=
RobertaForMaskedLM
.
from_pretrained
(
'roberta-base'
)
input_ids
=
torch
.
tensor
([[
0
,
31414
,
232
,
328
,
740
,
1140
,
12695
,
69
,
46078
,
1588
,
2
]])
output
=
model
(
input_ids
)[
0
]
expected_shape
=
torch
.
Size
((
1
,
11
,
50265
))
self
.
assertEqual
(
output
.
shape
,
expected_shape
)
# compare the actual values for a slice.
expected_slice
=
torch
.
Tensor
(
[[[
33.8843
,
-
4.3107
,
22.7779
],
[
4.6533
,
-
2.8099
,
13.6252
],
[
1.8222
,
-
3.6898
,
8.8600
]]]
)
self
.
assertTrue
(
torch
.
allclose
(
output
[:,
:
3
,
:
3
],
expected_slice
,
atol
=
1e-3
)
)
# @pytest.mark.slow
def
test_inference_no_head
(
self
):
model
=
RobertaModel
.
from_pretrained
(
'roberta-base'
)
input_ids
=
torch
.
tensor
([[
0
,
31414
,
232
,
328
,
740
,
1140
,
12695
,
69
,
46078
,
1588
,
2
]])
output
=
model
(
input_ids
)[
0
]
# compare the actual values for a slice.
expected_slice
=
torch
.
Tensor
(
[[[
-
0.0231
,
0.0782
,
0.0074
],
[
-
0.1854
,
0.0539
,
-
0.0174
],
[
0.0548
,
0.0799
,
0.1687
]]]
)
self
.
assertTrue
(
torch
.
allclose
(
output
[:,
:
3
,
:
3
],
expected_slice
,
atol
=
1e-3
)
)
def
setUp
(
self
):
def
setUp
(
self
):
self
.
model_tester
=
RobertaModelTest
.
RobertaModelTester
(
self
)
self
.
model_tester
=
RobertaModelTest
.
RobertaModelTester
(
self
)
self
.
config_tester
=
ConfigTester
(
self
,
config_class
=
RobertaConfig
,
hidden_size
=
37
)
self
.
config_tester
=
ConfigTester
(
self
,
config_class
=
RobertaConfig
,
hidden_size
=
37
)
...
@@ -183,7 +220,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
...
@@ -183,7 +220,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
class
RobertaModelIntegrationTest
(
unittest
.
TestCase
):
class
RobertaModelIntegrationTest
(
unittest
.
TestCase
):
@
pytest
.
mark
.
slow
#
@pytest.mark.slow
def
test_inference_masked_lm
(
self
):
def
test_inference_masked_lm
(
self
):
model
=
RobertaForMaskedLM
.
from_pretrained
(
'roberta-base'
)
model
=
RobertaForMaskedLM
.
from_pretrained
(
'roberta-base'
)
...
@@ -204,7 +241,7 @@ class RobertaModelIntegrationTest(unittest.TestCase):
...
@@ -204,7 +241,7 @@ class RobertaModelIntegrationTest(unittest.TestCase):
torch
.
allclose
(
output
[:,
:
3
,
:
3
],
expected_slice
,
atol
=
1e-3
)
torch
.
allclose
(
output
[:,
:
3
,
:
3
],
expected_slice
,
atol
=
1e-3
)
)
)
@
pytest
.
mark
.
slow
#
@pytest.mark.slow
def
test_inference_no_head
(
self
):
def
test_inference_no_head
(
self
):
model
=
RobertaModel
.
from_pretrained
(
'roberta-base'
)
model
=
RobertaModel
.
from_pretrained
(
'roberta-base'
)
...
...
pytorch_transformers/tests/tokenization_roberta_test.py
View file @
fbd746bd
...
@@ -15,43 +15,69 @@
...
@@ -15,43 +15,69 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
json
import
unittest
import
unittest
from
pytorch_transformers.tokenization_roberta
import
RobertaTokenizer
,
VOCAB_FILES_NAMES
from
pytorch_transformers.tokenization_roberta
import
RobertaTokenizer
,
DICT_FILES_NAMES
from
.tokenization_tests_commons
import
create_and_check_tokenizer_commons
,
TemporaryDirectory
from
pytorch_transformers.tokenization_gpt2
import
GPT2Tokenizer
,
VOCAB_FILES_NAMES
from
.tokenization_tests_commons
import
CommonTestCases
class
RobertaTokenizationTest
(
unittest
.
TestCase
):
class
RobertaTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
tokenizer_class
=
RobertaTokenizer
def
test_full_tokenizer
(
self
):
def
setUp
(
self
):
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
super
(
RobertaTokenizationTest
,
self
).
setUp
()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab
=
[
"l"
,
"o"
,
"w"
,
"e"
,
"r"
,
"s"
,
"t"
,
"i"
,
"d"
,
"n"
,
vocab
=
[
"l"
,
"o"
,
"w"
,
"e"
,
"r"
,
"s"
,
"t"
,
"i"
,
"d"
,
"n"
,
"lo"
,
"low"
,
"er"
,
"lo"
,
"low"
,
"er"
,
"low"
,
"lowest"
,
"newer"
,
"wider"
,
"<unk>"
]
"low"
,
"lowest"
,
"newer"
,
"wider"
,
"<unk>"
]
vocab_tokens
=
dict
(
zip
(
vocab
,
range
(
len
(
vocab
))))
vocab_tokens
=
dict
(
zip
(
vocab
,
range
(
len
(
vocab
))))
special_tokens_map
=
{
"unk_token"
:
"<unk>"
}
merges
=
[
"#version: 0.2"
,
"l o"
,
"lo w"
,
"e r"
,
""
]
self
.
special_tokens_map
=
{
"unk_token"
:
"<unk>"
}
self
.
vocab_file
=
os
.
path
.
join
(
self
.
tmpdirname
,
VOCAB_FILES_NAMES
[
'vocab_file'
])
self
.
merges_file
=
os
.
path
.
join
(
self
.
tmpdirname
,
VOCAB_FILES_NAMES
[
'merges_file'
])
with
open
(
self
.
vocab_file
,
"w"
)
as
fp
:
fp
.
write
(
json
.
dumps
(
vocab_tokens
))
with
open
(
self
.
merges_file
,
"w"
)
as
fp
:
fp
.
write
(
"
\n
"
.
join
(
merges
))
with
TemporaryDirectory
()
as
tmpdirname
:
def
get_tokenizer
(
self
):
vocab_file
=
os
.
path
.
join
(
tmpdirname
,
VOCAB_FILES_NAMES
[
'vocab_file'
])
bpe_tokenizer
=
GPT2Tokenizer
.
from_pretrained
(
self
.
tmpdirname
,
**
self
.
special_tokens_map
)
with
open
(
vocab_file
,
"w"
)
as
fp
:
return
RobertaTokenizer
.
from_pretrained
(
"roberta-base"
,
bpe_tokenizer
=
bpe_tokenizer
)
[
fp
.
write
(
f
"
{
vocab
}
{
index
}
\n
"
)
for
index
,
vocab
in
enumerate
(
vocab_tokens
)]
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
input_text
=
u
"lower newer"
output_text
=
u
"lower<unk>newer"
output_text
=
u
"lower<unk>newer"
return
input_text
,
output_text
create_and_check_tokenizer_commons
(
self
,
input_text
,
output_text
,
RobertaTokenizer
,
tmpdirname
,
**
special_tokens_map
)
def
test_full_tokenizer
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
RobertaTokenizer
(
vocab_file
,
**
special_tokens_map
)
text
=
"lower"
text
=
"lower"
bpe_tokens
=
[
"low"
,
"er"
]
bpe_tokens
=
[
"low"
,
"er"
]
tokens
=
tokenizer
.
tokenize
(
text
)
tokens
=
tokenizer
.
tokenize
(
text
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_bpe_tokens
=
[
13
,
12
,
17
]
input_bpe_tokens
=
[
0
,
4
,
12
,
176
,
2
]
tokenizer
.
convert_tokens_to_ids
(
input_tokens
)
self
.
assertListEqual
(
self
.
assertListEqual
(
tokenizer
.
convert_tokens_to_ids
(
input_tokens
),
input_bpe_tokens
)
tokenizer
.
convert_tokens_to_ids
(
input_tokens
),
input_bpe_tokens
)
def
roberta_dict_integration_testing
(
self
):
tokenizer
=
self
.
get_tokenizer
()
self
.
assertListEqual
(
tokenizer
.
encode
(
'Hello world!'
),
[
0
,
31414
,
232
,
328
,
2
]
)
self
.
assertListEqual
(
tokenizer
.
encode
(
'Hello world! cécé herlolip'
),
[
0
,
31414
,
232
,
328
,
740
,
1140
,
12695
,
69
,
46078
,
1588
,
2
]
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
pytorch_transformers/tests/tokenization_tests_commons.py
View file @
fbd746bd
...
@@ -105,7 +105,7 @@ class CommonTestCases:
...
@@ -105,7 +105,7 @@ class CommonTestCases:
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
tokens
=
tokenizer
.
encode
(
"aaaaabbbbbb low cccccccccdddddddd l"
)
tokens
=
tokenizer
.
encode
(
"aaaaabbbbbb low cccccccccdddddddd l"
,
no_sep_cls_tokens
=
True
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
-
2
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
-
2
],
tokenizer
.
vocab_size
-
1
)
...
@@ -121,7 +121,8 @@ class CommonTestCases:
...
@@ -121,7 +121,8 @@ class CommonTestCases:
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
)
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
,
no_sep_cls_tokens
=
True
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment