chenpangpang / transformers · Commit fbd746bd

Updated test architecture

Authored Aug 08, 2019 by LysandreJik
Parent: 6c41a8f5

Showing 3 changed files with 91 additions and 27 deletions (+91 −27):

  pytorch_transformers/tests/modeling_roberta_test.py        +40  −3
  pytorch_transformers/tests/tokenization_roberta_test.py    +48  −22
  pytorch_transformers/tests/tokenization_tests_commons.py   +3   −2
pytorch_transformers/tests/modeling_roberta_test.py  (+40 −3)

@@ -19,8 +19,9 @@ from __future__ import print_function
 import unittest
 import shutil
 import pytest
+import torch

-from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM)
+from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
 from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP

 from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)

@@ -156,6 +157,42 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
             return config, inputs_dict

+        def test_inference_masked_lm(self):
+            model = RobertaForMaskedLM.from_pretrained('roberta-base')
+
+            input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+            output = model(input_ids)[0]
+            expected_shape = torch.Size((1, 11, 50265))
+            self.assertEqual(output.shape, expected_shape)
+            # compare the actual values for a slice.
+            expected_slice = torch.Tensor(
+                [[[33.8843, -4.3107, 22.7779],
+                  [4.6533, -2.8099, 13.6252],
+                  [1.8222, -3.6898, 8.8600]]]
+            )
+            self.assertTrue(
+                torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+            )
+
+        # @pytest.mark.slow
+        def test_inference_no_head(self):
+            model = RobertaModel.from_pretrained('roberta-base')
+
+            input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+            output = model(input_ids)[0]
+            # compare the actual values for a slice.
+            expected_slice = torch.Tensor(
+                [[[-0.0231, 0.0782, 0.0074],
+                  [-0.1854, 0.0539, -0.0174],
+                  [0.0548, 0.0799, 0.1687]]]
+            )
+            self.assertTrue(
+                torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+            )
+
     def setUp(self):
         self.model_tester = RobertaModelTest.RobertaModelTester(self)
         self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)

@@ -183,7 +220,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):

 class RobertaModelIntegrationTest(unittest.TestCase):

-    @pytest.mark.slow
+    # @pytest.mark.slow
     def test_inference_masked_lm(self):
         model = RobertaForMaskedLM.from_pretrained('roberta-base')

@@ -204,7 +241,7 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )

-    @pytest.mark.slow
+    # @pytest.mark.slow
     def test_inference_no_head(self):
         model = RobertaModel.from_pretrained('roberta-base')
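Note: commenting out the @pytest.mark.slow decorators above makes the two integration checks run by default rather than only when slow tests are selected. For a standalone reproduction of the masked-LM check, here is a minimal sketch (assuming this revision of pytorch_transformers is installed and the roberta-base weights can be downloaded); the input ids are the same ones the tokenizer test below expects for 'Hello world! cécé herlolip':

    import torch
    from pytorch_transformers import RobertaForMaskedLM

    # Ids for "Hello world! cécé herlolip", as used in the test above.
    input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])

    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    model.eval()
    with torch.no_grad():
        logits = model(input_ids)[0]

    # The LM head projects onto the 50265-token RoBERTa vocabulary.
    assert logits.shape == torch.Size((1, 11, 50265))
    print(logits[:, :3, :3])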
pytorch_transformers/tests/tokenization_roberta_test.py  (+48 −22)

@@ -15,43 +15,69 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
+import json
 import unittest

-from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from pytorch_transformers.tokenization_roberta import RobertaTokenizer, DICT_FILES_NAMES
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
+from .tokenization_tests_commons import CommonTestCases


-class RobertaTokenizationTest(unittest.TestCase):
+class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):

+    tokenizer_class = RobertaTokenizer

-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+    def setUp(self):
+        super(RobertaTokenizationTest, self).setUp()
+
+        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "lo", "low", "er",
                  "low", "lowest", "newer", "wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        special_tokens_map = {"unk_token": "<unk>"}
+        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
+        self.special_tokens_map = {"unk_token": "<unk>"}
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        with open(self.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(self.merges_file, "w") as fp:
+            fp.write("\n".join(merges))

-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            with open(vocab_file, "w") as fp:
-                [fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)]
+    def get_tokenizer(self):
+        bpe_tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
+        return RobertaTokenizer.from_pretrained("roberta-base", bpe_tokenizer=bpe_tokenizer)

-            input_text = u"lower newer"
-            output_text = u"lower<unk>newer"
+    def get_input_output_texts(self):
+        input_text = u"lower newer"
+        output_text = u"lower<unk>newer"
+        return input_text, output_text

-            create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map)
-
-            tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map)
+    def test_full_tokenizer(self):
+        tokenizer = self.get_tokenizer()

-            text = "lower"
-            bpe_tokens = ["low", "er"]
-            tokens = tokenizer.tokenize(text)
-            self.assertListEqual(tokens, bpe_tokens)
+        text = "lower"
+        bpe_tokens = ["low", "er"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)

-            input_tokens = tokens + [tokenizer.unk_token]
-            input_bpe_tokens = [13, 12, 17]
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        input_tokens = tokens + [tokenizer.unk_token]
+        input_bpe_tokens = [0, 4, 12, 176, 2]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    def roberta_dict_integration_testing(self):
+        tokenizer = self.get_tokenizer()
+
+        self.assertListEqual(
+            tokenizer.encode('Hello world!'),
+            [0, 31414, 232, 328, 2]
+        )
+        self.assertListEqual(
+            tokenizer.encode('Hello world! cécé herlolip'),
+            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
+        )


 if __name__ == '__main__':
     unittest.main()
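Note: the tokenizer test now hooks into the shared CommonTestCases.CommonTokenizerTester instead of building everything inside a single test method; a concrete suite only supplies fixtures in setUp plus the get_tokenizer and get_input_output_texts hooks, and it inherits the common tests. A minimal, self-contained sketch of that pattern (toy classes for illustration, not the library's actual base class):

    import tempfile
    import unittest


    class CommonTestCasesSketch(object):
        """Toy stand-in for the library's CommonTestCases container: nesting the
        shared tester inside a plain class keeps unittest from collecting it
        directly, so only concrete subclasses are run."""

        class CommonTokenizerTester(unittest.TestCase):

            def setUp(self):
                self.tmpdirname = tempfile.mkdtemp()

            # Hooks that every concrete tokenizer suite overrides.
            def get_tokenizer(self):
                raise NotImplementedError

            def get_input_output_texts(self):
                raise NotImplementedError

            # A shared test, written once and inherited by every tokenizer suite.
            def test_tokenize_roundtrip(self):
                tokenizer = self.get_tokenizer()
                input_text, output_text = self.get_input_output_texts()
                tokens = tokenizer.tokenize(input_text)
                self.assertEqual(tokenizer.convert_tokens_to_string(tokens), output_text)


    class WhitespaceTokenizer(object):
        """Trivial tokenizer that keeps the sketch self-contained."""

        def tokenize(self, text):
            return text.split()

        def convert_tokens_to_string(self, tokens):
            return " ".join(tokens)


    class WhitespaceTokenizationTest(CommonTestCasesSketch.CommonTokenizerTester):

        def get_tokenizer(self):
            return WhitespaceTokenizer()

        def get_input_output_texts(self):
            return u"lower newer", u"lower newer"


    if __name__ == '__main__':
        unittest.main()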
pytorch_transformers/tests/tokenization_tests_commons.py  (+3 −2)

@@ -105,7 +105,7 @@ class CommonTestCases:
         self.assertEqual(added_toks, len(new_toks))
         self.assertEqual(all_size_2, all_size + len(new_toks))

-        tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+        tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l", no_sep_cls_tokens=True)
         self.assertGreaterEqual(len(tokens), 4)
         self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
         self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

@@ -121,7 +121,8 @@ class CommonTestCases:
         self.assertEqual(added_toks_2, len(new_toks_2))
         self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

-        tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+        tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
+                                  no_sep_cls_tokens=True)
         self.assertGreaterEqual(len(tokens), 6)
         self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
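Note: the only change in this file is the no_sep_cls_tokens=True keyword passed to encode. The surrounding assertions expect tokens[0] and tokens[-2] to be ids of newly added tokens (greater than vocab_size - 1), which only holds if no classifier/separator ids are wrapped around the sequence. A hypothetical sketch of what such a switch means for a BERT-style encoder (names and ids are illustrative assumptions, not the library's implementation):

    # Hypothetical encoder: shows the intent of a no_sep_cls_tokens-style flag.
    def encode(tokens, vocab, cls_id=101, sep_id=102, no_sep_cls_tokens=False):
        ids = [vocab[t] for t in tokens]
        if no_sep_cls_tokens:
            # Return the raw ids, so the first and last positions are whatever
            # the text produced (e.g. newly added tokens in the test above).
            return ids
        # Default: wrap the sequence in [CLS] ... [SEP].
        return [cls_id] + ids + [sep_id]

    vocab = {"aaaaabbbbbb": 30522, "low": 2659, "l": 30523}
    print(encode(["aaaaabbbbbb", "low", "l"], vocab))                          # [101, 30522, 2659, 30523, 102]
    print(encode(["aaaaabbbbbb", "low", "l"], vocab, no_sep_cls_tokens=True))  # [30522, 2659, 30523]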