Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
f2a337b3
Commit
f2a337b3
authored
Sep 26, 2019
by
thomwolf
Browse files
fix tokenization tests for gpt2 roberta
parent
7a99e4b1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
4 deletions
+4
-4
pytorch_transformers/tests/tokenization_gpt2_test.py
pytorch_transformers/tests/tokenization_gpt2_test.py
+2
-2
pytorch_transformers/tests/tokenization_roberta_test.py
pytorch_transformers/tests/tokenization_roberta_test.py
+2
-2
No files found.
pytorch_transformers/tests/tokenization_gpt2_test.py
View file @
f2a337b3
...
@@ -52,14 +52,14 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -52,14 +52,14 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
def
get_input_output_texts
(
self
):
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
input_text
=
u
"lower newer"
output_text
=
u
"
lower newer"
output_text
=
u
"lower newer"
return
input_text
,
output_text
return
input_text
,
output_text
def
test_full_tokenizer
(
self
):
def
test_full_tokenizer
(
self
):
tokenizer
=
GPT2Tokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
tokenizer
=
GPT2Tokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
text
=
"lower newer"
text
=
"lower newer"
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
tokens
=
tokenizer
.
tokenize
(
text
)
tokens
=
tokenizer
.
tokenize
(
text
,
add_prefix_space
=
True
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
...
...
pytorch_transformers/tests/tokenization_roberta_test.py
View file @
f2a337b3
...
@@ -51,14 +51,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -51,14 +51,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
get_input_output_texts
(
self
):
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
input_text
=
u
"lower newer"
output_text
=
u
"
lower newer"
output_text
=
u
"lower newer"
return
input_text
,
output_text
return
input_text
,
output_text
def
test_full_tokenizer
(
self
):
def
test_full_tokenizer
(
self
):
tokenizer
=
RobertaTokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
tokenizer
=
RobertaTokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
text
=
"lower newer"
text
=
"lower newer"
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
tokens
=
tokenizer
.
tokenize
(
text
)
tokens
=
tokenizer
.
tokenize
(
text
,
add_prefix_space
=
True
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment