Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
f2a337b3
Commit
f2a337b3
authored
Sep 26, 2019
by
thomwolf
Browse files
fix tokenization tests for gpt2 roberta
parent
7a99e4b1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
4 deletions
+4
-4
pytorch_transformers/tests/tokenization_gpt2_test.py
pytorch_transformers/tests/tokenization_gpt2_test.py
+2
-2
pytorch_transformers/tests/tokenization_roberta_test.py
pytorch_transformers/tests/tokenization_roberta_test.py
+2
-2
No files found.
pytorch_transformers/tests/tokenization_gpt2_test.py
View file @
f2a337b3
...
@@ -52,14 +52,14 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -52,14 +52,14 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
def
get_input_output_texts
(
self
):
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
input_text
=
u
"lower newer"
output_text
=
u
"
lower newer"
output_text
=
u
"lower newer"
return
input_text
,
output_text
return
input_text
,
output_text
def
test_full_tokenizer
(
self
):
def
test_full_tokenizer
(
self
):
tokenizer
=
GPT2Tokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
tokenizer
=
GPT2Tokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
text
=
"lower newer"
text
=
"lower newer"
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
tokens
=
tokenizer
.
tokenize
(
text
)
tokens
=
tokenizer
.
tokenize
(
text
,
add_prefix_space
=
True
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
...
...
pytorch_transformers/tests/tokenization_roberta_test.py
View file @
f2a337b3
...
@@ -51,14 +51,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -51,14 +51,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
get_input_output_texts
(
self
):
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
input_text
=
u
"lower newer"
output_text
=
u
"
lower newer"
output_text
=
u
"lower newer"
return
input_text
,
output_text
return
input_text
,
output_text
def
test_full_tokenizer
(
self
):
def
test_full_tokenizer
(
self
):
tokenizer
=
RobertaTokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
tokenizer
=
RobertaTokenizer
(
self
.
vocab_file
,
self
.
merges_file
,
**
self
.
special_tokens_map
)
text
=
"lower newer"
text
=
"lower newer"
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
bpe_tokens
=
[
"
\u0120
low"
,
"er"
,
"
\u0120
"
,
"n"
,
"e"
,
"w"
,
"er"
]
tokens
=
tokenizer
.
tokenize
(
text
)
tokens
=
tokenizer
.
tokenize
(
text
,
add_prefix_space
=
True
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
self
.
assertListEqual
(
tokens
,
bpe_tokens
)
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
input_tokens
=
tokens
+
[
tokenizer
.
unk_token
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment