chenpangpang / transformers / Commits / a09da4ee

Commit a09da4ee, authored Nov 29, 2019 by Masatoshi Suzuki, committed by Julien Chaumond on Dec 11, 2019
Parent: 57b5cb3e

Add a test for Japanese BERT tokenizers

Changes: 1 changed file with 192 additions and 0 deletions

transformers/tests/tokenization_bert_japanese_test.py (new file, 0 → 100644, +192 / -0)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest

import pytest

from io import open

from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                     MecabTokenizer,
                                                     CharacterTokenizer,
                                                     VOCAB_FILES_NAMES)

from .tokenization_tests_commons import CommonTestCases

class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは",
                        u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。\nこんばんは、世界。"
        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text
    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こんにちは", u"、", u"世界", u"。",
                              u"こん", u"##ばんは", u"、", u"世界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
    def test_mecab_tokenizer(self):
        tokenizer = MecabTokenizer()

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone８ が  \n 発売された　。  "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_lower(self):
        tokenizer = MecabTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone８ が  \n 発売された　。  "),
            [u"アップルストア", u"で", u"iphone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_no_normalize(self):
        tokenizer = MecabTokenizer(normalize_text=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone８ が  \n 発売された　。  "),
            [u"アップルストア", u"で", u"iPhone", u"８", u"が",
             u"発売", u"さ", u"れ", u"た", u"　", u"。"])
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは"]
        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こんにちは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
                             [u"こん", u"##ばんは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
    @pytest.mark.slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]

class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseCharacterTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界", u"、", u"。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
                                                     subword_tokenizer_type="character",
                                                     **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。\nこんばんは、世界。"
        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text
    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file,
                                         subword_tokenizer_type="character")

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
                              u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
                              3, 4, 8, 4, 7, 11, 9, 10, 12])
    def test_character_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界", u"、", u"。"]
        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こ", u"ん", u"に", u"ち", u"は"])

        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
    @pytest.mark.slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]

if __name__ == '__main__':
    unittest.main()
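For context, the sketch below shows how the tokenizers exercised by these tests would typically be used outside the test suite. It is not part of the commit; it only reuses calls that already appear in the diff (from_pretrained, tokenize, encode, build_inputs_with_special_tokens) and assumes the pretrained "bert-base-japanese" vocabularies can be downloaded and that MeCab is installed.

# Usage sketch (not part of the commit); assumes network access to the
# "bert-base-japanese" vocab files and a working MeCab installation.
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

# Word-level tokenizer: MeCab segmentation followed by WordPiece,
# as covered by BertJapaneseTokenizationTest above.
tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
tokens = tokenizer.tokenize(u"こんにちは、世界。")
ids = tokenizer.encode(u"こんにちは、世界。", add_special_tokens=False)

# Wrap a single sequence in [CLS] ... [SEP], as asserted in test_sequence_builders.
inputs = tokenizer.build_inputs_with_special_tokens(ids)

# Character-level variant, matching BertJapaneseCharacterTokenizationTest.
char_tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese-char")
char_tokens = char_tokenizer.tokenize(u"こんにちは、世界。")

print(tokens, inputs, char_tokens)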