chenpangpang/transformers · Commits

Commit 48930a4c (unverified)
Authored Nov 10, 2018 by Thomas Wolf; committed via GitHub Nov 10, 2018
Parents: a81a1ef8, 4d124baf

Merge pull request #2 from elyase/patch-1

Port tokenization for the multilingual model
Showing 2 changed files with 49 additions and 0 deletions:

    tests/tokenization_test.py   +7  -0
    tokenization.py              +42 -0
tests/tokenization_test.py

@@ -43,6 +43,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    def test_chinese(self):
+        tokenizer = tokenization.BasicTokenizer()
+
+        self.assertListEqual(
+            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
+            [u"ah", u"\u535A", u"\u63A8", u"zz"])
+
     def test_basic_tokenizer_lower(self):
         tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
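The new test_chinese case pins down the intended behavior: after this commit, a bare BasicTokenizer splits each CJK ideograph into its own token while leaving Latin text to the usual whitespace and punctuation handling. A minimal sketch of exercising it, assuming tokenization.py is importable from the repository root:

    import tokenization  # assumes the repo root is on sys.path

    tokenizer = tokenization.BasicTokenizer()
    # u"ah\u535A\u63A8zz" is "ah", two Chinese characters, then "zz", no spaces.
    print(tokenizer.tokenize(u"ah\u535A\u63A8zz"))
    # Expected, per the new test: [u'ah', u'\u535a', u'\u63a8', u'zz']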
tokenization.py

@@ -133,6 +133,13 @@ class BasicTokenizer(object):
         """Tokenizes a piece of text."""
         text = convert_to_unicode(text)
         text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. It is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia).
+        text = self._tokenize_chinese_chars(text)
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
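The placement of the new call matters: _tokenize_chinese_chars runs after _clean_text but before whitespace_tokenize, so the spaces it inserts are exactly what lets the whitespace split produce one token per ideograph. A self-contained sketch of that ordering, using simplified stand-ins for the helpers (the real ones live in tokenization.py; this version pads only the basic CJK block U+4E00..U+9FFF for brevity):

    def tokenize_chinese_chars(text):
        # Simplified stand-in: pads only U+4E00..U+9FFF; the diff covers more blocks.
        return "".join(" %s " % ch if 0x4E00 <= ord(ch) <= 0x9FFF else ch
                       for ch in text)

    def whitespace_tokenize(text):
        # Same contract as the helper in tokenization.py: split on whitespace runs.
        return text.split()

    print(whitespace_tokenize(tokenize_chinese_chars(u"ah\u535A\u63A8zz")))
    # ['ah', '\u535a', '\u63a8', 'zz'];
    # without the padding step this would stay glued as ['ah\u535a\u63a8zz'].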
@@ -174,7 +181,42 @@ class BasicTokenizer(object):
             i += 1
 
         return ["".join(x) for x in output]
 
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean
+        # characters, despite its name. The modern Korean Hangul alphabet is a
+        # different block, as are Japanese Hiragana and Katakana. Those
+        # alphabets are used to write space-separated words, so they are not
+        # treated specially and are handled like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
+                (cp >= 0x3400 and cp <= 0x4DBF) or
+                (cp >= 0x20000 and cp <= 0x2A6DF) or
+                (cp >= 0x2A700 and cp <= 0x2B73F) or
+                (cp >= 0x2B740 and cp <= 0x2B81F) or
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):
+            return True
+
+        return False
+
     def _clean_text(self, text):
         """Performs invalid character removal and whitespace cleanup on text."""
         output = []
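For trying the full range check outside the repository, here is a minimal, self-contained restatement of the two new helpers. The names and the range table mirror the diff; the module-level layout (a standalone range list instead of methods on BasicTokenizer) is an assumption made for this sketch.

    # All eight CJK ranges from _is_chinese_char, as (low, high) codepoint pairs.
    CJK_RANGES = [
        (0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F), (0x2B740, 0x2B81F), (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF), (0x2F800, 0x2FA1F),
    ]

    def is_chinese_char(cp):
        """True if the codepoint falls in any CJK ideograph block listed above."""
        return any(low <= cp <= high for low, high in CJK_RANGES)

    def tokenize_chinese_chars(text):
        """Surround every CJK ideograph with spaces, as the new method does."""
        return "".join(" %s " % ch if is_chinese_char(ord(ch)) else ch
                       for ch in text)

    print(tokenize_chinese_chars(u"ah\u535A\u63A8zz").split())
    # ['ah', '\u535a', '\u63a8', 'zz'] -- matches the assertion in test_chinese

Note that Hangul and the Japanese kana fall outside these ranges by design, so Korean and Japanese syllabaries still go through the ordinary whitespace path, as the in-code comment explains.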