ModelZoo / ResNet50_tensorflow

Commit e0eaa1ed, authored Mar 07, 2020 by Sergey Mironov
Parent: 7708847e

Update tokenizer: unhardcode alphanumeric char set
Showing 1 changed file with 33 additions and 16 deletions.

official/nlp/transformer/utils/tokenizer.py (+33, -16)
@@ -45,12 +45,15 @@ _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")

 _UNDEFINED_UNICODE = u"\u3013"

-# Set contains all letter and number characters.
-_ALPHANUMERIC_CHAR_SET = set(
-    six.unichr(i) for i in xrange(sys.maxunicode)
-    if (unicodedata.category(six.unichr(i)).startswith("L") or
-        unicodedata.category(six.unichr(i)).startswith("N")))
+def alphanumeric_char_set():
+  return set(
+      six.unichr(i) for i in xrange(sys.maxunicode)
+      if (unicodedata.category(six.unichr(i)).startswith("L") or
+          unicodedata.category(six.unichr(i)).startswith("N")))
+
+
+# Set contains all letter and number characters.
+_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()

 # min_count is the minimum number of times a subtoken must appear in the data
 # before before it is added to the vocabulary. The value is found using binary
 # search to obtain the target vocabulary size.
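For reference, a minimal sketch of the set this helper computes, written with Python 3 built-ins (the module itself goes through six.unichr and xrange for Python 2 compatibility). The letters_only variant is a hypothetical example of a caller-defined character set, not part of this commit:

    import sys
    import unicodedata

    def alphanumeric_char_set():
        # Every code point whose Unicode general category is a letter
        # ("L*") or a number ("N*").
        return set(
            chr(i) for i in range(sys.maxunicode)
            if unicodedata.category(chr(i)).startswith(("L", "N")))

    # Hypothetical caller-defined alternative: letters only, no digits.
    letters_only = set(
        chr(i) for i in range(sys.maxunicode)
        if unicodedata.category(chr(i)).startswith("L"))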
@@ -61,11 +64,15 @@ _MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count

 class Subtokenizer(object):
   """Encodes and decodes strings to/from integer IDs."""

-  def __init__(self, vocab_file, reserved_tokens=None):
+  def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
     """Initializes class, creating a vocab file if data_files is provided."""
     tf.compat.v1.logging.info("Initializing Subtokenizer from file %s." %
                               vocab_file)
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
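A hedged usage sketch of the new keyword: master_char_set defaults to the module-level alphanumeric set, so existing callers are unaffected, but a caller can now supply its own. The vocab path and the ascii_alnum set below are hypothetical, not from this commit:

    from official.nlp.transformer.utils import tokenizer

    # Hypothetical custom set: ASCII letters and digits only.
    ascii_alnum = set(chr(i) for i in range(128) if chr(i).isalnum())

    # Behaves exactly as before (falls back to _ALPHANUMERIC_CHAR_SET):
    default_tok = tokenizer.Subtokenizer("vocab.subtokens")

    # Splits and joins tokens using the caller-supplied set instead:
    custom_tok = tokenizer.Subtokenizer("vocab.subtokens",
                                        master_char_set=ascii_alnum)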
@@ -80,11 +87,13 @@ class Subtokenizer(object):

     # Create cache to speed up subtokenization
     self._cache_size = 2**20
     self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set

   @staticmethod
   def init_from_files(
       vocab_file, files, target_vocab_size, threshold, min_count=None,
-      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True):
+      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True,
+      master_char_set=None):
     """Create subtoken vocabulary based on files, and save vocab to file.

     Args:
@@ -105,6 +114,8 @@ class Subtokenizer(object):

     Returns:
       Subtokenizer object
     """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
@@ -112,7 +123,7 @@ class Subtokenizer(object):

       tf.compat.v1.logging.info("Vocab file already exists (%s)" % vocab_file)
     else:
       tf.compat.v1.logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip)
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip, master_char_set)
       alphabet = _generate_alphabet_dict(token_counts)
       subtoken_list = _generate_subtokens_with_target_vocab_size(
           token_counts, alphabet, target_vocab_size, threshold, min_count,
@@ -120,12 +131,12 @@ class Subtokenizer(object):

     tf.compat.v1.logging.info("Generated vocabulary with %d subtokens." %
                               len(subtoken_list))
     _save_vocab_file(vocab_file, subtoken_list)
-    return Subtokenizer(vocab_file)
+    return Subtokenizer(vocab_file, master_char_set=master_char_set)

   def encode(self, raw_string, add_eos=False):
     """Encodes a string into a list of int subtoken ids."""
     ret = []
-    tokens = _split_string_to_tokens(native_to_unicode(raw_string))
+    tokens = _split_string_to_tokens(native_to_unicode(raw_string), self._master_char_set)
     for token in tokens:
       ret.extend(self._token_to_subtoken_ids(token))
     if add_eos:
@@ -161,7 +172,8 @@ class Subtokenizer(object):
...
@@ -161,7 +172,8 @@ class Subtokenizer(object):
"Subtokens argument passed into decode() must be a list of integers."
)
"Subtokens argument passed into decode() must be a list of integers."
)
return
_unicode_to_native
(
return
_unicode_to_native
(
_join_tokens_to_string
(
self
.
_subtoken_ids_to_tokens
(
subtokens
)))
_join_tokens_to_string
(
self
.
_subtoken_ids_to_tokens
(
subtokens
),
self
.
_master_char_set
))
def
_subtoken_ids_to_tokens
(
self
,
subtokens
):
def
_subtoken_ids_to_tokens
(
self
,
subtokens
):
"""Convert list of int subtoken ids to a list of string tokens."""
"""Convert list of int subtoken ids to a list of string tokens."""
...
@@ -218,16 +230,16 @@ def _unicode_to_native(s):
...
@@ -218,16 +230,16 @@ def _unicode_to_native(s):
return
s
return
s
def
_split_string_to_tokens
(
text
):
def
_split_string_to_tokens
(
text
,
master_char_set
):
"""Splits text to a list of string tokens."""
"""Splits text to a list of string tokens."""
if
not
text
:
if
not
text
:
return
[]
return
[]
ret
=
[]
ret
=
[]
token_start
=
0
token_start
=
0
# Classify each character in the input string
# Classify each character in the input string
is_
alnum
=
[
c
in
_ALPHANUMERIC_CHAR_SET
for
c
in
text
]
is_
master
=
[
c
in
master_char_set
for
c
in
text
]
for
pos
in
xrange
(
1
,
len
(
text
)):
for
pos
in
xrange
(
1
,
len
(
text
)):
if
is_
alnum
[
pos
]
!=
is_
alnum
[
pos
-
1
]:
if
is_
master
[
pos
]
!=
is_
master
[
pos
-
1
]:
token
=
text
[
token_start
:
pos
]
token
=
text
[
token_start
:
pos
]
if
token
!=
u
" "
or
token_start
==
0
:
if
token
!=
u
" "
or
token_start
==
0
:
ret
.
append
(
token
)
ret
.
append
(
token
)
...
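To make the splitting rule concrete: a token boundary falls wherever membership in master_char_set flips between adjacent characters, and a lone space at a boundary is dropped. A self-contained Python 3 sketch that mirrors this logic (split_on_membership is an illustrative name, not a function in the module):

    def split_on_membership(text, master_char_set):
        # A boundary occurs wherever membership in master_char_set
        # flips between adjacent characters.
        if not text:
            return []
        ret = []
        token_start = 0
        is_master = [c in master_char_set for c in text]
        for pos in range(1, len(text)):
            if is_master[pos] != is_master[pos - 1]:
                token = text[token_start:pos]
                # Drop single spaces between tokens; they are implied.
                if token != " " or token_start == 0:
                    ret.append(token)
                token_start = pos
        ret.append(text[token_start:])  # final token
        return ret

    letters = set("abcdefghijklmnopqrstuvwxyz")
    print(split_on_membership("foo bar, baz", letters))
    # -> ['foo', 'bar', ', ', 'baz']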
@@ -237,12 +249,12 @@ def _split_string_to_tokens(text):
...
@@ -237,12 +249,12 @@ def _split_string_to_tokens(text):
return
ret
return
ret
def
_join_tokens_to_string
(
tokens
):
def
_join_tokens_to_string
(
tokens
,
master_char_set
):
"""Join a list of string tokens into a single string."""
"""Join a list of string tokens into a single string."""
token_is_
alnum
=
[
t
[
0
]
in
_ALPHANUMERIC_CHAR_SET
for
t
in
tokens
]
token_is_
master
=
[
t
[
0
]
in
master_char_set
for
t
in
tokens
]
ret
=
[]
ret
=
[]
for
i
,
token
in
enumerate
(
tokens
):
for
i
,
token
in
enumerate
(
tokens
):
if
i
>
0
and
token_is_
alnum
[
i
-
1
]
and
token_is_
alnum
[
i
]:
if
i
>
0
and
token_is_
master
[
i
-
1
]
and
token_is_
master
[
i
]:
ret
.
append
(
u
" "
)
ret
.
append
(
u
" "
)
ret
.
append
(
token
)
ret
.
append
(
token
)
return
""
.
join
(
ret
)
return
""
.
join
(
ret
)
...
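The inverse operation re-inserts a space only between two consecutive tokens that both start with a master-set character, which is what makes the split lossless. A matching sketch (again with an illustrative name):

    def join_on_membership(tokens, master_char_set):
        # Re-insert the single space that splitting dropped between
        # two adjacent master-set tokens.
        token_is_master = [t[0] in master_char_set for t in tokens]
        ret = []
        for i, token in enumerate(tokens):
            if i > 0 and token_is_master[i - 1] and token_is_master[i]:
                ret.append(" ")
            ret.append(token)
        return "".join(ret)

    letters = set("abcdefghijklmnopqrstuvwxyz")
    print(join_on_membership(["foo", "bar", ", ", "baz"], letters))
    # -> "foo bar, baz"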
@@ -324,7 +336,8 @@ def _unescape_token(token):
...
@@ -324,7 +336,8 @@ def _unescape_token(token):
return
_UNESCAPE_REGEX
.
sub
(
match
,
token
)
return
_UNESCAPE_REGEX
.
sub
(
match
,
token
)
def
_count_tokens
(
files
,
file_byte_limit
=
1e6
,
correct_strip
=
True
):
def
_count_tokens
(
files
,
file_byte_limit
=
1e6
,
correct_strip
=
True
,
master_char_set
=
None
):
"""Return token counts of words in the files.
"""Return token counts of words in the files.
Samples file_byte_limit bytes from each file, and counts the words that appear
Samples file_byte_limit bytes from each file, and counts the words that appear
...
@@ -342,6 +355,9 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
...
@@ -342,6 +355,9 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
Dictionary mapping tokens to the number of times they appear in the sampled
Dictionary mapping tokens to the number of times they appear in the sampled
lines from the files.
lines from the files.
"""
"""
if
master_char_set
is
None
:
master_char_set
=
_ALPHANUMERIC_CHAR_SET
token_counts
=
collections
.
defaultdict
(
int
)
token_counts
=
collections
.
defaultdict
(
int
)
for
filepath
in
files
:
for
filepath
in
files
:
...
@@ -362,7 +378,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
...
@@ -362,7 +378,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
counter
=
0
counter
=
0
# Add words to token counts
# Add words to token counts
for
token
in
_split_string_to_tokens
(
native_to_unicode
(
line
)):
for
token
in
_split_string_to_tokens
(
native_to_unicode
(
line
),
master_char_set
):
token_counts
[
token
]
+=
1
token_counts
[
token
]
+=
1
return
token_counts
return
token_counts
...
...
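Putting the whole change together, a hedged end-to-end sketch of how master_char_set now threads from vocabulary creation through encode and decode; the file names and sizes below are made up for illustration:

    from official.nlp.transformer.utils import tokenizer

    # Hypothetical custom master set: ASCII letters and digits only.
    ascii_alnum = set(chr(i) for i in range(128) if chr(i).isalnum())

    # _count_tokens, _split_string_to_tokens, and the returned
    # Subtokenizer all receive the same set, so token boundaries are
    # consistent end to end.
    subtok = tokenizer.Subtokenizer.init_from_files(
        "vocab.subtokens", ["train.txt"],
        target_vocab_size=32768, threshold=327,
        master_char_set=ascii_alnum)
    ids = subtok.encode("Hello world", add_eos=True)
    print(subtok.decode(ids))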