ModelZoo / ResNet50_tensorflow · Commit 558bab5d

Authored Dec 09, 2019 by Chen Chen; committed by A. Unique TensorFlower, Dec 09, 2019
Parent: 9cae3c4f

Add sentence piece tokenizer in tokenization.py

PiperOrigin-RevId: 284624714
Showing 1 changed file with 131 additions and 1 deletion.

official/nlp/bert/tokenization.py  (+131, -1)
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
...
...
@@ -29,6 +30,10 @@ import unicodedata

import six
import tensorflow as tf

import sentencepiece as spm

SPIECE_UNDERLINE = u"▁".encode("utf-8")


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name."""
...
...
@@ -366,7 +371,7 @@ class WordpieceTokenizer(object):


def _is_whitespace(char):
  """Checks whether `chars` is a whitespace character."""
-  # \t, \n, and \r are technically contorl characters but we treat them
+  # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
...
...
@@ -402,3 +407,128 @@ def _is_punctuation(char):
  if cat.startswith("P"):
    return True
  return False


def preprocess_text(inputs, remove_space=True, lower=False):
  """Preprocesses data by removing extra spaces and normalizing it.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    inputs: The input text.
    remove_space: Whether to remove the extra space.
    lower: Whether to lowercase the text.

  Returns:
    The preprocessed text.
  """
  outputs = inputs
  if remove_space:
    outputs = " ".join(inputs.strip().split())

  if six.PY2 and isinstance(outputs, str):
    try:
      outputs = six.ensure_text(outputs, "utf-8")
    except UnicodeDecodeError:
      outputs = six.ensure_text(outputs, "latin-1")

  outputs = unicodedata.normalize("NFKD", outputs)
  outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
  if lower:
    outputs = outputs.lower()

  return outputs
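A minimal usage sketch for preprocess_text, not part of the commit itself. It assumes the TF Model Garden `official` package (which contains this file) is importable; the sample string and expected output are illustrative only:

# Illustrative only: assumes `official.nlp.bert.tokenization` is on PYTHONPATH.
from official.nlp.bert import tokenization

raw = "  Héllo,   Wörld!  "
# Collapses runs of whitespace, applies NFKD normalization, strips combining
# marks (accents), and optionally lowercases.
print(tokenization.preprocess_text(raw, remove_space=True, lower=True))
# Expected: "hello, world!"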
def encode_pieces(sp_model, text, sample=False):
  """Segments text into pieces.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segmented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token pieces.
  """
  if not sample:
    pieces = sp_model.EncodeAsPieces(text)
  else:
    pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
  new_pieces = []
  for piece in pieces:
    piece = printable_text(piece)
    if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
      cur_pieces = sp_model.EncodeAsPieces(
          six.ensure_binary(piece[:-1]).replace(SPIECE_UNDERLINE, b""))
      if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
        if len(cur_pieces[0]) == 1:
          cur_pieces = cur_pieces[1:]
        else:
          cur_pieces[0] = cur_pieces[0][1:]
      cur_pieces.append(piece[-1])
      new_pieces.extend(cur_pieces)
    else:
      new_pieces.append(piece)

  return new_pieces
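A sketch of calling encode_pieces directly, not part of the commit. It assumes a trained SentencePiece model at the hypothetical path "sp.model":

# Illustrative only: "sp.model" is a placeholder for a trained SentencePiece model file.
import sentencepiece as spm
from official.nlp.bert import tokenization

sp_model = spm.SentencePieceProcessor()
sp_model.Load("sp.model")

# Deterministic segmentation (the best-scoring piece sequence).
print(tokenization.encode_pieces(sp_model, "hello world", sample=False))

# Stochastic segmentation: samples from the 64-best candidates with
# smoothing parameter alpha=0.1, as used for subword regularization.
print(tokenization.encode_pieces(sp_model, "hello world", sample=True))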
def encode_ids(sp_model, text, sample=False):
  """Segments text and returns token ids.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segmented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token ids.
  """
  pieces = encode_pieces(sp_model, text, sample=sample)
  ids = [sp_model.PieceToId(piece) for piece in pieces]
  return ids
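For completeness, a matching sketch for encode_ids under the same assumptions (hypothetical "sp.model" file):

import sentencepiece as spm
from official.nlp.bert import tokenization

sp_model = spm.SentencePieceProcessor()
sp_model.Load("sp.model")  # hypothetical model file path

# encode_ids simply maps the pieces from encode_pieces to vocabulary ids.
ids = tokenization.encode_ids(sp_model, "hello world")
assert ids == [sp_model.PieceToId(p)
               for p in tokenization.encode_pieces(sp_model, "hello world")]
print(ids)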
class FullSentencePieceTokenizer(object):
  """Runs end-to-end sentence piece tokenization.

  The interface of this class is intended to be the same as that of the
  `FullTokenizer` class above for easier usage.
  """

  def __init__(self, sp_model_file):
    """Inits FullSentencePieceTokenizer.

    Args:
      sp_model_file: The path to the sentence piece model file.
    """
    self._sp_model = spm.SentencePieceProcessor()
    self._sp_model.Load(sp_model_file)
    self.vocab = {
        self._sp_model.IdToPiece(i): i
        for i in six.moves.range(self._sp_model.GetPieceSize())
    }

  def tokenize(self, text):
    """Tokenizes text into pieces."""
    return encode_pieces(self._sp_model, text)

  def convert_tokens_to_ids(self, tokens):
    """Converts a list of tokens to a list of ids."""
    return [self._sp_model.PieceToId(printable_text(token)) for token in tokens]

  def convert_ids_to_tokens(self, ids):
    """Converts a list of ids to a list of tokens."""
    return [self._sp_model.IdToPiece(id_) for id_ in ids]
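A usage sketch for the new class, again assuming a hypothetical "sp.model" file. As the docstring notes, the interface mirrors FullTokenizer:

from official.nlp.bert import tokenization

# "sp.model" is a placeholder for a trained SentencePiece model file.
tokenizer = tokenization.FullSentencePieceTokenizer("sp.model")

tokens = tokenizer.tokenize("hello world")        # list of sentence pieces
ids = tokenizer.convert_tokens_to_ids(tokens)     # vocabulary ids
# Round trip holds for in-vocabulary pieces.
assert tokenizer.convert_ids_to_tokens(ids) == tokens
print(tokens, ids)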