chenpangpang / transformers / Commits / e12bbe3b

Unverified commit e12bbe3b, authored Oct 05, 2022 by Shyam Sudhakaran, committed by GitHub on Oct 05, 2022

Remove bert interdependency from clip tokenizer (#19332)

parent 512fa41c
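Per the commit message, the intent is that loading the CLIP tokenizer no longer pulls in the BERT tokenizer module. A rough way to observe the effect (a sketch, not part of the commit; exact behavior depends on how transformers lazy-loads its submodules):

import sys

# Importing the CLIP tokenizer module directly.
from transformers.models.clip import tokenization_clip  # noqa: F401

# Before this commit, the import above loaded transformers.models.bert.tokenization_bert
# transitively; after it, the BERT tokenizer module should no longer show up here.
print("transformers.models.bert.tokenization_bert" in sys.modules)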
Showing 1 changed file with 162 additions and 2 deletions.

diff: src/transformers/models/clip/tokenization_clip.py (+162, -2)
@@ -16,13 +16,13 @@
 import json
 import os
 import unicodedata
 from functools import lru_cache
 from typing import List, Optional, Tuple

 import regex as re

-from transformers.models.bert.tokenization_bert import BasicTokenizer
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...utils import logging
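For context, the three helpers added to this import (`_is_control`, `_is_punctuation`, `_is_whitespace`) are the character-class predicates that `BasicTokenizer` relies on; they now come from the shared `...tokenization_utils` module rather than through BERT. A minimal check of what they do (a sketch, not part of the commit):

from transformers.tokenization_utils import _is_control, _is_punctuation, _is_whitespace

# Unicode-category based checks used below by BasicTokenizer._run_split_on_punc
# and BasicTokenizer._clean_text.
print(_is_punctuation(","))   # True: comma falls in a punctuation range/category
print(_is_whitespace("\t"))   # True: tabs are treated as whitespace
print(_is_control("a"))       # False: ordinary letters are not control characters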
@@ -97,6 +97,166 @@ def whitespace_clean(text):
     return text


+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer(object):
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+    """
+
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
+        WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if never_split is not None and text in never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
 class CLIPTokenizer(PreTrainedTokenizer):
     """
     Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
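Since the class is now defined locally (note the `# Copied from` markers), it can be exercised straight from the CLIP tokenizer module. A small usage sketch of the vendored `BasicTokenizer` (not part of the commit):

from transformers.models.clip.tokenization_clip import BasicTokenizer

basic = BasicTokenizer(do_lower_case=True)

# Lowercases, strips accents (NFD normalization, combining marks dropped),
# and splits punctuation into standalone tokens, exactly as the BERT copy above does.
print(basic.tokenize("Héllo, WORLD!"))
# ['hello', ',', 'world', '!']

The tokenizer's behavior is unchanged by this commit; the class is simply copied in so that `tokenization_clip.py` no longer imports from the BERT model directory.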