chenpangpang / transformers · Commit e12bbe3b
"examples/vscode:/vscode.git/clone" did not exist on "4eec5d0cf67116e98770c305640b5710571da4f6"
Unverified commit e12bbe3b, authored Oct 05, 2022 by Shyam Sudhakaran; committed by GitHub on Oct 05, 2022.
Remove bert interdependency from clip tokenizer (#19332)
Parent: 512fa41c
Showing 1 changed file with 162 additions and 2 deletions.
src/transformers/models/clip/tokenization_clip.py
```diff
@@ -16,13 +16,13 @@
 import json
 import os
 import unicodedata
 from functools import lru_cache
 from typing import List, Optional, Tuple

 import regex as re

-from transformers.models.bert.tokenization_bert import BasicTokenizer
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...utils import logging
@@ -97,6 +97,166 @@ def whitespace_clean(text):
     return text


+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
```
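As a quick illustration (not part of the diff), the helper simply defers to Python's whitespace semantics:

```python
# str.split() with no arguments collapses runs of whitespace,
# so "cleaning and splitting" reduces to strip() + split().
assert whitespace_tokenize("  hello \t world\n") == ["hello", "world"]
assert whitespace_tokenize("   ") == []
```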
```diff
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer(object):
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+    """
+
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
+        WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
```
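For illustration, a usage sketch (mine, not part of the commit) showing the combined effect of lowercasing, accent stripping, punctuation splitting, and `never_split`:

```python
tokenizer = BasicTokenizer()  # do_lower_case=True, strip_accents=None by default

# Lowercased, accents stripped (strip_accents is not False), punctuation split off.
print(tokenizer.tokenize("Héllo, World!"))  # ['hello', ',', 'world', '!']

# Tokens listed in never_split pass through unchanged.
tokenizer = BasicTokenizer(never_split=["[CLS]"])
print(tokenizer.tokenize("[CLS] Héllo"))  # ['[CLS]', 'hello']
```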
```diff
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
```
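A small sketch (my example, not from the commit) of why NFD normalization enables accent stripping: accented letters decompose into a base letter plus combining marks of Unicode category `Mn`, which the loop skips:

```python
import unicodedata

decomposed = unicodedata.normalize("NFD", "café")
# The trailing combining acute accent has category "Mn" and is dropped.
print([unicodedata.category(c) for c in decomposed])  # ['Ll', 'Ll', 'Ll', 'Ll', 'Mn']
print(BasicTokenizer()._run_strip_accents("café"))  # cafe
```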
```diff
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if never_split is not None and text in never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
```
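Illustrative behavior (not from the commit): every punctuation character becomes its own single-character token and forces the next character to start a fresh token:

```python
tokenizer = BasicTokenizer()
print(tokenizer._run_split_on_punc("hello,world!"))  # ['hello', ',', 'world', '!']
print(tokenizer._run_split_on_punc("can't"))  # ['can', "'", 't']
```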
```diff
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
+            return True
+
+        return False
```
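For illustration (my example): CJK ideographs are written without spaces, so padding each one means the later whitespace split yields one token per character, while surrounding Latin text is left alone:

```python
tokenizer = BasicTokenizer()
print(tokenizer._tokenize_chinese_chars("ab你好cd"))  # 'ab 你  好 cd'
print(tokenizer.tokenize("ab你好cd"))  # ['ab', '你', '好', 'cd']
```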
```diff
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
```
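A quick sketch of the cleanup (mine, not from the diff): NUL bytes, U+FFFD, and control characters are removed, and any whitespace character is normalized to a single space:

```python
tokenizer = BasicTokenizer()
# "\x00" and the control character "\x01" are dropped; "\t" becomes " ".
print(repr(tokenizer._clean_text("a\x00b\x01c\td")))  # 'abc d'
```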
```diff
 class CLIPTokenizer(PreTrainedTokenizer):
     """
     Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
...
```
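With the copies above, `tokenization_clip.py` no longer imports anything from `transformers.models.bert`. A hedged end-to-end sketch (the checkpoint name is OpenAI's public CLIP checkpoint on the Hub, not something this diff touches):

```python
from transformers import CLIPTokenizer

# CLIP's slow tokenizer can now be loaded without pulling in the BERT module.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
print(tokenizer.tokenize("a photo of a cat"))
```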