Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
28f26c10
Unverified
Commit
28f26c10
authored
Apr 15, 2023
by
bcol
Committed by
GitHub
Apr 15, 2023
Browse files
Generate: add CJK support to TextStreamer (#22664)
parent
fb3aa06c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
0 deletions
+28
-0
src/transformers/generation/streamers.py
src/transformers/generation/streamers.py
+28
-0
No files found.
src/transformers/generation/streamers.py
View file @
28f26c10
...
...
@@ -101,6 +101,10 @@ class TextStreamer(BaseStreamer):
printable_text
=
text
[
self
.
print_len
:]
self
.
token_cache
=
[]
self
.
print_len
=
0
# If the last token is a CJK character, we print the characters.
elif
len
(
text
)
>
0
and
self
.
_is_chinese_char
(
ord
(
text
[
-
1
])):
printable_text
=
text
[
self
.
print_len
:]
self
.
print_len
+=
len
(
printable_text
)
# Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
# which may change with the subsequent token -- there are probably smarter ways to do this!)
else
:
...
...
@@ -127,6 +131,30 @@ class TextStreamer(BaseStreamer):
"""Prints the new text to stdout. If the stream is ending, also prints a newline."""
print
(
text
,
flush
=
True
,
end
=
""
if
not
stream_end
else
None
)
def
_is_chinese_char
(
self
,
cp
):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if
(
(
cp
>=
0x4E00
and
cp
<=
0x9FFF
)
or
(
cp
>=
0x3400
and
cp
<=
0x4DBF
)
#
or
(
cp
>=
0x20000
and
cp
<=
0x2A6DF
)
#
or
(
cp
>=
0x2A700
and
cp
<=
0x2B73F
)
#
or
(
cp
>=
0x2B740
and
cp
<=
0x2B81F
)
#
or
(
cp
>=
0x2B820
and
cp
<=
0x2CEAF
)
#
or
(
cp
>=
0xF900
and
cp
<=
0xFAFF
)
or
(
cp
>=
0x2F800
and
cp
<=
0x2FA1F
)
#
):
#
return
True
return
False
class
TextIteratorStreamer
(
TextStreamer
):
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment