Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
28f26c10
"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "c9035e453799adf897a000953d4b83ff764263cc"
Unverified
Commit
28f26c10
authored
Apr 15, 2023
by
bcol
Committed by
GitHub
Apr 15, 2023
Browse files
Generate: add CJK support to TextStreamer (#22664)
parent
fb3aa06c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
0 deletions
+28
-0
src/transformers/generation/streamers.py
src/transformers/generation/streamers.py
+28
-0
No files found.
src/transformers/generation/streamers.py
View file @
28f26c10
...
@@ -101,6 +101,10 @@ class TextStreamer(BaseStreamer):
...
@@ -101,6 +101,10 @@ class TextStreamer(BaseStreamer):
printable_text
=
text
[
self
.
print_len
:]
printable_text
=
text
[
self
.
print_len
:]
self
.
token_cache
=
[]
self
.
token_cache
=
[]
self
.
print_len
=
0
self
.
print_len
=
0
# If the last token is a CJK character, we print the characters.
elif
len
(
text
)
>
0
and
self
.
_is_chinese_char
(
ord
(
text
[
-
1
])):
printable_text
=
text
[
self
.
print_len
:]
self
.
print_len
+=
len
(
printable_text
)
# Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
# Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
# which may change with the subsequent token -- there are probably smarter ways to do this!)
# which may change with the subsequent token -- there are probably smarter ways to do this!)
else
:
else
:
...
@@ -127,6 +131,30 @@ class TextStreamer(BaseStreamer):
...
@@ -127,6 +131,30 @@ class TextStreamer(BaseStreamer):
"""Prints the new text to stdout. If the stream is ending, also prints a newline."""
"""Prints the new text to stdout. If the stream is ending, also prints a newline."""
print
(
text
,
flush
=
True
,
end
=
""
if
not
stream_end
else
None
)
print
(
text
,
flush
=
True
,
end
=
""
if
not
stream_end
else
None
)
def
_is_chinese_char
(
self
,
cp
):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if
(
(
cp
>=
0x4E00
and
cp
<=
0x9FFF
)
or
(
cp
>=
0x3400
and
cp
<=
0x4DBF
)
#
or
(
cp
>=
0x20000
and
cp
<=
0x2A6DF
)
#
or
(
cp
>=
0x2A700
and
cp
<=
0x2B73F
)
#
or
(
cp
>=
0x2B740
and
cp
<=
0x2B81F
)
#
or
(
cp
>=
0x2B820
and
cp
<=
0x2CEAF
)
#
or
(
cp
>=
0xF900
and
cp
<=
0xFAFF
)
or
(
cp
>=
0x2F800
and
cp
<=
0x2FA1F
)
#
):
#
return
True
return
False
class
TextIteratorStreamer
(
TextStreamer
):
class
TextIteratorStreamer
(
TextStreamer
):
"""
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment