Unverified Commit eb3b4dc9 authored by AllentDan's avatar AllentDan Committed by GitHub
Browse files

Avoid splitting Chinese characters during decoding (#566)

parent 9c3634ec
......@@ -156,6 +156,11 @@ class AsyncEngine:
# decode res
response = self.tokenizer.decode(res.tolist(),
offset=response_size)
# a trailing U+FFFD replacement char ('�') means the output may end
# in an unfinished utf-8 byte sequence; skip it for now so it is
# concatenated with the next sequence and decoded together
if response.endswith('�'):
continue
# response, history token len,
# input token len, gen token len
yield GenOut(response, self.steps[str(session_id)],
......@@ -249,6 +254,11 @@ class AsyncEngine:
# decode res
response = self.tokenizer.decode(res.tolist(),
offset=response_size)
# a trailing U+FFFD replacement char ('�') means the output may end
# in an unfinished utf-8 byte sequence; skip it for now so it is
# concatenated with the next sequence and decoded together
if response.endswith('�'):
continue
# response, history len, input len, generation len
yield GenOut(response, self.steps[str(session_id)],
len(input_ids), tokens, finish_reason)
......
......@@ -657,8 +657,13 @@ class Chatbot:
continue
output_str = postprocess(
output_ids, np.array([[n_token]], dtype=np.uint32))
n_token = output_ids.shape[-1]
text = output_str[0].decode()
# a trailing U+FFFD replacement char ('�') means the output may end
# in an unfinished utf-8 byte sequence; skip it for now so it is
# concatenated with the next sequence and decoded together
if text.endswith('�'):
continue
n_token = output_ids.shape[-1]
if display:
print(text, end='', flush=True)
session.response += text
......
......@@ -145,6 +145,11 @@ def main(model_path,
res, tokens = outputs[0]
# decode res
response = tokenizer.decode(res.tolist(), offset=response_size)
# a trailing U+FFFD replacement char ('�') means the output may end
# in an unfinished utf-8 byte sequence; skip it for now so it is
# concatenated with the next sequence and decoded together
if response.endswith('�'):
continue
response = valid_str(response)
print(f'{response}', end='', flush=True)
response_size = tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment