Unverified Commit b94f3021 authored by Nicolas Patry, committed by GitHub

fix(server): Use cleanup_tokenization_spaces=False for lossless decoding (#13)

Fixes #12 in the easiest way I could think of.
parent 60472f9d
@@ -354,7 +354,8 @@ class CausalLM(Model):
             if stop:
                 # Decode all tokens
                 output_text = self.tokenizer.decode(
-                    all_input_ids.squeeze(-1), skip_special_tokens=True
+                    all_input_ids.squeeze(-1), skip_special_tokens=True,
+                    cleanup_tokenization_spaces=False
                 )
                 # Slice with input_length to remove padding
                 token_ids = all_input_ids[-new_input_length:]
...
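For context, here is a minimal sketch of what disabling tokenization-space cleanup does to decoded output. It assumes a Hugging Face tokenizer and uses "gpt2" purely as an example checkpoint; note that in the transformers API the keyword is spelled clean_up_tokenization_spaces.

from transformers import AutoTokenizer

# Sketch only, not part of this commit: compare decoding with and without
# tokenization-space cleanup. "gpt2" is just an example checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

text = "Hello , world !"  # spacing that cleanup would normally rewrite
ids = tokenizer.encode(text)

# With cleanup enabled, decode typically collapses spaces before punctuation
# (e.g. "Hello, world!"), so the round trip is not character-for-character
# faithful to the original text.
cleaned = tokenizer.decode(
    ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# With cleanup disabled, the decoded text keeps the original spacing, which
# is the "lossless decoding" the commit message refers to.
lossless = tokenizer.decode(
    ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

print(repr(cleaned))
print(repr(lossless))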