Unverified Commit beac8dd4 authored by Ricardo Lu, committed by GitHub

fix: don't skip first special token. (#1497)

parent 28b47d1e
@@ -120,7 +120,11 @@ def detokenize_incrementally(
         # tokenizers (bigger = more conservative).
         # Subtract 1 extra to account for the generated token.
         prefix_offset = max(len(output_tokens) - 6, 0)
-        read_offset = max(len(output_tokens) - 1, 0)
+        # If the first new token is a special token, we can't skip 1 extra token
+        if skip_special_tokens and new_token_id in tokenizer.all_special_ids:
+            read_offset = max(len(output_tokens), 0)
+        else:
+            read_offset = max(len(output_tokens) - 1, 0)
     else:
         # Put new_token_id in a list so skip_special_tokens is respected
         new_tokens = tokenizer.convert_ids_to_tokens(
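For context, here is a minimal sketch of the failure mode the new branch guards against, assuming the offsets are later used to slice `output_tokens` into already-emitted and not-yet-emitted text (as the surrounding function does). The `ToyTokenizer` class, its vocabulary, and the token ids below are made-up stand-ins, not vLLM or Hugging Face code.

```python
# Sketch only: ToyTokenizer and the ids below are hypothetical stand-ins.
class ToyTokenizer:
    """Toy stand-in for a Hugging Face tokenizer."""
    all_special_ids = [2]  # pretend id 2 is an EOS-style special token
    _vocab = {0: "Hello", 1: "world", 2: "</s>"}

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        return [self._vocab[i] for i in ids
                if not (skip_special_tokens and i in self.all_special_ids)]


tokenizer = ToyTokenizer()
all_input_ids = [0, 1, 2]        # prompt tokens + newly generated special token
new_token_id = all_input_ids[-1]
skip_special_tokens = True

output_tokens = tokenizer.convert_ids_to_tokens(
    all_input_ids, skip_special_tokens=skip_special_tokens)
# output_tokens == ["Hello", "world"]: the special token was dropped entirely.

# Old behaviour: always back up one slot for "the generated token"...
old_read_offset = max(len(output_tokens) - 1, 0)   # == 1
# ...but the generated token is not in output_tokens, so slot 1 holds a
# prompt token that now looks like unread output.
print(output_tokens[old_read_offset:])  # ['world']  <- prompt text re-emitted

# Fixed behaviour: when the new token was skipped, there is nothing new to read.
if skip_special_tokens and new_token_id in tokenizer.all_special_ids:
    read_offset = max(len(output_tokens), 0)        # == 2
else:
    read_offset = max(len(output_tokens) - 1, 0)
print(output_tokens[read_offset:])      # []  <- no spurious text
```

With the old offset, the tail of the prompt would be re-decoded as if it were freshly generated text; with the fix, a skipped special token simply contributes no new text on the first decoding step.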