Unverified commit 86c69dc5, authored by Shinichi Hemmi, committed by GitHub
Browse files

[Bugfix] Fix byte fallback handling when using outlines (#31391)


Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: Kenichi Maehashi <maehashi@preferred.jp>
parent 7c5dedc2
...@@ -122,7 +122,12 @@ class OutlinesGrammar(StructuredOutputGrammar): ...@@ -122,7 +122,12 @@ class OutlinesGrammar(StructuredOutputGrammar):
Returns False if the FSM failed to advance. Returns False if the FSM failed to advance.
""" """
if self.guide.accepts_tokens(tokens): if self.guide.accepts_tokens(tokens):
# Advance cannot fail because we checked Guide.accepts_tokens() # Advance can fail when the next state reached after advancing with
# the current tokens is a dead state. This is because Guide.accepts_tokens()
# only checks whether the current tokens can be accepted,
# whereas guide.advance() additionally checks the next state
# after all tokens are accepted.
# We need to be aware that the FSM must be prepared without dead states.
for t in tokens: for t in tokens:
self.guide.advance(t) self.guide.advance(t)
self.num_processed_tokens += 1 self.num_processed_tokens += 1
......
...@@ -226,7 +226,9 @@ def _reduced_vocabulary( ...@@ -226,7 +226,9 @@ def _reduced_vocabulary(
# by this point. # by this point.
token_bytes = bytes(token_str) # type: ignore[arg-type] token_bytes = bytes(token_str) # type: ignore[arg-type]
elif "\ufffd" in token_str and not re_replacement_seq.match(token_str): elif (token_str == "\ufffd" and token != "\ufffd") or (
"\ufffd" in token_str and not re_replacement_seq.match(token_str)
):
# Handle tokens with invalid UTF-8 sequences. # Handle tokens with invalid UTF-8 sequences.
if re_llama_byte_token.match(token): if re_llama_byte_token.match(token):
# Llama-like tokenizers use <0xXX> for incomplete sequences. # Llama-like tokenizers use <0xXX> for incomplete sequences.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment