Commit a6ac49a3 authored by Greg Pereira, committed by khluu
Browse files

[Bugfix] Fix invalid JSON in Gemma 4 streaming tool calls by stripping partial delimiters (#38992)


Signed-off-by: greg pereira <grpereir@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
(cherry picked from commit f53fa26e)
(cherry picked from commit 10a26d1d9ae2572dc3161af2cc3832f27930774e)
parent 2a69949b
...@@ -502,3 +502,32 @@ class TestStreamingExtraction: ...@@ -502,3 +502,32 @@ class TestStreamingExtraction:
results = self._simulate_streaming(parser, mock_request, chunks) results = self._simulate_streaming(parser, mock_request, chunks)
name = self._collect_function_name(results) name = self._collect_function_name(results)
assert name == "get_status" assert name == "get_status"
def test_streaming_split_delimiter_no_invalid_json(self, parser, mock_request):
    """Check that a string delimiter split across chunks keeps the JSON valid.

    Regression test for https://github.com/vllm-project/vllm/issues/38946.
    When a token boundary falls inside the <|"|> string delimiter, a
    fragment such as '<|' is left at the end of the parsed value and then
    corrupts the streamed JSON arguments.
    """
    # The closing <|"|> delimiter is deliberately split between the third
    # and fourth chunks ('<|' then '"|>').
    stream_pieces = [
        "<|tool_call>",
        "call:todowrite{",
        'content:<|"|>Buy milk<|',
        '"|>}',
        "<tool_call|>",
    ]

    streamed = self._simulate_streaming(parser, mock_request, stream_pieces)
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"

    # The original bug surfaced as a JSON parse error at this point.
    decoded = json.loads(args_text)
    assert decoded["content"] == "Buy milk"

    # No raw delimiter fragment may survive in the streamed JSON text.
    assert "<|" not in args_text, (
        f"Partial delimiter leaked into JSON: {args_text!r}"
    )
...@@ -675,10 +675,11 @@ class Gemma4ToolParser(ToolParser): ...@@ -675,10 +675,11 @@ class Gemma4ToolParser(ToolParser):
current_args_json = json.dumps(current_args, ensure_ascii=False) current_args_json = json.dumps(current_args, ensure_ascii=False)
# Withhold trailing closing characters that may shift as more # Withhold trailing closing characters that may shift as more
# tokens arrive. Strip trailing '}', '"', and ']' sequences # tokens arrive. Strip trailing '}', '"', ']' and partial
# to get the "safe prefix". # STRING_DELIM fragments ('<', '|', '\\', '>') to get the
# "safe prefix".
safe_json = current_args_json safe_json = current_args_json
while safe_json and safe_json[-1] in ("}", '"', "]"): while safe_json and safe_json[-1] in ("}", '"', "]", "<", "|", "\\", ">"):
safe_json = safe_json[:-1] safe_json = safe_json[:-1]
prev_streamed = self.streamed_args_for_tool[self.current_tool_id] prev_streamed = self.streamed_args_for_tool[self.current_tool_id]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment