Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a99300bd
Commit
a99300bd
authored
Sep 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev
parents
cc3e01c7
5438967f
Changes
512
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1728 additions
and
459 deletions
+1728
-459
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+251
-159
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_score.py
+7
-7
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+3
-1
vllm/entrypoints/openai/serving_transcription.py
vllm/entrypoints/openai/serving_transcription.py
+6
-2
vllm/entrypoints/openai/speech_to_text.py
vllm/entrypoints/openai/speech_to_text.py
+19
-2
vllm/entrypoints/openai/tool_parsers/__init__.py
vllm/entrypoints/openai/tool_parsers/__init__.py
+4
-0
vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
...ntrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+367
-0
vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
...entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
...ypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
.../entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+4
-3
vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+2
-2
vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
...entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+286
-243
vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+679
-0
vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
+83
-25
vllm/entrypoints/utils.py
vllm/entrypoints/utils.py
+3
-1
No files found.
Too many changes to show.
To preserve performance only
512 of 512+
files are displayed.
Plain diff
Email patch
vllm/entrypoints/openai/serving_responses.py
View file @
a99300bd
...
...
@@ -4,11 +4,11 @@
import
asyncio
import
json
import
time
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
,
Sequence
from
contextlib
import
AsyncExitStack
from
copy
import
copy
from
http
import
HTTPStatus
from
typing
import
Any
,
Callable
,
Final
,
Optional
,
Union
from
typing
import
Callable
,
Final
,
Optional
,
Union
import
jinja2
import
openai.types.responses
as
openai_responses_types
...
...
@@ -25,6 +25,8 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseReasoningItem
,
ResponseReasoningTextDeltaEvent
,
ResponseReasoningTextDoneEvent
)
from
openai.types.responses.response_output_text
import
(
Logprob
,
LogprobTopLogprob
)
# yapf: enable
from
openai.types.responses.response_reasoning_item
import
(
Content
as
ResponseReasoningTextContent
)
...
...
@@ -59,6 +61,8 @@ from vllm.logger import init_logger
from
vllm.outputs
import
CompletionOutput
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
Logprob
as
SampleLogprob
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
random_uuid
...
...
@@ -84,6 +88,7 @@ class OpenAIServingResponses(OpenAIServing):
enable_prompt_tokens_details
:
bool
=
False
,
enable_force_include_usage
:
bool
=
False
,
enable_log_outputs
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
...
...
@@ -92,6 +97,7 @@ class OpenAIServingResponses(OpenAIServing):
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
enable_force_include_usage
=
enable_force_include_usage
,
log_error_stack
=
log_error_stack
,
)
self
.
chat_template
=
chat_template
...
...
@@ -201,6 +207,12 @@ class OpenAIServingResponses(OpenAIServing):
# (i.e., their request's `store=True` just because it's the default
# value).
request
.
store
=
False
if
self
.
use_harmony
and
request
.
is_include_output_logprobs
():
return
self
.
create_error_response
(
err_type
=
"invalid_request_error"
,
message
=
"logprobs are not supported with gpt-oss models"
,
status_code
=
HTTPStatus
.
BAD_REQUEST
,
)
# Handle the previous response ID.
prev_response_id
=
request
.
previous_response_id
...
...
@@ -238,10 +250,10 @@ class OpenAIServingResponses(OpenAIServing):
raw_request
.
state
.
request_metadata
=
request_metadata
if
self
.
tool_server
is
not
None
and
isinstance
(
self
.
tool_server
,
MCPToolServer
)
and
(
request
.
background
or
request
.
stream
)
and
request
.
tools
and
any
(
tool
.
type
in
[
"web_search_preview"
,
"code_interpreter"
]
for
tool
in
request
.
tools
):
self
.
tool_server
,
MCPToolServer
)
and
request
.
stream
and
request
.
tools
and
any
(
tool
.
type
in
[
"web_search_preview"
,
"code_interpreter"
]
for
tool
in
request
.
tools
):
return
self
.
create_error_response
(
"MCP tool server is not supported in background mode and "
"streaming mode"
)
...
...
@@ -255,114 +267,70 @@ class OpenAIServingResponses(OpenAIServing):
builtin_tool_list
.
append
(
"browser"
)
if
self
.
tool_server
.
has_tool
(
"python"
):
builtin_tool_list
.
append
(
"python"
)
async
with
AsyncExitStack
()
as
exit_stack
:
try
:
if
self
.
tool_server
is
not
None
:
# TODO: initialize tool sessions lazily when the session
# is actually used.
tool_session_ctxs
:
dict
[
str
,
Any
]
=
{
tool_name
:
exit_stack
.
enter_async_context
(
self
.
tool_server
.
new_session
(
tool_name
))
for
tool_name
in
builtin_tool_list
}
tool_sessions
=
{}
for
tool_name
in
builtin_tool_list
:
tool_sessions
[
tool_name
]
=
(
await
tool_session_ctxs
[
tool_name
])
else
:
assert
len
(
builtin_tool_list
)
==
0
tool_sessions
=
{}
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
default_max_tokens
=
self
.
max_model_len
-
len
(
engine_prompt
[
"prompt_token_ids"
])
sampling_params
=
request
.
to_sampling_params
(
default_max_tokens
,
self
.
default_sampling_params
)
trace_headers
=
(
None
if
raw_request
is
None
else
await
self
.
_get_trace_headers
(
raw_request
.
headers
))
context
:
ConversationContext
if
self
.
use_harmony
:
if
request
.
stream
:
context
=
StreamingHarmonyContext
(
messages
,
tool_sessions
)
else
:
context
=
HarmonyContext
(
messages
,
tool_sessions
)
else
:
context
=
SimpleContext
()
generator
=
self
.
_generate_with_builtin_tools
(
request_id
=
request
.
request_id
,
request_prompt
=
request_prompts
[
i
],
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
context
=
context
,
lora_request
=
lora_request
,
priority
=
request
.
priority
,
trace_headers
=
trace_headers
,
)
generators
.
append
(
generator
)
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
assert
len
(
generators
)
==
1
result_generator
,
=
generators
# Store the input messages.
if
request
.
store
:
self
.
msg_store
[
request
.
request_id
]
=
messages
if
request
.
background
:
created_time
=
int
(
time
.
time
())
response
=
ResponsesResponse
.
from_request
(
request
,
sampling_params
,
model_name
=
model_name
,
created_time
=
created_time
,
output
=
[],
status
=
"queued"
,
usage
=
None
,
if
self
.
tool_server
is
not
None
:
available_tools
=
builtin_tool_list
else
:
assert
len
(
builtin_tool_list
)
==
0
available_tools
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
default_max_tokens
=
self
.
max_model_len
-
len
(
engine_prompt
[
"prompt_token_ids"
])
sampling_params
=
request
.
to_sampling_params
(
default_max_tokens
,
self
.
default_sampling_params
)
trace_headers
=
(
None
if
raw_request
is
None
else
await
self
.
_get_trace_headers
(
raw_request
.
headers
))
context
:
ConversationContext
if
self
.
use_harmony
:
if
request
.
stream
:
context
=
StreamingHarmonyContext
(
messages
,
available_tools
)
else
:
context
=
HarmonyContext
(
messages
,
available_tools
)
else
:
context
=
SimpleContext
()
generator
=
self
.
_generate_with_builtin_tools
(
request_id
=
request
.
request_id
,
request_prompt
=
request_prompts
[
i
],
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
context
=
context
,
lora_request
=
lora_request
,
priority
=
request
.
priority
,
trace_headers
=
trace_headers
,
)
async
with
self
.
response_store_lock
:
self
.
response_store
[
response
.
id
]
=
response
generators
.
append
(
generator
)
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
# Run the request in the background.
task
=
asyncio
.
create_task
(
self
.
_run_background_request
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
,
),
name
=
f
"create_
{
response
.
id
}
"
,
)
assert
len
(
generators
)
==
1
result_generator
,
=
generators
# For cleanup.
response_id
=
response
.
id
self
.
background_tasks
[
response_id
]
=
task
task
.
add_done_callback
(
lambda
_
:
self
.
background_tasks
.
pop
(
response_id
,
None
))
return
response
# Store the input messages.
if
request
.
store
:
self
.
msg_store
[
request
.
request_id
]
=
messages
if
request
.
stream
:
return
self
.
responses_stream_generator
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
)
if
request
.
background
:
created_time
=
int
(
time
.
time
())
response
=
ResponsesResponse
.
from_request
(
request
,
sampling_params
,
model_name
=
model_name
,
created_time
=
created_time
,
output
=
[],
status
=
"queued"
,
usage
=
None
,
)
async
with
self
.
response_store_lock
:
self
.
response_store
[
response
.
id
]
=
response
try
:
return
await
self
.
responses_full_generator
(
# Run the request in the background.
task
=
asyncio
.
create_task
(
self
.
_run_background_request
(
request
,
sampling_params
,
result_generator
,
...
...
@@ -370,10 +338,41 @@ class OpenAIServingResponses(OpenAIServing):
model_name
,
tokenizer
,
request_metadata
,
)
except
Exception
as
e
:
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
"Should not reach here"
)
created_time
,
),
name
=
f
"create_
{
response
.
id
}
"
,
)
# For cleanup.
response_id
=
response
.
id
self
.
background_tasks
[
response_id
]
=
task
task
.
add_done_callback
(
lambda
_
:
self
.
background_tasks
.
pop
(
response_id
,
None
))
return
response
if
request
.
stream
:
return
self
.
responses_stream_generator
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
)
try
:
return
await
self
.
responses_full_generator
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
)
except
Exception
as
e
:
return
self
.
create_error_response
(
str
(
e
))
async
def
_make_request
(
self
,
...
...
@@ -408,6 +407,11 @@ class OpenAIServingResponses(OpenAIServing):
request
,
prev_response
)
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
EngineTokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
prompt_token_ids
],
[
engine_prompt
]
async
def
responses_full_generator
(
...
...
@@ -424,14 +428,16 @@ class OpenAIServingResponses(OpenAIServing):
if
created_time
is
None
:
created_time
=
int
(
time
.
time
())
try
:
async
for
_
in
result_generator
:
pass
except
asyncio
.
CancelledError
:
return
self
.
create_error_response
(
"Client disconnected"
)
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
async
with
AsyncExitStack
()
as
exit_stack
:
try
:
await
context
.
init_tool_sessions
(
self
.
tool_server
,
exit_stack
)
async
for
_
in
result_generator
:
pass
except
asyncio
.
CancelledError
:
return
self
.
create_error_response
(
"Client disconnected"
)
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
if
self
.
use_harmony
:
assert
isinstance
(
context
,
HarmonyContext
)
...
...
@@ -486,6 +492,51 @@ class OpenAIServingResponses(OpenAIServing):
self
.
response_store
[
response
.
id
]
=
response
return
response
def _topk_logprobs(self, logprobs: dict[int, SampleLogprob],
                   top_logprobs: int,
                   tokenizer: AnyTokenizer) -> list[LogprobTopLogprob]:
    """Convert the leading entries of ``logprobs`` to ``LogprobTopLogprob``.

    Entries are taken in the dictionary's iteration order and at most
    ``top_logprobs`` of them are converted.
    NOTE(review): this presumably relies on the mapping already being
    ordered best-first -- confirm against the producer of ``logprobs``.

    Args:
        logprobs: Mapping from token id to its sampled log-probability
            record for a single position.
        top_logprobs: Maximum number of entries to return.
        tokenizer: Used to decode a token id when the record carries no
            ``decoded_token``.

    Returns:
        One ``LogprobTopLogprob`` per selected entry.
    """
    selected: list[LogprobTopLogprob] = []
    for rank, (candidate_id, candidate) in enumerate(logprobs.items()):
        if rank >= top_logprobs:
            # Enough entries collected; ignore the remainder of the mapping.
            break

        # Prefer the text already attached to the record and only fall back
        # to the tokenizer when it is missing.
        piece = candidate.decoded_token
        if piece is None:
            piece = tokenizer.decode([candidate_id])

        # The log-probability is floored at -9999.0.
        clamped = max(candidate.logprob, -9999.0)
        raw_bytes = list(piece.encode("utf-8", errors="replace"))
        selected.append(
            LogprobTopLogprob(
                token=piece,
                logprob=clamped,
                bytes=raw_bytes,
            ))
    return selected
def _create_response_logprobs(
        self,
        token_ids: Sequence[int],
        logprobs: Optional[SampleLogprobs],
        tokenizer: AnyTokenizer,
        top_logprobs: Optional[int] = None) -> list[Logprob]:
    """Build the per-token ``Logprob`` list for a generated output.

    Args:
        token_ids: Ids of the generated tokens, one per position.
        logprobs: Per-position log-probability mappings; must not be
            ``None`` and must have the same length as ``token_ids``.
        tokenizer: Used to decode a token id when its record carries no
            ``decoded_token``.
        top_logprobs: When truthy, each entry also gets up to this many
            alternatives from ``self._topk_logprobs``; otherwise the
            alternatives list is empty.

    Returns:
        One ``Logprob`` per entry of ``token_ids``, in the same order.

    Raises:
        AssertionError: If ``logprobs`` is ``None`` or the lengths differ.
    """
    assert logprobs is not None, "logprobs must be provided"
    assert len(token_ids) == len(logprobs), (
        "token_ids and logprobs.token_ids must have the same length")

    def _convert(position: int, sampled_id: int) -> Logprob:
        """Convert the record of the token sampled at ``position``."""
        candidates = logprobs[position]
        chosen = candidates[sampled_id]

        # Prefer the text already attached to the record and only fall back
        # to the tokenizer when it is missing.
        piece = chosen.decoded_token
        if piece is None:
            piece = tokenizer.decode([sampled_id])

        # The log-probability is floored at -9999.0.
        clamped = max(chosen.logprob, -9999.0)
        raw_bytes = list(piece.encode("utf-8", errors="replace"))

        if top_logprobs:
            alternatives = self._topk_logprobs(candidates,
                                               top_logprobs=top_logprobs,
                                               tokenizer=tokenizer)
        else:
            alternatives = []

        return Logprob(
            token=piece,
            logprob=clamped,
            bytes=raw_bytes,
            top_logprobs=alternatives,
        )

    return [
        _convert(position, sampled_id)
        for position, sampled_id in enumerate(token_ids)
    ]
def
_make_response_output_items
(
self
,
request
:
ResponsesRequest
,
...
...
@@ -542,7 +593,12 @@ class OpenAIServingResponses(OpenAIServing):
text
=
content
,
annotations
=
[],
# TODO
type
=
"output_text"
,
logprobs
=
None
,
# TODO
logprobs
=
self
.
_create_response_logprobs
(
token_ids
=
final_output
.
token_ids
,
logprobs
=
final_output
.
logprobs
,
tokenizer
=
tokenizer
,
top_logprobs
=
request
.
top_logprobs
,
)
if
request
.
is_include_output_logprobs
()
else
None
,
)
message
=
ResponseOutputMessage
(
id
=
f
"msg_
{
random_uuid
()
}
"
,
...
...
@@ -773,7 +829,7 @@ class OpenAIServingResponses(OpenAIServing):
status_code
=
HTTPStatus
.
BAD_REQUEST
,
)
async
def
responses_stream_generator
(
async
def
_process_streaming_events
(
self
,
request
:
ResponsesRequest
,
sampling_params
:
SamplingParams
,
...
...
@@ -782,18 +838,8 @@ class OpenAIServingResponses(OpenAIServing):
model_name
:
str
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
created_time
:
Optional
[
int
]
=
None
,
created_time
:
int
,
)
->
AsyncGenerator
[
str
,
None
]:
# TODO:
# 1. Handle disconnect
if
not
isinstance
(
context
,
StreamingHarmonyContext
):
raise
NotImplementedError
(
"Streaming is not supported for responses API without Harmony."
)
created_time
=
created_time
or
int
(
time
.
time
())
sequence_number
=
0
def
_send_event
(
event
:
BaseModel
):
...
...
@@ -1004,7 +1050,48 @@ class OpenAIServingResponses(OpenAIServing):
delta
=
ctx
.
parser
.
last_content_delta
,
sequence_number
=-
1
,
))
# built-in tools will be triggered on the analysis channel
# However, occasionally built-in tools will
# still be output to commentary.
elif
(
ctx
.
parser
.
current_channel
==
"commentary"
or
ctx
.
parser
.
current_channel
==
"analysis"
)
and
ctx
.
parser
.
current_recipient
==
"python"
:
if
not
sent_output_item_added
:
sent_output_item_added
=
True
yield
_send_event
(
openai_responses_types
.
ResponseOutputItemAddedEvent
(
type
=
"response.output_item.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
openai_responses_types
.
ResponseCodeInterpreterToolCallParam
(
type
=
"code_interpreter_call"
,
id
=
current_item_id
,
code
=
None
,
container_id
=
"auto"
,
outputs
=
None
,
status
=
"in_progress"
,
),
))
yield
_send_event
(
openai_responses_types
.
ResponseCodeInterpreterCallInProgressEvent
(
type
=
"response.code_interpreter_call.in_progress"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
))
yield
_send_event
(
openai_responses_types
.
ResponseCodeInterpreterCallCodeDeltaEvent
(
type
=
"response.code_interpreter_call_code.delta"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
delta
=
ctx
.
parser
.
last_content_delta
,
))
if
ctx
.
is_assistant_action_turn
()
and
len
(
ctx
.
parser
.
messages
)
>
0
:
previous_item
=
ctx
.
parser
.
messages
[
-
1
]
if
(
self
.
tool_server
is
not
None
...
...
@@ -1100,30 +1187,6 @@ class OpenAIServingResponses(OpenAIServing):
and
self
.
tool_server
.
has_tool
(
"python"
)
and
previous_item
.
recipient
is
not
None
and
previous_item
.
recipient
.
startswith
(
"python"
)):
yield
_send_event
(
openai_responses_types
.
ResponseOutputItemAddedEvent
(
type
=
"response.output_item.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
openai_responses_types
.
ResponseCodeInterpreterToolCallParam
(
type
=
"code_interpreter_call"
,
id
=
current_item_id
,
code
=
""
,
container_id
=
"auto"
,
outputs
=
[],
status
=
"in_progress"
,
),
))
yield
_send_event
(
openai_responses_types
.
ResponseCodeInterpreterCallInProgressEvent
(
type
=
"response.code_interpreter_call.in_progress"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
))
# TODO: do we need to add delta event here?
yield
_send_event
(
openai_responses_types
.
ResponseCodeInterpreterCallCodeDoneEvent
(
...
...
@@ -1131,7 +1194,8 @@ class OpenAIServingResponses(OpenAIServing):
sequence_number
=-
1
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
code
=
previous_item
.
content
[
0
].
text
))
code
=
previous_item
.
content
[
0
].
text
,
))
yield
_send_event
(
openai_responses_types
.
ResponseCodeInterpreterCallInterpretingEvent
(
...
...
@@ -1187,3 +1251,31 @@ class OpenAIServingResponses(OpenAIServing):
sequence_number
=-
1
,
response
=
final_response
.
model_dump
(),
))
async def responses_stream_generator(
    self,
    request: ResponsesRequest,
    sampling_params: SamplingParams,
    result_generator: AsyncIterator[Optional[ConversationContext]],
    context: ConversationContext,
    model_name: str,
    tokenizer: AnyTokenizer,
    request_metadata: RequestResponseMetadata,
    created_time: Optional[int] = None,
) -> AsyncGenerator[str, None]:
    """Stream response events, keeping tool sessions open while streaming.

    Initializes the context's tool sessions inside an ``AsyncExitStack``
    and then relays every item produced by
    ``self._process_streaming_events`` unchanged.

    Args:
        request: The responses request being served.
        sampling_params: Sampling parameters forwarded to the event
            processor.
        result_generator: Engine output iterator forwarded to the event
            processor.
        context: Conversation context; must be a
            ``StreamingHarmonyContext``.
        model_name: Forwarded to the event processor.
        tokenizer: Forwarded to the event processor.
        request_metadata: Forwarded to the event processor.
        created_time: Creation timestamp; when falsy, the current time in
            whole seconds is used instead.

    Yields:
        The items produced by ``self._process_streaming_events``.

    Raises:
        NotImplementedError: If ``context`` is not a
            ``StreamingHarmonyContext``. Because this is an async
            generator, the error surfaces on first iteration.
    """
    # TODO:
    # 1. Handle disconnect
    if not isinstance(context, StreamingHarmonyContext):
        raise NotImplementedError(
            "Streaming is not supported for responses API without Harmony."
        )

    # Fall back to "now" when the caller supplied no usable timestamp.
    if not created_time:
        created_time = int(time.time())

    # The stack must outlive the whole stream: it is handed to
    # init_tool_sessions and is only unwound after the last event.
    async with AsyncExitStack() as stack:
        await context.init_tool_sessions(self.tool_server, stack)
        event_stream = self._process_streaming_events(
            request, sampling_params, result_generator, context,
            model_name, tokenizer, request_metadata, created_time)
        async for chunk in event_stream:
            yield chunk
vllm/entrypoints/openai/serving_score.py
View file @
a99300bd
...
...
@@ -7,7 +7,6 @@ from typing import Any, Optional, Union
from
fastapi
import
Request
from
vllm
import
envs
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.logger
import
RequestLogger
...
...
@@ -47,11 +46,13 @@ class ServingScores(OpenAIServing):
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
log_error_stack
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
)
request_logger
=
request_logger
,
log_error_stack
=
log_error_stack
)
async
def
_embedding_score
(
self
,
...
...
@@ -227,8 +228,7 @@ class ServingScores(OpenAIServing):
params
=
default_pooling_params
,
lora_request
=
lora_request
)
if
envs
.
VLLM_USE_V1
and
(
token_type_ids
:
=
engine_prompt
.
pop
(
"token_type_ids"
,
None
)):
if
(
token_type_ids
:
=
engine_prompt
.
pop
(
"token_type_ids"
,
None
)):
pooling_params
=
default_pooling_params
.
clone
()
compressed
=
compress_token_type_ids
(
token_type_ids
)
pooling_params
.
extra_kwargs
=
{
...
...
@@ -266,12 +266,14 @@ class ServingScores(OpenAIServing):
request
:
Union
[
ScoreRequest
,
RerankRequest
],
request_id
:
str
,
raw_request
:
Optional
[
Request
]
=
None
,
truncate_prompt_tokens
:
Optional
[
int
]
=
None
,
)
->
Union
[
list
[
PoolingRequestOutput
],
ErrorResponse
]:
lora_request
=
self
.
_maybe_get_adapters
(
request
)
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
lora_request
)
truncate_prompt_tokens
=
getattr
(
request
,
"truncate_prompt_tokens"
,
None
)
tokenization_kwargs
:
dict
[
str
,
Any
]
=
{}
_validate_truncation_size
(
self
.
max_model_len
,
truncate_prompt_tokens
,
tokenization_kwargs
)
...
...
@@ -343,7 +345,6 @@ class ServingScores(OpenAIServing):
request
,
request_id
,
raw_request
,
request
.
truncate_prompt_tokens
,
)
if
isinstance
(
final_res_batch
,
ErrorResponse
):
return
final_res_batch
...
...
@@ -391,7 +392,6 @@ class ServingScores(OpenAIServing):
request
,
request_id
,
raw_request
,
request
.
truncate_prompt_tokens
,
)
if
isinstance
(
final_res_batch
,
ErrorResponse
):
return
final_res_batch
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
a99300bd
...
...
@@ -39,11 +39,13 @@ class OpenAIServingTokenization(OpenAIServing):
request_logger
:
Optional
[
RequestLogger
],
chat_template
:
Optional
[
str
],
chat_template_content_format
:
ChatTemplateContentFormatOption
,
log_error_stack
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
)
request_logger
=
request_logger
,
log_error_stack
=
log_error_stack
)
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
...
...
vllm/entrypoints/openai/serving_transcription.py
View file @
a99300bd
...
...
@@ -32,13 +32,15 @@ class OpenAIServingTranscription(OpenAISpeechToText):
*
,
request_logger
:
Optional
[
RequestLogger
],
return_tokens_as_token_ids
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
task_type
=
"transcribe"
)
task_type
=
"transcribe"
,
log_error_stack
=
log_error_stack
)
async
def
create_transcription
(
self
,
audio_data
:
bytes
,
request
:
TranscriptionRequest
,
...
...
@@ -88,13 +90,15 @@ class OpenAIServingTranslation(OpenAISpeechToText):
*
,
request_logger
:
Optional
[
RequestLogger
],
return_tokens_as_token_ids
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
task_type
=
"translate"
)
task_type
=
"translate"
,
log_error_stack
=
log_error_stack
)
async
def
create_translation
(
self
,
audio_data
:
bytes
,
request
:
TranslationRequest
,
...
...
vllm/entrypoints/openai/speech_to_text.py
View file @
a99300bd
...
...
@@ -53,12 +53,14 @@ class OpenAISpeechToText(OpenAIServing):
request_logger
:
Optional
[
RequestLogger
],
return_tokens_as_token_ids
:
bool
=
False
,
task_type
:
Literal
[
"transcribe"
,
"translate"
]
=
"transcribe"
,
log_error_stack
:
bool
=
False
,
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
)
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
log_error_stack
=
log_error_stack
)
self
.
default_sampling_params
=
(
self
.
model_config
.
get_diff_sampling_param
())
...
...
@@ -200,7 +202,22 @@ class OpenAISpeechToText(OpenAIServing):
for
result_generator
in
list_result_generator
:
async
for
op
in
result_generator
:
text
+=
op
.
outputs
[
0
].
text
return
cast
(
T
,
response_class
(
text
=
text
))
if
self
.
task_type
==
"transcribe"
:
# add usage in TranscriptionResponse.
usage
=
{
"type"
:
"duration"
,
# rounded up as per openAI specs
"seconds"
:
int
(
math
.
ceil
(
duration_s
)),
}
final_response
=
cast
(
T
,
response_class
(
text
=
text
,
usage
=
usage
))
else
:
# no usage in response for translation task
final_response
=
cast
(
T
,
response_class
(
text
=
text
))
# type: ignore[call-arg]
return
final_response
except
asyncio
.
CancelledError
:
return
self
.
create_error_response
(
"Client disconnected"
)
except
ValueError
as
e
:
...
...
vllm/entrypoints/openai/tool_parsers/__init__.py
View file @
a99300bd
...
...
@@ -3,6 +3,7 @@
from
.abstract_tool_parser
import
ToolParser
,
ToolParserManager
from
.deepseekv3_tool_parser
import
DeepSeekV3ToolParser
from
.deepseekv31_tool_parser
import
DeepSeekV31ToolParser
from
.glm4_moe_tool_parser
import
Glm4MoeModelToolParser
from
.granite_20b_fc_tool_parser
import
Granite20bFCToolParser
from
.granite_tool_parser
import
GraniteToolParser
...
...
@@ -18,6 +19,7 @@ from .mistral_tool_parser import MistralToolParser
from
.phi4mini_tool_parser
import
Phi4MiniJsonToolParser
from
.pythonic_tool_parser
import
PythonicToolParser
from
.qwen3coder_tool_parser
import
Qwen3CoderToolParser
from
.seed_oss_tool_parser
import
SeedOssToolParser
from
.step3_tool_parser
import
Step3ToolParser
from
.xlam_tool_parser
import
xLAMToolParser
...
...
@@ -35,11 +37,13 @@ __all__ = [
"PythonicToolParser"
,
"Phi4MiniJsonToolParser"
,
"DeepSeekV3ToolParser"
,
"DeepSeekV31ToolParser"
,
"xLAMToolParser"
,
"MinimaxToolParser"
,
"KimiK2ToolParser"
,
"HunyuanA13BToolParser"
,
"Glm4MoeModelToolParser"
,
"Qwen3CoderToolParser"
,
"SeedOssToolParser"
,
"Step3ToolParser"
,
]
vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
typing
import
Union
import
regex
as
re
from
vllm.entrypoints.chat_utils
import
make_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
ExtractedToolCallInformation
,
FunctionCall
,
ToolCall
)
from
vllm.entrypoints.openai.tool_parsers.abstract_tool_parser
import
(
ToolParser
,
ToolParserManager
)
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
logger
=
init_logger
(
__name__
)
@ToolParserManager.register_module("deepseek_v31")
class DeepSeekV31ToolParser(ToolParser):
    """Tool-call parser registered under the name ``deepseek_v31``.

    The parser looks for tool calls wrapped in a "tool calls" section and
    delimited per call as::

        <call-begin>NAME<sep>ARGUMENTS<call-end>

    ``extract_tool_calls`` handles a complete model output, while
    ``extract_tool_calls_streaming`` is called once per generated chunk and
    keeps its progress in the instance attributes initialised in
    ``__init__``.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up streaming state, delimiter strings, regexes and token ids.

        Args:
            tokenizer: tokenizer whose vocabulary must contain the tool-call
                section delimiters.

        Raises:
            ValueError: if no tokenizer is available on the instance.
            RuntimeError: if the section start/end delimiters are missing
                from the vocabulary.
        """
        super().__init__(tokenizer)

        # Streaming state.
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        # What has been streamed so far for each tool, indexed by tool id.
        self.streamed_args_for_tool: list[str] = []

        # NOTE(review): these literals are looked up verbatim in the
        # tokenizer vocabulary below - confirm they match it exactly.
        self.tool_calls_start_token: str = "<|tool▁calls▁begin|>"
        self.tool_calls_end_token: str = "<|tool▁calls▁end|>"

        self.tool_call_start_token: str = "<|tool▁call▁begin|>"
        self.tool_call_end_token: str = "<|tool▁call▁end|>"

        # The delimiters are escaped before being embedded in a pattern so
        # that any regex metacharacter they contain (e.g. "|", which would
        # otherwise split the pattern into alternatives) is matched
        # literally.
        call_begin = re.escape(self.tool_call_start_token)
        call_end = re.escape(self.tool_call_end_token)
        tool_sep = re.escape("<|tool▁sep|>")

        # Non-greedy groups keep each match inside a single tool call when
        # several calls appear back to back; DOTALL lets the arguments span
        # multiple lines.
        name_group = r"(?P<function_name>.*?)"

        self.tool_call_regex = re.compile(
            call_begin + name_group + tool_sep +
            r"(?P<function_arguments>.*?)" + call_end, re.DOTALL)

        # Streaming variant: the call is still open, so the arguments run
        # to the end of the text seen so far.
        self.stream_tool_call_portion_regex = re.compile(
            name_group + tool_sep + r"(?P<function_arguments>.*)",
            re.DOTALL)

        self.stream_tool_call_name_regex = re.compile(
            name_group + tool_sep, re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")

        self.tool_calls_start_token_id = self.vocab.get(
            self.tool_calls_start_token)
        self.tool_calls_end_token_id = self.vocab.get(
            self.tool_calls_end_token)

        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(
            self.tool_call_end_token)

        if (self.tool_calls_start_token_id is None
                or self.tool_calls_end_token_id is None):
            raise RuntimeError(
                "DeepSeek-V3.1 Tool parser could not locate tool call "
                "start/end tokens in the tokenizer!")

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract every complete tool call from a finished model output.

        Args:
            model_output: the full generated text.
            request: the originating request (not used by this method).

        Returns:
            ``tools_called=False`` with the untouched text as content when
            the section start delimiter is absent or extraction fails;
            otherwise ``tools_called=True`` with one ``ToolCall`` per match
            and, as content, the text preceding the tool-call section
            (``None`` when that text is empty).
        """
        # sanity check; avoid unnecessary processing
        if self.tool_calls_start_token not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            # The pattern has exactly two groups, so findall yields one
            # (function_name, function_arguments) tuple per tool call.
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(name=function_name,
                                          arguments=function_args),
                ) for function_name, function_args in
                self.tool_call_regex.findall(model_output)
            ]

            # Everything before the tool-call section is regular content.
            content = model_output[:model_output.
                                   find(self.tool_calls_start_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn one streamed chunk into a delta message.

        The position in the stream is derived by counting per-call
        start/end token ids in the previous and current token sequences.
        The tool name is emitted once (together with a fresh call id);
        afterwards only the newly appended part of the arguments is
        emitted.

        Args:
            previous_text: text available before this chunk.
            current_text: text available including this chunk.
            delta_text: text of this chunk.
            previous_token_ids: token ids before this chunk.
            current_token_ids: token ids including this chunk.
            delta_token_ids: token ids of this chunk.
            request: the originating request (not used by this method).

        Returns:
            A ``DeltaMessage`` carrying content or a tool-call delta, or
            ``None`` when nothing should be emitted for this chunk
            (including when an exception was caught and logged).
        """
        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)

        # Without the section start token nothing here is a tool call:
        # pass the chunk through as plain content.
        if self.tool_calls_start_token_id not in current_token_ids:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)

        # The section delimiters themselves are never streamed out.
        delta_text = delta_text.replace(self.tool_calls_start_token,
                                        "").replace(self.tool_calls_end_token,
                                                    "")
        try:
            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_token_ids.count(
                self.tool_call_start_token_id)
            prev_tool_end_count = previous_token_ids.count(
                self.tool_call_end_token_id)
            cur_tool_start_count = current_token_ids.count(
                self.tool_call_start_token_id)
            cur_tool_end_count = current_token_ids.count(
                self.tool_call_end_token_id)
            tool_call_portion = None
            text_portion = None

            # case: if we're generating text, OR rounding out a tool call
            if (cur_tool_start_count == cur_tool_end_count
                    and prev_tool_end_count == cur_tool_end_count
                    and self.tool_call_end_token not in delta_text):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                # NOTE(review): current_text presumably already ends with
                # delta_text; the concatenation is kept from the original.
                full_text = current_text + delta_text
                tool_call_portion = full_text.split(
                    self.tool_call_start_token)[-1].split(
                        self.tool_call_end_token)[0].rstrip()
                delta_text = delta_text.split(
                    self.tool_call_end_token)[0].rstrip()
                text_portion = delta_text.split(
                    self.tool_call_end_token)[-1].lstrip()

            # case -- we're starting a new tool call
            if (cur_tool_start_count > cur_tool_end_count
                    and cur_tool_start_count > prev_tool_start_count):
                # Only when more than the start token arrived in this chunk
                # is there already call text to parse.
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(
                        self.tool_call_start_token)[-1]
                else:
                    tool_call_portion = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (cur_tool_start_count > cur_tool_end_count
                  and cur_tool_start_count == prev_tool_start_count):
                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (cur_tool_start_count == cur_tool_end_count
                  and cur_tool_end_count >= prev_tool_end_count):
                if not self.prev_tool_call_arr:
                    logger.debug(
                        "attempting to close tool call, but no tool call")
                    return None
                # Only flush a final argument fragment when arguments were
                # recorded for the current tool on an earlier chunk.
                if self.prev_tool_call_arr[self.current_tool_id].get(
                        "arguments"):
                    if '"}' not in delta_text:
                        return None
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s",
                        diff,
                    )
                    self.streamed_args_for_tool[self.current_tool_id] += diff
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=diff).model_dump(exclude_none=True),
                        )
                    ])

            # case -- otherwise we're just generating text
            else:
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                return DeltaMessage(tool_calls=[], content=text)

            current_tool_call: dict = {}
            if tool_call_portion:
                current_tool_call_matches = (
                    self.stream_tool_call_portion_regex.match(
                        tool_call_portion))
                if current_tool_call_matches:
                    tool_name, tool_args = current_tool_call_matches.groups()
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = tool_args
                else:
                    current_tool_call_name_matches = (
                        self.stream_tool_call_name_regex.match(
                            tool_call_portion))
                    if current_tool_call_name_matches:
                        # Take the named group: groups() would yield a
                        # 1-tuple, which is not a valid function name.
                        current_tool_call["name"] = (
                            current_tool_call_name_matches.group(
                                "function_name"))
                        current_tool_call["arguments"] = ""
                    else:
                        logger.debug("Not enough token")
                        return None

            # case - we haven't sent the tool name yet. If it's available, send
            # it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                function_name: Union[str, None] = current_tool_call.get("name")
                if not function_name:
                    return None
                self.current_tool_name_sent = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        type="function",
                        id=make_tool_call_id(),
                        function=DeltaFunctionCall(
                            name=function_name).model_dump(exclude_none=True),
                    )
                ])

            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                return (DeltaMessage(content=delta_text)
                        if text_portion is not None else None)

            # now, the nitty-gritty of tool calls
            # now we have the portion to parse as tool call.

            logger.debug("Trying to parse current tool call with ID %s",
                         self.current_tool_id)

            # if we're starting a new tool call, push an empty object in as
            # a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare the arguments text
            # recorded on the previous chunk with the text seen now
            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            delta = None

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)

            # case -- prev arguments are defined, but none are now.
            # probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error("should be impossible to have arguments reset "
                             "mid-call. skipping streaming anything.")

            # case -- this is the first chunk that carries any arguments
            elif cur_arguments and not prev_arguments:
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=cur_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments

            # last case -- we have an update to existing arguments; only a
            # strict extension of the previous text is streamed.
            elif (len(cur_arguments) > len(prev_arguments)
                  and cur_arguments.startswith(prev_arguments)):
                delta_arguments = cur_arguments[len(prev_arguments):]
                logger.debug("got diff %s", delta_text)

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=delta_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[
                    self.current_tool_id] = current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.
vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
View file @
a99300bd
...
...
@@ -6,7 +6,7 @@ from typing import Union
import
regex
as
re
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
),
...
...
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
View file @
a99300bd
...
...
@@ -10,7 +10,7 @@ import partial_json_parser
import
regex
as
re
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -203,7 +203,7 @@ class Granite20bFCToolParser(ToolParser):
delta
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
))
...
...
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
View file @
a99300bd
...
...
@@ -8,7 +8,7 @@ from typing import Union
import
partial_json_parser
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -185,7 +185,7 @@ class GraniteToolParser(ToolParser):
delta
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
))
...
...
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
View file @
a99300bd
...
...
@@ -9,7 +9,7 @@ import partial_json_parser
import
regex
as
re
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -307,7 +307,7 @@ class Hermes2ProToolParser(ToolParser):
return
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
))
...
...
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
View file @
a99300bd
...
...
@@ -8,7 +8,7 @@ from typing import Union
import
partial_json_parser
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -107,7 +107,7 @@ class Internlm2ToolParser(ToolParser):
delta
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
))
...
...
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
View file @
a99300bd
...
...
@@ -9,7 +9,7 @@ import partial_json_parser
import
regex
as
re
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -222,7 +222,7 @@ class JambaToolParser(ToolParser):
delta
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
))
...
...
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
View file @
a99300bd
...
...
@@ -10,7 +10,7 @@ import regex as re
from
partial_json_parser.core.options
import
Allow
from
transformers
import
PreTrainedTokenizerBase
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -213,7 +213,7 @@ class Llama3JsonToolParser(ToolParser):
delta
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_id
,
type
=
"function"
,
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
function
=
DeltaFunctionCall
(
name
=
function_name
).
model_dump
(
exclude_none
=
True
))
...
...
vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
View file @
a99300bd
...
...
@@ -7,7 +7,7 @@ from typing import Any, Optional, Union
import
regex
as
re
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -394,7 +394,7 @@ class MinimaxToolParser(ToolParser):
sent_tools
.
append
({
"sent_name"
:
False
,
"sent_arguments"
:
""
,
"id"
:
random
_tool_call_id
(),
"id"
:
make
_tool_call_id
(),
})
while
len
(
tool_ids
)
<
tool_count
:
...
...
@@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
i
+=
1
return
boundaries
def
_extract_tool_args
(
self
,
tool_content
:
str
,
args_match
)
->
str
:
def
_extract_tool_args
(
self
,
tool_content
:
str
,
args_match
:
re
.
Match
[
str
])
->
str
:
"""
Extract tool arguments from tool content.
...
...
vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
View file @
a99300bd
...
...
@@ -8,7 +8,7 @@ from typing import Any, Optional
import
regex
as
re
from
transformers
import
PreTrainedTokenizerBase
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
,
ExtractedToolCallInformation
,
...
...
@@ -74,7 +74,7 @@ class Phi4MiniJsonToolParser(ToolParser):
tool_calls
:
list
[
ToolCall
]
=
[
ToolCall
(
id
=
random
_tool_call_id
(),
id
=
make
_tool_call_id
(),
type
=
"function"
,
function
=
FunctionCall
(
name
=
raw_function_call
[
"name"
],
...
...
vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
ast
import
json
import
uuid
from
collections.abc
import
Sequence
...
...
@@ -22,7 +22,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
logger
=
init_logger
(
__name__
)
@
ToolParserManager
.
register_module
(
[
"qwen3_coder"
]
)
@
ToolParserManager
.
register_module
(
"qwen3_coder"
)
class
Qwen3CoderToolParser
(
ToolParser
):
def
__init__
(
self
,
tokenizer
:
AnyTokenizer
):
...
...
@@ -30,6 +30,8 @@ class Qwen3CoderToolParser(ToolParser):
self
.
current_tool_name_sent
:
bool
=
False
self
.
prev_tool_call_arr
:
list
[
dict
]
=
[]
# Override base class type - we use string IDs for tool calls
self
.
current_tool_id
:
Optional
[
str
]
=
None
# type: ignore
self
.
streamed_args_for_tool
:
list
[
str
]
=
[]
# Sentinel tokens for streaming mode
...
...
@@ -42,20 +44,6 @@ class Qwen3CoderToolParser(ToolParser):
self
.
is_tool_call_started
:
bool
=
False
self
.
failed_count
:
int
=
0
# Streaming state variables
self
.
current_tool_index
:
int
=
0
self
.
header_sent
:
bool
=
False
self
.
current_tool_string_id
:
Optional
[
str
]
=
None
self
.
current_function_name
:
Optional
[
str
]
=
None
self
.
current_param_name
:
Optional
[
str
]
=
None
self
.
current_param_value
:
str
=
""
self
.
param_count
:
int
=
0
self
.
in_param
:
bool
=
False
self
.
in_function
:
bool
=
False
self
.
accumulated_text
:
str
=
""
self
.
json_started
:
bool
=
False
self
.
json_closed
:
bool
=
False
# Enhanced streaming state - reset for each new message
self
.
_reset_streaming_state
()
...
...
@@ -67,7 +55,8 @@ class Qwen3CoderToolParser(ToolParser):
self
.
tool_call_function_regex
=
re
.
compile
(
r
"<function=(.*?)</function>|<function=(.*)$"
,
re
.
DOTALL
)
self
.
tool_call_parameter_regex
=
re
.
compile
(
r
"<parameter=(.*?)</parameter>|<parameter=(.*?)$"
,
re
.
DOTALL
)
r
"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)"
,
re
.
DOTALL
)
if
not
self
.
model_tokenizer
:
raise
ValueError
(
...
...
@@ -84,8 +73,8 @@ class Qwen3CoderToolParser(ToolParser):
"Qwen3 XML Tool parser could not locate tool call start/end "
"tokens in the tokenizer!"
)
logger
.
debug
(
"vLLM Successfully import tool parser %s !"
,
self
.
__class__
.
__name__
)
logger
.
info
(
"vLLM Successfully import tool parser %s !"
,
self
.
__class__
.
__name__
)
def
_generate_tool_call_id
(
self
)
->
str
:
"""Generate a unique tool call ID."""
...
...
@@ -96,7 +85,7 @@ class Qwen3CoderToolParser(ToolParser):
self
.
current_tool_index
=
0
self
.
is_tool_call_started
=
False
self
.
header_sent
=
False
self
.
current_tool_
string_
id
=
None
self
.
current_tool_id
=
None
self
.
current_function_name
=
None
self
.
current_param_name
=
None
self
.
current_param_value
=
""
...
...
@@ -106,122 +95,122 @@ class Qwen3CoderToolParser(ToolParser):
self
.
accumulated_text
=
""
self
.
json_started
=
False
self
.
json_closed
=
False
def
_parse_xml_function_call
(
self
,
function_call_str
:
str
,
tools
:
Optional
[
list
[
ChatCompletionToolsParam
]]
)
->
Optional
[
ToolCall
]:
def
get_arguments_config
(
func_name
:
str
)
->
dict
:
if
tools
is
None
:
return
{}
for
config
in
tools
:
if
not
hasattr
(
config
,
"type"
)
or
not
(
hasattr
(
config
,
"function"
)
and
hasattr
(
config
.
function
,
"name"
)):
continue
if
(
config
.
type
==
"function"
and
config
.
function
.
name
==
func_name
):
if
not
hasattr
(
config
.
function
,
"parameters"
):
return
{}
params
=
config
.
function
.
parameters
if
isinstance
(
params
,
dict
)
and
"properties"
in
params
:
return
params
[
"properties"
]
elif
isinstance
(
params
,
dict
):
return
params
else
:
return
{}
logger
.
warning
(
"Tool '%s' is not defined in the tools list."
,
func_name
)
# Store accumulated parameters for type conversion
self
.
accumulated_params
=
{}
self
.
streaming_request
=
None
def
_get_arguments_config
(
self
,
func_name
:
str
,
tools
:
Optional
[
list
[
ChatCompletionToolsParam
]])
->
dict
:
"""Extract argument configuration for a function."""
if
tools
is
None
:
return
{}
for
config
in
tools
:
if
not
hasattr
(
config
,
"type"
)
or
not
(
hasattr
(
config
,
"function"
)
and
hasattr
(
config
.
function
,
"name"
)):
continue
if
config
.
type
==
"function"
and
config
.
function
.
name
==
func_name
:
if
not
hasattr
(
config
.
function
,
"parameters"
):
return
{}
params
=
config
.
function
.
parameters
if
isinstance
(
params
,
dict
)
and
"properties"
in
params
:
return
params
[
"properties"
]
elif
isinstance
(
params
,
dict
):
return
params
else
:
return
{}
logger
.
warning
(
"Tool '%s' is not defined in the tools list."
,
func_name
)
return
{}
def
_convert_param_value
(
self
,
param_value
:
str
,
param_name
:
str
,
param_config
:
dict
,
func_name
:
str
)
->
Any
:
"""Convert parameter value based on its type in the schema."""
# Handle null value for any type
if
param_value
.
lower
()
==
"null"
:
return
None
def
convert_param_value
(
param_value
:
str
,
param_name
:
str
,
param_config
:
dict
,
func_name
:
str
)
->
Any
:
# Handle null value for any type
if
param_value
.
lower
()
==
"null"
:
return
None
converted_value
:
Any
if
param_name
not
in
param_config
:
if
param_config
!=
{}:
logger
.
warning
(
"Parsed parameter '%s' is not defined in the tool "
"parameters for tool '%s', directly returning the "
"string value."
,
param_name
,
func_name
)
return
param_value
if
(
isinstance
(
param_config
[
param_name
],
dict
)
and
"type"
in
param_config
[
param_name
]):
param_type
=
str
(
param_config
[
param_name
][
"type"
]).
strip
().
lower
()
else
:
param_type
=
"string"
if
param_type
in
[
"string"
,
"str"
,
"text"
,
"varchar"
,
"char"
,
"enum"
]:
if
param_name
not
in
param_config
:
if
param_config
!=
{}:
logger
.
warning
(
"Parsed parameter '%s' is not defined in the tool "
"parameters for tool '%s', directly returning the "
"string value."
,
param_name
,
func_name
)
return
param_value
if
isinstance
(
param_config
[
param_name
],
dict
)
and
"type"
in
param_config
[
param_name
]:
param_type
=
str
(
param_config
[
param_name
][
"type"
]).
strip
().
lower
()
else
:
param_type
=
"string"
if
param_type
in
[
"string"
,
"str"
,
"text"
,
"varchar"
,
"char"
,
"enum"
]:
return
param_value
elif
param_type
.
startswith
(
"int"
)
or
param_type
.
startswith
(
"uint"
)
or
param_type
.
startswith
(
"long"
)
or
param_type
.
startswith
(
"short"
)
or
param_type
.
startswith
(
"unsigned"
):
try
:
return
int
(
param_value
)
except
(
ValueError
,
TypeError
):
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not an "
"integer in tool '%s', degenerating to string."
,
param_value
,
param_name
,
func_name
)
return
param_value
elif
(
param_type
.
startswith
(
"int"
)
or
param_type
.
startswith
(
"uint"
)
or
param_type
.
startswith
(
"long"
)
or
param_type
.
startswith
(
"short"
)
or
param_type
.
startswith
(
"unsigned"
)):
try
:
converted_value
=
int
(
param_value
)
return
converted_value
except
ValueError
:
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not an "
"integer in tool '%s', degenerating to string."
,
param_value
,
param_name
,
func_name
)
elif
param_type
.
startswith
(
"num"
)
or
param_type
.
startswith
(
"float"
):
try
:
float_param_value
=
float
(
param_value
)
return
float_param_value
if
float_param_value
-
int
(
float_param_value
)
!=
0
else
int
(
float_param_value
)
except
(
ValueError
,
TypeError
):
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not a float "
"in tool '%s', degenerating to string."
,
param_value
,
param_name
,
func_name
)
return
param_value
elif
(
param_type
.
startswith
(
"num"
)
or
param_type
.
startswith
(
"float"
)):
elif
param_type
in
[
"boolean"
,
"bool"
,
"binary"
]:
param_value
=
param_value
.
lower
()
if
param_value
not
in
[
"true"
,
"false"
]:
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not a boolean "
"(`true` or `false`) in tool '%s', degenerating to "
"false."
,
param_value
,
param_name
,
func_name
)
return
param_value
==
"true"
else
:
if
param_type
in
[
"object"
,
"array"
,
"arr"
]
or
param_type
.
startswith
(
"dict"
)
or
param_type
.
startswith
(
"list"
):
try
:
float_param_value
=
float
(
param_value
)
converted_value
=
(
float_param_value
if
float_param_value
-
int
(
float_param_value
)
!=
0
else
int
(
float_param_value
))
return
converted_value
except
ValueError
:
param_value
=
json
.
loads
(
param_value
)
return
param_value
except
(
json
.
JSONDecodeError
,
TypeError
,
ValueError
):
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not a float "
"in tool '%s', degenerating to string."
,
param_value
,
param_name
,
func_name
)
return
param_value
elif
param_type
in
[
"boolean"
,
"bool"
,
"binary"
]:
param_value
=
param_value
.
lower
()
if
param_value
not
in
[
"true"
,
"false"
]:
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not a "
"boolean (`true` of `false`) in tool '%s', "
"degenerating to false."
,
param_value
,
param_name
,
"Parsed value '%s' of parameter '%s' cannot be "
"parsed with json.loads in tool '%s', will try "
"other methods to parse it."
,
param_value
,
param_name
,
func_name
)
return
param_value
==
"true"
else
:
if
param_type
==
"object"
or
param_type
.
startswith
(
"dict"
):
try
:
converted_value
=
json
.
loads
(
param_value
)
return
converted_value
except
json
.
JSONDecodeError
:
logger
.
warning
(
"Parsed value '%s' of parameter '%s' is not a "
"valid JSON object in tool '%s', will try other "
"methods to parse it."
,
param_value
,
param_name
,
func_name
)
try
:
param_value
=
ast
.
literal_eval
(
param_value
)
# safer
except
(
ValueError
,
SyntaxError
,
TypeError
):
logger
.
warning
(
"Parameter '%s' has unknown type '%s'. "
"The value will be treated as a string."
,
param_name
,
param_type
)
return
param_value
"Parsed value '%s' of parameter '%s' cannot be "
"converted via Python `ast.literal_eval()` in tool "
"'%s', degenerating to string."
,
param_value
,
param_name
,
func_name
)
return
param_value
def
_parse_xml_function_call
(
self
,
function_call_str
:
str
,
tools
:
Optional
[
list
[
ChatCompletionToolsParam
]]
)
->
Optional
[
ToolCall
]:
# Extract function name
end_index
=
function_call_str
.
index
(
">"
)
function_name
=
function_call_str
[:
end_index
]
param_config
=
get_arguments_config
(
function_name
)
param_config
=
self
.
_
get_arguments_config
(
function_name
,
tools
)
parameters
=
function_call_str
[
end_index
+
1
:]
param_dict
=
{}
for
match
in
self
.
tool_call_parameter_regex
.
findall
(
parameters
):
match_text
=
match
[
0
]
if
match
[
0
]
else
match
[
1
]
for
match_text
in
self
.
tool_call_parameter_regex
.
findall
(
parameters
):
idx
=
match_text
.
index
(
">"
)
param_name
=
match_text
[:
idx
]
param_value
=
str
(
match_text
[
idx
+
1
:])
...
...
@@ -231,7 +220,7 @@ class Qwen3CoderToolParser(ToolParser):
if
param_value
.
endswith
(
"
\n
"
):
param_value
=
param_value
[:
-
1
]
param_dict
[
param_name
]
=
convert_param_value
(
param_dict
[
param_name
]
=
self
.
_
convert_param_value
(
param_value
,
param_name
,
param_config
,
function_name
)
return
ToolCall
(
type
=
"function"
,
...
...
@@ -284,8 +273,7 @@ class Qwen3CoderToolParser(ToolParser):
for
function_call_str
in
function_calls
]
# Populate prev_tool_call_arr for serving layer to set
# finish_reason
# Populate prev_tool_call_arr for serving layer to set finish_reason
self
.
prev_tool_call_arr
.
clear
()
# Clear previous calls
for
tool_call
in
tool_calls
:
if
tool_call
:
...
...
@@ -298,8 +286,8 @@ class Qwen3CoderToolParser(ToolParser):
# Extract content before tool calls
content_index
=
model_output
.
find
(
self
.
tool_call_start_token
)
content_index
=
(
content_index
if
content_index
>=
0
else
model_output
.
find
(
self
.
tool_call_prefix
))
idx
=
model_output
.
find
(
self
.
tool_call_prefix
)
content_index
=
content_index
if
content_index
>=
0
else
idx
content
=
model_output
[:
content_index
]
# .rstrip()
return
ExtractedToolCallInformation
(
...
...
@@ -324,13 +312,16 @@ class Qwen3CoderToolParser(ToolParser):
delta_token_ids
:
Sequence
[
int
],
request
:
ChatCompletionRequest
,
)
->
Union
[
DeltaMessage
,
None
]:
# If no delta text, return None unless it's an EOS token after tool
# calls
# Store request for type conversion
if
not
previous_text
:
self
.
_reset_streaming_state
()
self
.
streaming_request
=
request
# If no delta text, return None unless it's an EOS token after tools
if
not
delta_text
:
# Check if this is an EOS token after all tool calls are complete
# We check for tool calls in the text even if is_tool_call_started
# is False because it might have been reset after processing all
# tools
# Check for tool calls in text even if is_tool_call_started
# is False (might have been reset after processing all tools)
if
(
delta_token_ids
and
self
.
tool_call_end_token_id
not
in
delta_token_ids
):
# Count complete tool calls
...
...
@@ -339,24 +330,19 @@ class Qwen3CoderToolParser(ToolParser):
# If we have completed tool calls and populated
# prev_tool_call_arr
if
(
complete_calls
>
0
and
len
(
self
.
prev_tool_call_arr
)
>
0
)
:
if
complete_calls
>
0
and
len
(
self
.
prev_tool_call_arr
)
>
0
:
# Check if all tool calls are closed
open_calls
=
(
current_text
.
count
(
self
.
tool_call_start_token
)
-
current_text
.
count
(
self
.
tool_call_end_token
)
)
open_calls
=
current_text
.
count
(
self
.
tool_call_start_token
)
-
current_text
.
count
(
self
.
tool_call_end_token
)
if
open_calls
==
0
:
# Return empty delta message to allow finish_reason
# processing
# Return empty delta for finish_reason processing
return
DeltaMessage
(
content
=
""
)
elif
not
self
.
is_tool_call_started
and
current_text
:
# This is a regular content response that's now complete
return
DeltaMessage
(
content
=
""
)
return
None
# Check if this is the first call (reset state if needed)
if
not
previous_text
:
self
.
_reset_streaming_state
()
# Update accumulated text
self
.
accumulated_text
=
current_text
...
...
@@ -371,11 +357,11 @@ class Qwen3CoderToolParser(ToolParser):
self
.
param_count
=
0
self
.
json_started
=
False
self
.
json_closed
=
False
self
.
accumulated_params
=
{}
# Check if there are more tool calls
tool_starts_count
=
current_text
.
count
(
self
.
tool_call_start_token
)
if
self
.
current_tool_index
>=
tool_starts_count
:
tool_starts
=
current_text
.
count
(
self
.
tool_call_start_token
)
if
self
.
current_tool_index
>=
tool_starts
:
# No more tool calls
self
.
is_tool_call_started
=
False
# Continue processing next tool
...
...
@@ -412,20 +398,20 @@ class Qwen3CoderToolParser(ToolParser):
# We're in a tool call, find the current tool call portion
# Need to find the correct tool call based on current_tool_index
tool_starts
:
list
[
int
]
=
[]
tool_start
_position
s
:
list
[
int
]
=
[]
idx
=
0
while
True
:
idx
=
current_text
.
find
(
self
.
tool_call_start_token
,
idx
)
if
idx
==
-
1
:
break
tool_starts
.
append
(
idx
)
tool_start
_position
s
.
append
(
idx
)
idx
+=
len
(
self
.
tool_call_start_token
)
if
self
.
current_tool_index
>=
len
(
tool_starts
):
if
self
.
current_tool_index
>=
len
(
tool_start
_position
s
):
# No more tool calls to process yet
return
None
tool_start_idx
=
tool_starts
[
self
.
current_tool_index
]
tool_start_idx
=
tool_start
_position
s
[
self
.
current_tool_index
]
# Find where this tool call ends (or current position if not ended yet)
tool_end_idx
=
current_text
.
find
(
self
.
tool_call_end_token
,
tool_start_idx
)
...
...
@@ -438,19 +424,19 @@ class Qwen3CoderToolParser(ToolParser):
# Looking for function header
if
not
self
.
header_sent
:
if
self
.
tool_call_prefix
in
tool_text
:
func_start
=
(
tool_text
.
find
(
self
.
tool_call_prefix
)
+
len
(
self
.
tool_call_prefix
)
)
func_start
=
tool_text
.
find
(
self
.
tool_call_prefix
)
+
len
(
self
.
tool_call_prefix
)
func_end
=
tool_text
.
find
(
">"
,
func_start
)
if
func_end
!=
-
1
:
# Found complete function name
self
.
current_function_name
=
tool_text
[
func_start
:
func_end
]
self
.
current_tool_
string_
id
=
self
.
_generate_tool_call_id
()
self
.
current_tool_id
=
self
.
_generate_tool_call_id
()
self
.
header_sent
=
True
self
.
in_function
=
True
# IMPORTANT: Add to prev_tool_call_arr immediately when
we
# detect a tool call. This ensures
# IMPORTANT: Add to prev_tool_call_arr immediately when
#
we
detect a tool call. This ensures
# finish_reason="tool_calls" even if parsing isn't complete
already_added
=
any
(
tool
.
get
(
"name"
)
==
self
.
current_function_name
...
...
@@ -466,7 +452,7 @@ class Qwen3CoderToolParser(ToolParser):
return
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_index
,
id
=
self
.
current_tool_
string_
id
,
id
=
self
.
current_tool_id
,
function
=
DeltaFunctionCall
(
name
=
self
.
current_function_name
,
arguments
=
""
),
type
=
"function"
,
...
...
@@ -496,10 +482,11 @@ class Qwen3CoderToolParser(ToolParser):
# Close JSON
self
.
json_closed
=
True
# Extract the complete tool call to update prev_tool_call_arr
# with final arguments. Find the function content
func_start
=
(
tool_text
.
find
(
self
.
tool_call_prefix
)
+
len
(
self
.
tool_call_prefix
))
# Extract complete tool call to update
# prev_tool_call_arr with final arguments
# Find the function content
func_start
=
tool_text
.
find
(
self
.
tool_call_prefix
)
+
len
(
self
.
tool_call_prefix
)
func_content_end
=
tool_text
.
find
(
self
.
function_end_token
,
func_start
)
if
func_content_end
!=
-
1
:
...
...
@@ -507,15 +494,17 @@ class Qwen3CoderToolParser(ToolParser):
# Parse to get the complete arguments
try
:
parsed_tool
=
self
.
_parse_xml_function_call
(
func_content
,
request
.
tools
if
request
else
None
)
func_content
,
self
.
streaming_request
.
tools
if
self
.
streaming_request
else
None
)
if
parsed_tool
:
# Update existing entry in
prev_tool_call_arr with
# complete arg
ument
s
# Update existing entry in
#
prev_tool_call_arr with
complete args
for
i
,
tool
in
enumerate
(
self
.
prev_tool_call_arr
):
if
(
tool
.
get
(
"name"
)
==
parsed_tool
.
function
.
name
):
self
.
prev_tool_call_arr
[
i
][
"arguments"
]
=
(
parsed_tool
.
function
.
arguments
)
if
tool
.
get
(
"name"
)
==
parsed_tool
.
function
.
name
:
args
=
parsed_tool
.
function
.
arguments
self
.
prev_tool_call_arr
[
i
][
"arguments"
]
=
args
break
except
Exception
:
pass
# Ignore parsing errors during streaming
...
...
@@ -530,73 +519,110 @@ class Qwen3CoderToolParser(ToolParser):
# Reset state for next tool
self
.
in_function
=
False
self
.
json_closed
=
True
self
.
accumulated_params
=
{}
return
result
# Look for parameters
# Count how many complete parameters we have processed
complete_params
=
tool_text
.
count
(
self
.
parameter_end_token
)
# Find all parameter starts
param_starts
=
[]
idx
=
0
while
True
:
idx
=
tool_text
.
find
(
self
.
parameter_prefix
,
idx
)
if
idx
==
-
1
:
break
param_starts
.
append
(
idx
)
idx
+=
len
(
self
.
parameter_prefix
)
# Check if we should start a new parameter
if
not
self
.
in_param
and
self
.
param_count
<
complete_params
:
# Find the unprocessed parameter
# Count parameter starts
param_starts
=
[]
idx
=
0
while
True
:
idx
=
tool_text
.
find
(
self
.
parameter_prefix
,
idx
)
if
idx
==
-
1
:
break
param_starts
.
append
(
idx
)
idx
+=
len
(
self
.
parameter_prefix
)
if
len
(
param_starts
)
>
self
.
param_count
:
# Process the next parameter
param_idx
=
param_starts
[
self
.
param_count
]
param_start
=
param_idx
+
len
(
self
.
parameter_prefix
)
remaining
=
tool_text
[
param_start
:]
if
">"
in
remaining
:
# We have the complete parameter name
name_end
=
remaining
.
find
(
">"
)
self
.
current_param_name
=
remaining
[:
name_end
]
# Find the parameter value
value_start
=
param_start
+
name_end
+
1
value_text
=
tool_text
[
value_start
:]
if
value_text
.
startswith
(
"
\n
"
):
value_text
=
value_text
[
1
:]
# Find where this parameter ends
param_end_idx
=
value_text
.
find
(
self
.
parameter_end_token
)
if
param_end_idx
!=
-
1
:
# Complete parameter found
param_value
=
value_text
[:
param_end_idx
]
if
param_value
.
endswith
(
"
\n
"
):
param_value
=
param_value
[:
-
1
]
# Build complete JSON fragment for this parameter
if
self
.
param_count
==
0
:
json_fragment
=
(
'"'
+
self
.
current_param_name
+
'": "'
+
json
.
dumps
(
param_value
)[
1
:
-
1
]
+
'"'
)
if
(
not
self
.
in_param
and
self
.
param_count
<
len
(
param_starts
)
and
len
(
param_starts
)
>
self
.
param_count
):
# Process the next parameter
param_idx
=
param_starts
[
self
.
param_count
]
param_start
=
param_idx
+
len
(
self
.
parameter_prefix
)
remaining
=
tool_text
[
param_start
:]
if
">"
in
remaining
:
# We have the complete parameter name
name_end
=
remaining
.
find
(
">"
)
self
.
current_param_name
=
remaining
[:
name_end
]
# Find the parameter value
value_start
=
param_start
+
name_end
+
1
value_text
=
tool_text
[
value_start
:]
if
value_text
.
startswith
(
"
\n
"
):
value_text
=
value_text
[
1
:]
# Find where this parameter ends
param_end_idx
=
value_text
.
find
(
self
.
parameter_end_token
)
if
param_end_idx
==
-
1
:
# No closing tag, look for next parameter or
# function end
next_param_idx
=
value_text
.
find
(
self
.
parameter_prefix
)
func_end_idx
=
value_text
.
find
(
self
.
function_end_token
)
if
next_param_idx
!=
-
1
and
(
func_end_idx
==
-
1
or
next_param_idx
<
func_end_idx
):
param_end_idx
=
next_param_idx
elif
func_end_idx
!=
-
1
:
param_end_idx
=
func_end_idx
else
:
# Neither found, check if tool call is complete
if
self
.
tool_call_end_token
in
tool_text
:
# Tool call is complete, so parameter
# must be complete too. Use all
# remaining text before function end
param_end_idx
=
len
(
value_text
)
else
:
json_fragment
=
(
', "'
+
self
.
current_param_name
+
'": "'
+
json
.
dumps
(
param_value
)[
1
:
-
1
]
+
'"'
)
self
.
param_count
+=
1
return
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_index
,
function
=
DeltaFunctionCall
(
arguments
=
json_fragment
),
)
])
# Continue parameter value
# Still streaming, wait for more content
return
None
if
param_end_idx
!=
-
1
:
# Complete parameter found
param_value
=
value_text
[:
param_end_idx
]
if
param_value
.
endswith
(
"
\n
"
):
param_value
=
param_value
[:
-
1
]
# Store raw value for later processing
self
.
accumulated_params
[
self
.
current_param_name
]
=
param_value
# Get parameter configuration for type conversion
param_config
=
self
.
_get_arguments_config
(
self
.
current_function_name
or
""
,
self
.
streaming_request
.
tools
if
self
.
streaming_request
else
None
)
# Convert param value to appropriate type
converted_value
=
self
.
_convert_param_value
(
param_value
,
self
.
current_param_name
,
param_config
,
self
.
current_function_name
or
""
)
# Build JSON fragment based on the converted type
# Use json.dumps to properly serialize the value
serialized_value
=
json
.
dumps
(
converted_value
,
ensure_ascii
=
False
)
if
self
.
param_count
==
0
:
json_fragment
=
(
f
'"
{
self
.
current_param_name
}
": '
f
'
{
serialized_value
}
'
)
else
:
json_fragment
=
(
f
', "
{
self
.
current_param_name
}
": '
f
'
{
serialized_value
}
'
)
self
.
param_count
+=
1
return
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_index
,
function
=
DeltaFunctionCall
(
arguments
=
json_fragment
),
)
])
# Continue parameter value - Not used in the current implementation
# since we process complete parameters above
if
self
.
in_param
:
if
self
.
parameter_end_token
in
delta_text
:
# End of parameter
...
...
@@ -608,25 +634,42 @@ class Qwen3CoderToolParser(ToolParser):
gt_idx
=
value_chunk
.
find
(
">"
)
value_chunk
=
value_chunk
[
gt_idx
+
1
:]
if
(
not
self
.
current_param_value
and
value_chunk
.
startswith
(
"
\n
"
)
)
:
if
not
self
.
current_param_value
and
value_chunk
.
startswith
(
"
\n
"
):
value_chunk
=
value_chunk
[
1
:]
#
Calculate incremental JSON
#
Store complete value
full_value
=
self
.
current_param_value
+
value_chunk
prev_escaped
=
(
json
.
dumps
(
self
.
current_param_value
)[
1
:
-
1
]
if
self
.
current_param_value
else
""
)
full_escaped
=
json
.
dumps
(
full_value
)[
1
:
-
1
]
delta_escaped
=
full_escaped
[
len
(
prev_escaped
):]
self
.
accumulated_params
[
self
.
current_param_name
]
=
full_value
# Get parameter configuration for type conversion
param_config
=
self
.
_get_arguments_config
(
self
.
current_function_name
or
""
,
self
.
streaming_request
.
tools
if
self
.
streaming_request
else
None
)
# Convert the parameter value to the appropriate type
converted_value
=
self
.
_convert_param_value
(
full_value
,
self
.
current_param_name
or
""
,
param_config
,
self
.
current_function_name
or
""
)
# Serialize the converted value
serialized_value
=
json
.
dumps
(
converted_value
,
ensure_ascii
=
False
)
# Since we've been streaming the quoted version,
# we need to close it properly
# This is complex - for now just complete the value
self
.
in_param
=
False
self
.
current_param_value
=
""
# Just close the current parameter string
return
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
self
.
current_tool_index
,
function
=
DeltaFunctionCall
(
arguments
=
delta_escaped
+
'"'
),
arguments
=
'"'
),
# Close the string quote
)
])
else
:
...
...
@@ -638,18 +681,18 @@ class Qwen3CoderToolParser(ToolParser):
gt_idx
=
value_chunk
.
find
(
">"
)
value_chunk
=
value_chunk
[
gt_idx
+
1
:]
if
(
not
self
.
current_param_value
and
value_chunk
.
startswith
(
"
\n
"
)
)
:
if
not
self
.
current_param_value
and
value_chunk
.
startswith
(
"
\n
"
):
value_chunk
=
value_chunk
[
1
:]
if
value_chunk
:
# Stream the escaped delta
prev_escaped
=
(
json
.
dumps
(
self
.
current_param_value
)[
1
:
-
1
]
if
self
.
current_param_value
else
""
)
prev_escaped
=
json
.
dumps
(
self
.
current_param_value
,
ensure_ascii
=
False
)[
1
:
-
1
]
if
self
.
current_param_value
else
""
self
.
current_param_value
+=
value_chunk
full_escaped
=
json
.
dumps
(
self
.
current_param_v
al
u
e
)[
1
:
-
1
]
full_escaped
=
json
.
dumps
(
self
.
current_param_value
,
ensure_ascii
=
F
al
s
e
)[
1
:
-
1
]
delta_escaped
=
full_escaped
[
len
(
prev_escaped
):]
if
delta_escaped
:
...
...
@@ -661,4 +704,4 @@ class Qwen3CoderToolParser(ToolParser):
)
])
return
None
return
None
\ No newline at end of file
vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from qwen3coder xml parser, All rights reserved.
# ruff: noqa: E501
import
ast
import
json
import
uuid
from
collections.abc
import
Sequence
from
typing
import
Any
,
Optional
,
Union
import
regex
as
re
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
ChatCompletionToolsParam
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
ExtractedToolCallInformation
,
FunctionCall
,
ToolCall
)
from
vllm.entrypoints.openai.tool_parsers.abstract_tool_parser
import
(
ToolParser
,
ToolParserManager
)
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
logger
=
init_logger
(
__name__
)
@
ToolParserManager
.
register_module
(
"seed_oss"
)
class
SeedOssToolParser
(
ToolParser
):
TOOL_CALL_START
=
"<seed:tool_call>"
TOOL_CALL_END
=
"</seed:tool_call>"
def __init__(self, tokenizer: AnyTokenizer):
    """Set up tag constants, token ids, regexes and streaming state.

    Args:
        tokenizer: Tokenizer forwarded to the ``ToolParser`` initializer.

    Raises:
        RuntimeError: If ``self.vocab`` has no entry for the opening or
            the closing tool-call tag.
    """
    super().__init__(tokenizer)

    # --- streaming state ---
    self._reset_streaming_state()
    self.prev_tool_call_arr: list[dict] = []

    # Outer tags that wrap one complete tool call.
    self.tool_call_start_token: str = self.TOOL_CALL_START
    self.tool_call_end_token: str = self.TOOL_CALL_END

    # Sentinel tags searched for inside a tool call while streaming.
    self.tool_call_prefix: str = "<function="
    self.function_end_token: str = "</function>"
    self.parameter_prefix: str = "<parameter="
    self.parameter_end_token: str = "</parameter>"

    # Tags delimiting the thinking section of the output.
    self.think_start_token: str = "<seed:think>"
    self.think_end_token: str = "</seed:think>"

    self.is_tool_call_started: bool = False
    self.is_thinking_end: bool = False
    self.failed_count: int = 0
    self._reset_streaming_state()

    # Token ids are looked up by tag text; a missing tag yields None.
    self.tool_call_start_token_id = self.vocab.get(
        self.tool_call_start_token)
    self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
    self.think_end_token_id = self.vocab.get(self.think_end_token)

    start_missing = self.tool_call_start_token_id is None
    end_missing = self.tool_call_end_token_id is None
    if start_missing or end_missing:
        raise RuntimeError(
            "Seed_Oss XML parser: tokenizer did not include "
            "<seed:tool_call> or its closing tag.")

    open_tag = re.escape(self.tool_call_start_token)
    close_tag = re.escape(self.tool_call_end_token)
    closed_call = rf"{open_tag}(.*?){close_tag}"
    unterminated_call = rf"{open_tag}(.*?)$"

    # Matches only tool calls that have both tags.
    self.tool_call_complete_regex = re.compile(closed_call, re.DOTALL)
    # Also matches a trailing tool call whose closing tag is missing.
    self.tool_call_regex = re.compile(
        f"{closed_call}|{unterminated_call}", re.DOTALL)
    self.tool_call_function_regex = re.compile(
        r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
    self.tool_call_parameter_regex = re.compile(
        r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)

    logger.info("vLLM Seed-Oss XML tool parser loaded (%s).",
                type(self).__name__)
def
_generate_tool_call_id
(
self
)
->
str
:
"""Generate a unique tool call ID."""
return
f
"call_
{
uuid
.
uuid4
().
hex
[:
24
]
}
"
def
_reset_streaming_state
(
self
):
"""Reset all streaming state."""
self
.
current_tool_index
=
0
self
.
is_tool_call_started
=
False
self
.
header_sent
=
False
self
.
current_tool_id
=
-
1
self
.
current_function_name
=
None
self
.
current_param_name
=
None
self
.
current_param_value
=
""
self
.
param_count
=
0
self
.
in_param
=
False
self
.
in_function
=
False
self
.
accumulated_text
=
""
self
.
json_started
=
False
self
.
json_closed
=
False
def _parse_xml_function_call(
        self, function_call_str: str,
        tools: Optional[list[ChatCompletionToolsParam]]
) -> Optional[ToolCall]:
    """Parse the text that follows ``<function=`` into a ``ToolCall``.

    ``function_call_str`` has the form ``name>...`` where the remainder
    holds zero or more ``<parameter=key>value</parameter>`` entries. Each
    value is converted according to the type declared for that parameter
    in ``tools`` and the result is serialized as a JSON object string.

    Args:
        function_call_str: Function call text without the leading
            ``<function=`` and without the trailing ``</function>``.
        tools: Tool definitions used to look up declared parameter types;
            ``None`` disables type conversion (values stay strings).

    Returns:
        The parsed ``ToolCall``, or ``None`` when the function header is
        malformed (no ``>`` terminating the function name).
    """

    def get_arguments_config(func_name: str) -> dict:
        """Return the parameter schema declared for ``func_name``."""
        if tools is None:
            return {}
        for config in tools:
            # Skip entries that do not look like function tool definitions.
            if not hasattr(config, "type") or not (
                    hasattr(config, "function")
                    and hasattr(config.function, "name")):
                continue
            if (config.type == "function"
                    and config.function.name == func_name):
                if not hasattr(config.function, "parameters"):
                    return {}
                params = config.function.parameters
                if isinstance(params, dict) and "properties" in params:
                    return params["properties"]
                elif isinstance(params, dict):
                    return params
                else:
                    return {}
        logger.warning("Tool '%s' is not defined in the tools list.",
                       func_name)
        return {}

    def convert_param_value(param_value: str, param_name: str,
                            param_config: dict, func_name: str) -> Any:
        """Convert a raw string value to the declared parameter type.

        Falls back to the original string whenever conversion fails.
        """
        # Handle null value for any type
        if param_value.lower() == "null":
            return None

        if param_name not in param_config:
            if param_config != {}:
                logger.warning(
                    "Parsed parameter '%s' is not defined in "
                    "the tool parameters for tool '%s', "
                    "directly returning the string value.", param_name,
                    func_name)
            return param_value

        if (isinstance(param_config[param_name], dict)
                and "type" in param_config[param_name]):
            param_type = str(
                param_config[param_name]["type"]).strip().lower()
        else:
            param_type = "string"

        if param_type in [
                "string", "str", "text", "varchar", "char", "enum"
        ]:
            return param_value
        elif (param_type.startswith("int") or param_type.startswith("uint")
              or param_type.startswith("long")
              or param_type.startswith("short")
              or param_type.startswith("unsigned")):
            try:
                param_value = int(param_value)  # type: ignore
            except (ValueError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not an integer in tool "
                    "'%s', degenerating to string.", param_value,
                    param_name, func_name)
            return param_value
        elif param_type.startswith("num") or param_type.startswith(
                "float"):
            try:
                float_param_value = float(param_value)
                # Whole numbers are emitted as int so they serialize
                # without a trailing ".0".
                param_value = float_param_value if float_param_value - int(
                    float_param_value) != 0 else int(
                        float_param_value)  # type: ignore
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int() of an infinite float ("inf",
                # "1e999"); previously this escaped and aborted parsing.
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a float in tool "
                    "'%s', degenerating to string.", param_value,
                    param_name, func_name)
            return param_value
        elif param_type in ["boolean", "bool", "binary"]:
            param_value = param_value.lower()
            if param_value not in ["true", "false"]:
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a boolean "
                    "(`true` or `false`) in tool '%s', degenerating to false.",
                    param_value, param_name, func_name)
            return param_value == "true"
        else:
            if param_type == "object" or param_type.startswith("dict"):
                try:
                    param_value = json.loads(param_value)
                    return param_value
                except (ValueError, TypeError, json.JSONDecodeError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not a valid JSON "
                        "object in tool '%s', will try other methods to parse it.",
                        param_value, param_name, func_name)
            try:
                param_value = ast.literal_eval(param_value)
            except (ValueError, SyntaxError, TypeError, MemoryError,
                    RecursionError):
                # literal_eval documents all of these for malformed input.
                logger.warning(
                    "Parsed value '%s' of parameter '%s' cannot be converted via "
                    "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
                    param_value, param_name, func_name)
            return param_value

    # Extract function name; without a closing '>' there is no usable
    # header, so report "no tool call" as the Optional return type allows.
    end_index = function_call_str.find(">")
    if end_index == -1:
        logger.warning(
            "Malformed function call without '>' after the function name.")
        return None
    function_name = function_call_str[:end_index]
    param_config = get_arguments_config(function_name)
    parameters = function_call_str[end_index + 1:]
    param_dict = {}
    for match in self.tool_call_parameter_regex.findall(parameters):
        # Group 0: terminated parameter; group 1: unterminated trailing one.
        match_text = match[0] if match[0] else match[1]
        idx = match_text.find(">")
        if idx == -1:
            # Parameter name never closed; skip it instead of failing
            # the whole call.
            logger.warning(
                "Skipping malformed parameter without '>' in tool '%s'.",
                function_name)
            continue
        param_name = match_text[:idx]
        param_value = str(match_text[idx + 1:])
        # Remove prefix and trailing \n
        if param_value.startswith("\n"):
            param_value = param_value[1:]
        if param_value.endswith("\n"):
            param_value = param_value[:-1]

        param_dict[param_name] = convert_param_value(
            param_value, param_name, param_config, function_name)
    return ToolCall(
        type="function",
        function=FunctionCall(name=function_name,
                              arguments=json.dumps(param_dict,
                                                   ensure_ascii=False)),
    )
def
_get_function_calls
(
self
,
model_output
:
str
)
->
list
[
str
]:
# Find all tool calls
matched_ranges
=
self
.
tool_call_regex
.
findall
(
model_output
)
raw_tool_calls
=
[
match
[
0
]
if
match
[
0
]
else
match
[
1
]
for
match
in
matched_ranges
]
# Back-off strategy if no tool_call tags found
if
len
(
raw_tool_calls
)
==
0
:
raw_tool_calls
=
[
model_output
]
raw_function_calls
=
[]
for
tool_call
in
raw_tool_calls
:
raw_function_calls
.
extend
(
self
.
tool_call_function_regex
.
findall
(
tool_call
))
function_calls
=
[
match
[
0
]
if
match
[
0
]
else
match
[
1
]
for
match
in
raw_function_calls
]
return
function_calls
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract all tool calls from a complete (non-streaming) output.

    Only text after the thinking section (when both think tags are
    present) is searched for tool calls. Text before the first tool call,
    including the thinking section, is returned as ``content``.

    Args:
        model_output: Full model output text.
        request: Request whose ``tools`` drive parameter type conversion.

    Returns:
        Extraction result. On any failure, or when no valid tool call is
        found, ``tools_called`` is False and ``content`` is the unmodified
        ``model_output``.
    """
    # Quick check to avoid unnecessary processing
    if self.tool_call_prefix not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    # Check if both think start and end tokens are present
    if (self.think_start_token in model_output
            and self.think_end_token in model_output):
        # Find the position of think end token
        think_end_index = model_output.find(self.think_end_token) + len(
            self.think_end_token)
        # Extract content after think end token
        result_content = model_output[think_end_index:]
        thinking_content = model_output[:think_end_index]
    else:
        thinking_content = ""
        result_content = model_output

    try:
        function_calls = self._get_function_calls(result_content)
        if len(function_calls) == 0:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        # _parse_xml_function_call is declared Optional; keep only real
        # results so that None never reaches the response object and
        # tools_called reflects the calls actually returned.
        parsed_calls = (self._parse_xml_function_call(
            function_call_str, request.tools)
                        for function_call_str in function_calls)
        tool_calls = [
            tool_call for tool_call in parsed_calls
            if tool_call is not None
        ]
        if not tool_calls:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        # Populate prev_tool_call_arr for serving layer to set finish_reason
        self.prev_tool_call_arr.clear()  # Clear previous calls
        for tool_call in tool_calls:
            self.prev_tool_call_arr.append({
                "name": tool_call.function.name,
                "arguments": tool_call.function.arguments,
            })

        # Extract content before tool calls; fall back to the bare
        # "<function=" prefix when the outer tool_call tag is absent.
        tool_call_start_index = result_content.find(
            self.tool_call_start_token)
        tool_call_start_index = (
            tool_call_start_index if tool_call_start_index >= 0 else
            result_content.find(self.tool_call_prefix))
        content = thinking_content + result_content[:tool_call_start_index]

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=content if content else None,
        )

    except Exception:
        # Deliberate best effort: return the raw output rather than fail
        # the request when the tool-call markup cannot be parsed.
        logger.exception("Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)
def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn one streamed chunk into a content or tool-call delta.

    The method is a state machine driven by the attributes initialised in
    ``_reset_streaming_state`` plus ``is_thinking_end``. Per call it emits
    at most one of: plain content, a tool-call header (id and function
    name), the opening ``{``, one complete ``"name": "value"`` argument
    fragment, or the closing ``}``.

    Args:
        previous_text: Text accumulated before this chunk; an empty value
            triggers a state reset.
        current_text: Text accumulated including this chunk.
        delta_text: Text of this chunk only.
        previous_token_ids: Not read by this method.
        current_token_ids: Not read by this method.
        delta_token_ids: Token ids of this chunk; checked for the
            tool-call start/end and think-end token ids.
        request: Request whose ``tools`` are passed to
            ``_parse_xml_function_call`` when a function body completes.

    Returns:
        A ``DeltaMessage`` to forward, or ``None`` when this chunk
        produces no output.
    """
    # If no delta text, return None unless
    # it's an EOS token after tool calls
    if not delta_text:
        # Check if this is an EOS token after all tool calls are complete
        # We check for tool calls in the text even if is_tool_call_started
        # is False because it might have been reset after processing all tools
        if (delta_token_ids
                and self.tool_call_end_token_id not in delta_token_ids):
            # Count complete tool calls
            complete_calls = len(
                self.tool_call_complete_regex.findall(current_text))

            # If we have completed tool calls and populated prev_tool_call_arr
            if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                # Check if all tool calls are closed
                open_calls = current_text.count(
                    self.tool_call_start_token) - current_text.count(
                        self.tool_call_end_token)
                if open_calls == 0:
                    # Return empty delta message to allow finish_reason processing
                    return DeltaMessage(content="")
            elif not self.is_tool_call_started and current_text:
                # This is a regular content response that's now complete
                return DeltaMessage(content="")
        return None

    # Check if this is the first call (reset state if needed)
    # NOTE(review): is_thinking_end is not touched by
    # _reset_streaming_state, so it survives this reset.
    if not previous_text:
        self._reset_streaming_state()

    # Update accumulated text
    self.accumulated_text = current_text

    # Check if we need to advance to next tool
    if self.json_closed and not self.in_function:
        # Check if this tool call has ended
        tool_ends = current_text.count(self.tool_call_end_token)
        if tool_ends > self.current_tool_index:
            # This tool has ended, advance to next
            self.current_tool_index += 1
            self.header_sent = False
            self.param_count = 0
            self.json_started = False
            self.json_closed = False

            # Check if there are more tool calls
            if self.current_tool_index >= current_text.count(
                    self.tool_call_start_token):
                # No more tool calls
                self.is_tool_call_started = False
            # Continue processing next tool
            return None

    # Check if end thinking (detected by token id or by tag text)
    if (not self.is_thinking_end
            and (self.think_end_token_id in delta_token_ids
                 or self.think_end_token in delta_text)):
        self.is_thinking_end = True

    # If thinking hasn't ended yet, don't process any tool calls;
    # the chunk is forwarded unchanged as content.
    if not self.is_thinking_end:
        return DeltaMessage(content=delta_text)

    # Handle normal content before tool calls
    if not self.is_tool_call_started:
        # Check if tool call is starting
        if (self.tool_call_start_token_id in delta_token_ids
                or self.tool_call_start_token in delta_text):
            self.is_tool_call_started = True
            # Return any content before the tool call
            if self.tool_call_start_token in delta_text:
                content_before = delta_text[:delta_text.index(
                    self.tool_call_start_token)]
                if content_before:
                    return DeltaMessage(content=content_before)
            return None
        else:
            # Check if we're between tool calls - skip whitespace
            if (current_text.rstrip().endswith(self.tool_call_end_token)
                    and delta_text.strip() == ""):
                # We just ended a tool call, skip whitespace
                return None
            # Normal content, no tool call
            return DeltaMessage(content=delta_text)

    # Check if we're between tool calls (waiting for next one)
    # Count tool calls we've seen vs processed
    tool_starts_count = current_text.count(self.tool_call_start_token)
    if self.current_tool_index >= tool_starts_count:
        # We're past all tool calls, shouldn't be here
        return None

    # We're in a tool call, find the current tool call portion
    # Need to find the correct tool call based on current_tool_index
    # Only process tool calls after think_end_token
    # (the search starts at offset 0 when the think-end tag is absent)
    think_end_index = current_text.find(self.think_end_token) + len(
        self.think_end_token
    ) if self.think_end_token in current_text else 0
    # Offsets of every tool-call start tag after the thinking section.
    tool_starts: list[int] = []
    idx = think_end_index
    while True:
        idx = current_text.find(self.tool_call_start_token, idx)
        if idx == -1:
            break
        tool_starts.append(idx)
        idx += len(self.tool_call_start_token)

    if self.current_tool_index >= len(tool_starts):
        # No more tool calls to process yet
        return None

    tool_start_idx = tool_starts[self.current_tool_index]
    # Find where this tool call ends (or current position if not ended yet)
    tool_end_idx = current_text.find(self.tool_call_end_token,
                                     tool_start_idx)
    if tool_end_idx == -1:
        tool_text = current_text[tool_start_idx:]
    else:
        tool_text = current_text[tool_start_idx:tool_end_idx +
                                 len(self.tool_call_end_token)]

    # Looking for function header
    if not self.header_sent:
        if self.tool_call_prefix in tool_text:
            func_start = tool_text.find(self.tool_call_prefix) + len(
                self.tool_call_prefix)
            func_end = tool_text.find(">", func_start)

            if func_end != -1:
                # Found complete function name
                self.current_function_name = tool_text[func_start:func_end]
                self.current_tool_id = self._generate_tool_call_id(
                )  # type: ignore
                self.header_sent = True
                self.in_function = True

                # IMPORTANT: Add to prev_tool_call_arr immediately when we detect a tool call
                # This ensures finish_reason="tool_calls" even if parsing isn't complete
                # NOTE(review): the lookup is by function name, so a second
                # call to an already-seen function adds no new entry.
                already_added = any(
                    tool.get("name") == self.current_function_name
                    for tool in self.prev_tool_call_arr)
                if not already_added:
                    self.prev_tool_call_arr.append({
                        "name": self.current_function_name,
                        "arguments":
                        "{}",  # Placeholder, will be updated later
                    })

                # Send header with function info
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        id=self.current_tool_id,
                        function=DeltaFunctionCall(
                            name=self.current_function_name, arguments=""),
                        type="function",
                    )
                ])
        # Function name not complete yet; wait for more text.
        return None

    # We've sent header, now handle function body
    if self.in_function:
        # Send opening brace if not sent yet
        if (not self.json_started
                and self.parameter_prefix not in delta_text):
            self.json_started = True
            return DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_index,
                    function=DeltaFunctionCall(arguments="{"),
                )
            ])

        # Make sure json_started is set if we're processing parameters
        # NOTE(review): on this path the flag is set without emitting "{"
        # (taken when the first body chunk contains the parameter prefix);
        # confirm the streamed arguments still form valid JSON.
        if not self.json_started:
            self.json_started = True

        # Check for function end in accumulated text
        if not self.json_closed and self.function_end_token in tool_text:
            # Close JSON
            self.json_closed = True

            # Extract the complete tool call to update prev_tool_call_arr with final arguments
            # Find the function content
            func_start = tool_text.find(self.tool_call_prefix) + len(
                self.tool_call_prefix)
            func_content_end = tool_text.find(self.function_end_token,
                                              func_start)
            if func_content_end != -1:
                func_content = tool_text[func_start:func_content_end]
                # Parse to get the complete arguments
                try:
                    parsed_tool = self._parse_xml_function_call(
                        func_content, request.tools if request else None)
                    if parsed_tool:
                        # Update existing entry in prev_tool_call_arr with complete arguments
                        # (first entry whose name matches)
                        for i, tool in enumerate(self.prev_tool_call_arr):
                            if tool.get(
                                    "name") == parsed_tool.function.name:
                                self.prev_tool_call_arr[i]["arguments"] = (
                                    parsed_tool.function.arguments)
                                break
                except Exception:
                    # Best effort: the closing brace is still emitted below.
                    logger.warning(
                        "Failed to parse tool arguments during streaming.",
                        exc_info=True)

            result = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_index,
                    function=DeltaFunctionCall(arguments="}"),
                )
            ])

            # Reset state for next tool
            self.in_function = False
            self.json_closed = True

            return result

        # Look for parameters
        # Count how many complete parameters we have processed
        complete_params = tool_text.count(self.parameter_end_token)

        # Check if we should start a new parameter
        if not self.in_param and self.param_count < complete_params:
            # Find the unprocessed parameter
            # Count parameter starts
            param_starts = []
            idx = 0
            while True:
                idx = tool_text.find(self.parameter_prefix, idx)
                if idx == -1:
                    break
                param_starts.append(idx)
                idx += len(self.parameter_prefix)

            if len(param_starts) > self.param_count:
                # Process the next parameter
                param_idx = param_starts[self.param_count]
                param_start = param_idx + len(self.parameter_prefix)
                remaining = tool_text[param_start:]

                if ">" in remaining:
                    # We have the complete parameter name
                    name_end = remaining.find(">")
                    self.current_param_name = remaining[:name_end]

                    # Find the parameter value
                    value_start = param_start + name_end + 1
                    value_text = tool_text[value_start:]
                    if value_text.startswith("\n"):
                        value_text = value_text[1:]

                    # Find where this parameter ends
                    param_end_idx = value_text.find(
                        self.parameter_end_token)
                    if param_end_idx != -1:
                        # Complete parameter found
                        param_value = value_text[:param_end_idx]
                        if param_value.endswith("\n"):
                            param_value = param_value[:-1]

                        # Build complete JSON fragment for this parameter.
                        # The value is always emitted as a JSON string:
                        # json.dumps escapes it and [1:-1] drops the quotes
                        # that are re-added by hand. Fragments after the
                        # first are prefixed with ", ".
                        if self.param_count == 0:
                            json_fragment = (
                                '"' + self.current_param_name + '": "' +
                                json.dumps(param_value)[1:-1] + '"')
                        else:
                            json_fragment = (
                                ', "' + self.current_param_name + '": "' +
                                json.dumps(param_value)[1:-1] + '"')

                        self.param_count += 1

                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_index,
                                function=DeltaFunctionCall(
                                    arguments=json_fragment),
                            )
                        ])

        # Continue parameter value
        # NOTE(review): in this method and in _reset_streaming_state,
        # in_param is only ever assigned False, so this branch looks
        # unreachable unless it is set elsewhere - confirm.
        if self.in_param:
            if self.parameter_end_token in delta_text:
                # End of parameter
                end_idx = delta_text.find(self.parameter_end_token)
                value_chunk = delta_text[:end_idx]

                # Skip past > if at start
                if not self.current_param_value and ">" in value_chunk:
                    gt_idx = value_chunk.find(">")
                    value_chunk = value_chunk[gt_idx + 1:]

                if not self.current_param_value and value_chunk.startswith(
                        "\n"):
                    value_chunk = value_chunk[1:]

                # Calculate incremental JSON: escape the old and the new
                # value and emit only the newly added escaped suffix.
                full_value = self.current_param_value + value_chunk
                prev_escaped = (json.dumps(self.current_param_value)[1:-1]
                                if self.current_param_value else "")
                full_escaped = json.dumps(full_value)[1:-1]
                delta_escaped = full_escaped[len(prev_escaped):]

                self.in_param = False
                self.current_param_value = ""

                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(
                            arguments=delta_escaped + '"'),
                    )
                ])
            else:
                # Continue accumulating value
                value_chunk = delta_text

                # Handle first chunk after param name
                if not self.current_param_value and ">" in value_chunk:
                    gt_idx = value_chunk.find(">")
                    value_chunk = value_chunk[gt_idx + 1:]

                if not self.current_param_value and value_chunk.startswith(
                        "\n"):
                    value_chunk = value_chunk[1:]

                if value_chunk:
                    # Stream the escaped delta
                    prev_escaped = (json.dumps(
                        self.current_param_value)[1:-1]
                                    if self.current_param_value else "")
                    self.current_param_value += value_chunk
                    full_escaped = json.dumps(
                        self.current_param_value)[1:-1]
                    delta_escaped = full_escaped[len(prev_escaped):]

                    if delta_escaped:
                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_index,
                                function=DeltaFunctionCall(
                                    arguments=delta_escaped),
                            )
                        ])

    # Nothing to emit for this chunk.
    return None
vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
View file @
a99300bd
...
...
@@ -7,7 +7,7 @@ from typing import Any, Optional, Union
import
regex
as
re
from
vllm.entrypoints.chat_utils
import
random
_tool_call_id
from
vllm.entrypoints.chat_utils
import
make
_tool_call_id
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaFunctionCall
,
DeltaMessage
,
DeltaToolCall
,
...
...
@@ -186,11 +186,31 @@ class xLAMToolParser(ToolParser):
"""
Extract tool calls for streaming mode.
"""
# Simplify detection: if it begins with "[" treat it as a function call
is_function_call
=
(
current_text
.
strip
().
startswith
(
"["
))
# If not a function call, return normal content
if
not
is_function_call
:
# First, check for a definitive start of a tool call block.
# This prevents premature parsing of incomplete output.
stripped_text
=
current_text
.
strip
()
preprocessed_content
,
preprocessed_tool_calls
=
(
self
.
preprocess_model_output
(
current_text
))
# For JSON code blocks, we need to detect them earlier, even if incomplete
has_potential_json_block
=
(
"```json"
in
current_text
or
"```
\n
["
in
current_text
or
"[TOOL_CALLS]"
in
current_text
or
"<tool_call>"
in
current_text
)
is_tool_call_block
=
(
stripped_text
.
startswith
(
"["
)
or
stripped_text
.
startswith
(
"<tool_call>"
)
or
stripped_text
.
startswith
(
"[TOOL_CALLS]"
)
or
# Check if we have thinking tags with JSON-like content following
(
"</think>["
in
current_text
)
or
# Check if the text contains a JSON array after preprocessing
preprocessed_tool_calls
is
not
None
or
# For JSON code blocks, detect early if we see enough structure
(
has_potential_json_block
and
'"name"'
in
current_text
and
'"arguments"'
in
current_text
))
if
not
is_tool_call_block
:
return
DeltaMessage
(
content
=
delta_text
)
try
:
...
...
@@ -204,7 +224,10 @@ class xLAMToolParser(ToolParser):
# Try parsing as JSON to check for complete tool calls
try
:
parsed_tools
=
json
.
loads
(
current_text
)
# Use preprocessed tool calls if available
tool_calls_text
=
(
preprocessed_tool_calls
if
preprocessed_tool_calls
else
current_text
)
parsed_tools
=
json
.
loads
(
tool_calls_text
)
if
isinstance
(
parsed_tools
,
list
):
# Update our tool array for next time
self
.
prev_tool_call_arr
=
parsed_tools
...
...
@@ -226,7 +249,7 @@ class xLAMToolParser(ToolParser):
function_name
=
name_match
.
group
(
1
)
# The test expects us to send just the name first
tool_id
=
random
_tool_call_id
()
tool_id
=
make
_tool_call_id
()
delta
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
0
,
...
...
@@ -257,13 +280,40 @@ class xLAMToolParser(ToolParser):
return
delta
# Use regex to identify tool calls in the output
# Use preprocessed tool calls text for better parsing, but also try to extract from incomplete JSON blocks
search_text
=
(
preprocessed_tool_calls
if
preprocessed_tool_calls
else
current_text
)
# For JSON code blocks that aren't complete yet, try to extract the JSON content
if
not
preprocessed_tool_calls
and
has_potential_json_block
:
# Try to extract the JSON array from within the code block
json_match
=
re
.
search
(
r
"```(?:json)?\s*([\s\S]*?)(?:```|$)"
,
current_text
)
if
json_match
:
potential_json
=
json_match
.
group
(
1
).
strip
()
# Use this as search text even if it's incomplete
if
potential_json
.
startswith
(
"["
)
and
(
'"name"'
in
potential_json
and
'"arguments"'
in
potential_json
):
search_text
=
potential_json
# Try to find complete tool names first
name_pattern
=
r
'"name"\s*:\s*"([^"]+)"'
name_matches
=
list
(
re
.
finditer
(
name_pattern
,
current
_text
))
name_matches
=
list
(
re
.
finditer
(
name_pattern
,
search
_text
))
tool_count
=
len
(
name_matches
)
# If no
tools found yet, return
# If no
complete tool names found, check for partial tool names
if
tool_count
==
0
:
return
None
# Check if we're in the middle of parsing a tool name
partial_name_pattern
=
r
'"name"\s*:\s*"([^"]*)'
partial_matches
=
list
(
re
.
finditer
(
partial_name_pattern
,
search_text
))
if
partial_matches
:
# We have a partial tool name - not ready to emit yet
return
None
else
:
# No tools found at all
return
None
# Ensure our state arrays are large enough
while
len
(
self
.
streaming_state
[
"sent_tools"
])
<
tool_count
:
...
...
@@ -332,7 +382,7 @@ class xLAMToolParser(ToolParser):
# First, check for the empty arguments case: "arguments": {}
empty_args_pattern
=
(
r
'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}'
)
empty_args_match
=
re
.
search
(
empty_args_pattern
,
current
_text
)
empty_args_match
=
re
.
search
(
empty_args_pattern
,
search
_text
)
# Check if this tool has empty arguments
if
empty_args_match
and
empty_args_match
.
start
()
>
0
:
...
...
@@ -376,7 +426,7 @@ class xLAMToolParser(ToolParser):
# Extract arguments for current tool using regex for non-empty arguments
args_pattern
=
r
'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
args_matches
=
list
(
re
.
finditer
(
args_pattern
,
current
_text
))
args_matches
=
list
(
re
.
finditer
(
args_pattern
,
search
_text
))
if
current_idx
<
len
(
args_matches
):
args_text
=
args_matches
[
current_idx
].
group
(
1
)
...
...
@@ -384,17 +434,25 @@ class xLAMToolParser(ToolParser):
# Handle transition between tools
is_last_tool
=
current_idx
==
tool_count
-
1
# Find where the arguments for our current tool end
if
not
is_last_tool
:
# If we have more tools after this one, try to find the complete argument block
next_tool_pos
=
current_text
.
find
(
"},{"
,
args_matches
[
current_idx
].
start
())
if
next_tool_pos
!=
-
1
:
args_end_pos
=
(
next_tool_pos
+
1
)
# +1 to include the '}'
args_text
=
(
current_text
[
args_matches
[
current_idx
]
.
start
():
args_end_pos
].
split
(
'"arguments":'
)[
1
].
strip
())
# For multiple tools, extract only the arguments for the current tool
if
tool_count
>
1
:
# Parse the entire JSON structure to properly extract arguments for each tool
try
:
parsed_tools
=
json
.
loads
(
search_text
)
if
isinstance
(
parsed_tools
,
list
)
and
current_idx
<
len
(
parsed_tools
):
current_tool
=
parsed_tools
[
current_idx
]
if
isinstance
(
current_tool
.
get
(
"arguments"
),
dict
):
args_text
=
json
.
dumps
(
current_tool
[
"arguments"
])
else
:
args_text
=
str
(
current_tool
.
get
(
"arguments"
,
"{}"
))
except
(
json
.
JSONDecodeError
,
KeyError
,
IndexError
):
# Fallback to regex-based extraction
pass
# If arguments haven't been sent yet
sent_args
=
self
.
streaming_state
[
"sent_tools"
][
...
...
@@ -419,7 +477,7 @@ class xLAMToolParser(ToolParser):
index
=
current_idx
,
function
=
DeltaFunctionCall
(
arguments
=
"{"
).
model_dump
(
exclude_none
=
True
),
# type: ignore
exclude_none
=
True
),
# type: ignore
)
])
return
delta
...
...
vllm/entrypoints/utils.py
View file @
a99300bd
...
...
@@ -313,12 +313,14 @@ def log_non_default_args(args: Union[argparse.Namespace, EngineArgs]):
# Handle EngineArgs instance
elif
isinstance
(
args
,
EngineArgs
):
default_args
=
EngineArgs
()
# Create default instance
default_args
=
EngineArgs
(
model
=
args
.
model
)
# Create default instance
for
field
in
dataclasses
.
fields
(
args
):
current_val
=
getattr
(
args
,
field
.
name
)
default_val
=
getattr
(
default_args
,
field
.
name
)
if
current_val
!=
default_val
:
non_default_args
[
field
.
name
]
=
current_val
if
default_args
.
model
!=
EngineArgs
.
model
:
non_default_args
[
"model"
]
=
default_args
.
model
else
:
raise
TypeError
(
"Unsupported argument type. "
\
"Must be argparse.Namespace or EngineArgs instance."
)
...
...
Prev
1
…
19
20
21
22
23
24
25
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment