Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1385 additions
and
560 deletions
+1385
-560
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+104
-34
vllm/entrypoints/openai/serving_classification.py
vllm/entrypoints/openai/serving_classification.py
+12
-9
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+39
-29
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_embedding.py
+28
-42
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+127
-273
vllm/entrypoints/openai/serving_pooling.py
vllm/entrypoints/openai/serving_pooling.py
+18
-15
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+522
-113
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_score.py
+2
-2
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+12
-10
vllm/entrypoints/openai/speech_to_text.py
vllm/entrypoints/openai/speech_to_text.py
+6
-1
vllm/entrypoints/openai/tool_parsers/__init__.py
vllm/entrypoints/openai/tool_parsers/__init__.py
+2
-0
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
.../entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+4
-4
vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
...points/openai/tool_parsers/llama4_pythonic_tool_parser.py
+1
-1
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+1
-1
vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
+73
-0
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+1
-1
vllm/entrypoints/renderer.py
vllm/entrypoints/renderer.py
+395
-0
vllm/entrypoints/tool.py
vllm/entrypoints/tool.py
+24
-0
vllm/entrypoints/tool_server.py
vllm/entrypoints/tool_server.py
+14
-7
vllm/env_override.py
vllm/env_override.py
+0
-18
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
vllm/entrypoints/openai/serving_chat.py
View file @
38d80967
...
...
@@ -43,10 +43,10 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.sequence
import
Logprob
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
MistralTokenizer
from
vllm.transformers_utils.tokenizers
import
(
maybe_serialize_tool_calls
,
truncate_tool_call_ids
,
...
...
@@ -186,7 +186,7 @@ class OpenAIServingChat(OpenAIServing):
lora_request
=
self
.
_maybe_get_adapters
(
request
,
supports_default_mm_loras
=
True
)
model_name
=
self
.
_get_
model_name
(
request
.
model
,
lora_request
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
lora_request
)
...
...
@@ -489,6 +489,8 @@ class OpenAIServingChat(OpenAIServing):
get_streamable_parser_for_assistant
()
for
_
in
range
(
num_choices
)
]
harmony_tools_streamed
=
[
False
]
*
num_choices
tools_streamed
=
[
False
]
*
num_choices
if
isinstance
(
request
.
tool_choice
,
ChatCompletionNamedToolChoiceParam
):
tool_choice_function_name
=
request
.
tool_choice
.
function
.
name
...
...
@@ -662,13 +664,11 @@ class OpenAIServingChat(OpenAIServing):
if
self
.
use_harmony
:
harmony_parser
=
harmony_parsers
[
i
]
prev_recipient
=
harmony_parser
.
current_recipient
for
token_id
in
output
.
token_ids
:
harmony_parser
.
process
(
token_id
)
is_reasoning
=
\
harmony_parser
.
current_channel
==
"analysis"
if
not
request
.
include_reasoning
and
is_reasoning
:
# Skip the reasoning content.
continue
cur_channel
=
harmony_parser
.
current_channel
cur_recipient
=
harmony_parser
.
current_recipient
delta_text
=
harmony_parser
.
last_content_delta
or
""
else
:
delta_text
=
output
.
text
...
...
@@ -681,8 +681,7 @@ class OpenAIServingChat(OpenAIServing):
delta_message
:
Optional
[
DeltaMessage
]
# just update previous_texts and previous_token_ids
if
((
tool_choice_auto
or
self
.
reasoning_parser
)
and
not
self
.
use_harmony
):
if
tool_choice_auto
or
self
.
reasoning_parser
:
assert
previous_texts
is
not
None
assert
all_previous_token_ids
is
not
None
previous_text
=
previous_texts
[
i
]
...
...
@@ -696,11 +695,54 @@ class OpenAIServingChat(OpenAIServing):
current_token_ids
=
as_list
(
output
.
token_ids
)
if
self
.
use_harmony
:
if
is_reasoning
:
delta_message
=
DeltaMessage
(
reasoning_content
=
delta_text
)
else
:
if
cur_channel
==
"final"
:
delta_message
=
DeltaMessage
(
content
=
delta_text
)
elif
cur_channel
==
"analysis"
:
if
request
.
include_reasoning
:
delta_message
=
DeltaMessage
(
reasoning_content
=
delta_text
)
else
:
delta_message
=
None
elif
(
cur_channel
==
"commentary"
and
cur_recipient
and
cur_recipient
.
startswith
(
"functions."
)):
# Count completed tool calls to determine index
base_index
=
0
for
msg
in
harmony_parser
.
messages
:
if
(
msg
.
channel
==
"commentary"
and
msg
.
recipient
and
msg
.
recipient
.
startswith
(
"functions."
)):
base_index
+=
1
if
prev_recipient
!=
cur_recipient
:
tool_name
=
cur_recipient
.
split
(
"functions."
,
1
)[
1
]
delta_message
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
id
=
make_tool_call_id
(),
type
=
"function"
,
function
=
DeltaFunctionCall
(
name
=
tool_name
,
arguments
=
""
,
),
index
=
base_index
,
)
])
elif
delta_text
:
delta_message
=
DeltaMessage
(
tool_calls
=
[
DeltaToolCall
(
index
=
base_index
,
function
=
DeltaFunctionCall
(
arguments
=
delta_text
),
)
])
else
:
delta_message
=
None
if
delta_message
is
not
None
:
harmony_tools_streamed
[
i
]
=
True
else
:
delta_message
=
None
# handle streaming deltas for tools with named tool_choice
elif
tool_choice_function_name
:
if
(
self
.
reasoning_parser
and
not
reasoning_end_arr
[
i
]
...
...
@@ -758,6 +800,7 @@ class OpenAIServingChat(OpenAIServing):
delta_message
=
DeltaMessage
(
tool_calls
=
[
delta_tool_call
,
])
tools_streamed
[
i
]
=
True
elif
request
.
tool_choice
==
"required"
:
assert
previous_texts
is
not
None
...
...
@@ -783,6 +826,7 @@ class OpenAIServingChat(OpenAIServing):
if
(
delta_message
and
delta_message
.
tool_calls
and
delta_message
.
tool_calls
[
0
].
id
is
not
None
):
history_tool_call_cnt
+=
1
tools_streamed
[
i
]
=
True
# update the previous values for the next iteration
previous_texts
[
i
]
=
current_text
...
...
@@ -859,6 +903,8 @@ class OpenAIServingChat(OpenAIServing):
current_token_ids
=
current_token_ids
,
delta_token_ids
=
delta_token_ids
,
request
=
request
))
if
delta_message
and
delta_message
.
tool_calls
:
tools_streamed
[
i
]
=
True
# when only tool calls
elif
tool_choice_auto
:
assert
tool_parser
is
not
None
...
...
@@ -871,6 +917,8 @@ class OpenAIServingChat(OpenAIServing):
current_token_ids
=
current_token_ids
,
delta_token_ids
=
output
.
token_ids
,
request
=
request
))
if
delta_message
and
delta_message
.
tool_calls
:
tools_streamed
[
i
]
=
True
# when only reasoning
elif
self
.
reasoning_parser
:
...
...
@@ -907,7 +955,10 @@ class OpenAIServingChat(OpenAIServing):
# wasn't ready to send a token, then
# get the next token without streaming a chunk
if
delta_message
is
None
:
continue
if
output
.
finish_reason
is
None
:
continue
else
:
delta_message
=
DeltaMessage
()
# Log streaming delta if output logging is enabled
if
self
.
enable_log_outputs
and
self
.
request_logger
:
...
...
@@ -993,12 +1044,18 @@ class OpenAIServingChat(OpenAIServing):
])
# Send the finish response for each request.n only once
if
auto_tools_called
or
tools_streamed
[
i
]
or
(
self
.
use_harmony
and
harmony_tools_streamed
[
i
]):
finish_reason_
=
"tool_calls"
else
:
finish_reason_
=
output
.
finish_reason
\
if
output
.
finish_reason
else
"stop"
choice_data
=
ChatCompletionResponseStreamChoice
(
index
=
i
,
delta
=
delta_message
,
logprobs
=
logprobs
,
finish_reason
=
output
.
finish_reason
if
not
auto_tools_called
else
"tool_calls"
,
finish_reason
=
finish_reason_
,
stop_reason
=
output
.
stop_reason
,
token_ids
=
(
as_list
(
output
.
token_ids
)
if
request
.
return_token_ids
else
None
))
...
...
@@ -1117,6 +1174,7 @@ class OpenAIServingChat(OpenAIServing):
for
output
in
final_res
.
outputs
:
token_ids
=
output
.
token_ids
out_logprobs
=
output
.
logprobs
tool_call_info
=
None
if
request
.
logprobs
and
request
.
top_logprobs
is
not
None
:
assert
out_logprobs
is
not
None
,
"Did not output logprobs"
...
...
@@ -1131,31 +1189,42 @@ class OpenAIServingChat(OpenAIServing):
logprobs
=
None
if
self
.
use_harmony
:
reasoning_content
,
final_content
,
is_tool_call
=
(
parse_chat_output
(
token_ids
))
if
not
request
.
include_reasoning
:
reasoning_content
=
None
if
is_tool_call
:
# TODO(woosuk): Implement tool call for gpt-oss.
# For now, only Responses API supports tool call for
# gpt-oss.
raise
NotImplementedError
(
"Tool call in Chat Completion API is not supported "
"for gpt-oss yet. Please use Responses API instead."
)
if
self
.
tool_parser
is
not
None
:
tool_parser
=
self
.
tool_parser
(
tokenizer
)
# NOTE: We use token_ids for openai tool parser
tool_call_info
=
tool_parser
.
extract_tool_calls
(
""
,
request
=
request
,
token_ids
=
token_ids
,
# type: ignore
)
reasoning_content
,
content
=
None
,
tool_call_info
.
content
if
request
.
include_reasoning
:
reasoning_content
,
content
,
_
=
parse_chat_output
(
token_ids
)
message
=
ChatMessage
(
role
=
role
,
reasoning_content
=
reasoning_content
,
content
=
content
,
tool_calls
=
tool_call_info
.
tool_calls
,
)
else
:
# Normal message
reasoning_content
,
content
,
_
=
parse_chat_output
(
token_ids
)
if
not
request
.
include_reasoning
:
reasoning_content
=
None
message
=
ChatMessage
(
role
=
role
,
reasoning_content
=
reasoning_content
,
content
=
final_
content
,
content
=
content
,
)
choice_data
=
ChatCompletionResponseChoice
(
index
=
output
.
index
,
message
=
message
,
logprobs
=
logprobs
,
finish_reason
=
"tool_calls"
if
is_tool_call
else
finish_reason
=
"tool_calls"
if
(
tool_call_info
is
not
None
and
tool_call_info
.
tools_called
)
else
output
.
finish_reason
if
output
.
finish_reason
else
"stop"
,
stop_reason
=
output
.
stop_reason
,
)
...
...
@@ -1419,9 +1488,10 @@ class OpenAIServingChat(OpenAIServing):
step_top_logprobs
=
top_logprobs
[
i
]
if
step_top_logprobs
is
None
or
step_top_logprobs
.
get
(
token_id
)
is
None
:
token
=
tokenizer
.
decode
(
token_id
)
if
should_return_as_token_id
:
token
=
f
"token_id:
{
token_id
}
"
else
:
token
=
tokenizer
.
decode
(
token_id
)
logprobs_content
.
append
(
ChatCompletionLogProbsContent
(
...
...
@@ -1503,12 +1573,12 @@ class OpenAIServingChat(OpenAIServing):
messages
.
append
(
sys_msg
)
# Add developer message.
dev_msg
=
get_developer_message
()
dev_msg
=
get_developer_message
(
tools
=
request
.
tools
)
messages
.
append
(
dev_msg
)
# Add user message.
for
chat_msg
in
request
.
messages
:
messages
.
app
end
(
parse_chat_input
(
chat_msg
))
messages
.
ext
end
(
parse_chat_input
(
chat_msg
))
# Render prompt token ids.
prompt_token_ids
=
render_for_completion
(
messages
)
...
...
vllm/entrypoints/openai/serving_classification.py
View file @
38d80967
...
...
@@ -20,6 +20,7 @@ from vllm.entrypoints.openai.serving_engine import (ClassificationServeContext,
OpenAIServing
,
ServeContext
)
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.logger
import
init_logger
from
vllm.outputs
import
ClassificationOutput
,
PoolingRequestOutput
from
vllm.pooling_params
import
PoolingParams
...
...
@@ -54,14 +55,10 @@ class ClassificationMixin(OpenAIServing):
ctx
.
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
ctx
.
lora_request
)
(
ctx
.
request_prompts
,
ctx
.
engine_prompts
,
)
=
await
self
.
_preprocess_completion
(
ctx
.
request
,
ctx
.
tokenizer
,
ctx
.
request
.
input
,
)
renderer
=
self
.
_get_renderer
(
ctx
.
tokenizer
)
ctx
.
engine_prompts
=
await
renderer
.
render_prompt
(
prompt_or_prompts
=
ctx
.
request
.
input
,
config
=
self
.
_build_render_config
(
ctx
.
request
))
return
None
...
...
@@ -117,6 +114,12 @@ class ClassificationMixin(OpenAIServing):
usage
=
usage
,
)
def
_build_render_config
(
self
,
request
:
ClassificationRequest
)
->
RenderConfig
:
return
RenderConfig
(
max_length
=
self
.
max_model_len
,
truncate_prompt_tokens
=
request
.
truncate_prompt_tokens
)
class
ServingClassification
(
ClassificationMixin
):
request_id_prefix
=
"classify"
...
...
@@ -143,7 +146,7 @@ class ServingClassification(ClassificationMixin):
request
:
ClassificationRequest
,
raw_request
:
Request
,
)
->
Union
[
ClassificationResponse
,
ErrorResponse
]:
model_name
=
self
.
_get_
model_name
(
request
.
model
)
model_name
=
self
.
models
.
model_name
()
request_id
=
(
f
"
{
self
.
request_id_prefix
}
-"
f
"
{
self
.
_base_request_id
(
raw_request
)
}
"
)
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
38d80967
...
...
@@ -26,21 +26,18 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
PromptTokenUsageInfo
,
RequestResponseMetadata
,
UsageInfo
)
from
vllm.entrypoints.openai.serving_engine
import
(
EmbedsPrompt
as
ServingEngineEmbedsPrompt
)
from
vllm.entrypoints.openai.serving_engine
import
(
OpenAIServing
,
TextTokensPrompt
,
clamp_prompt_logprobs
,
is_text_tokens_prompt
)
clamp_prompt_logprobs
)
# yapf: enable
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.inputs.data
import
(
EmbedsPrompt
,
TokensPrompt
,
is_embeds_prompt
,
is_tokens_prompt
)
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.sequence
import
Logprob
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
as_list
,
merge_async_iterators
...
...
@@ -132,12 +129,12 @@ class OpenAIServingCompletion(OpenAIServing):
else
:
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
lora_request
)
renderer
=
self
.
_get_renderer
(
tokenizer
)
request_prompts
,
engine_prompts
=
await
self
.
_preprocess_completion
(
request
,
tokenizer
,
request
.
prompt
,
add_special_tokens
=
request
.
add_special_tokens
,
engine_prompts
=
await
renderer
.
render_prompt_and_embeds
(
prompt_or_prompts
=
request
.
prompt
,
prompt_embeds
=
request
.
prompt_embeds
,
config
=
self
.
_build_render_config
(
request
),
)
except
ValueError
as
e
:
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
...
...
@@ -198,7 +195,7 @@ class OpenAIServingCompletion(OpenAIServing):
self
.
_log_inputs
(
request_id_item
,
request
_prompt
s
[
i
]
,
engine
_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -235,7 +232,7 @@ class OpenAIServingCompletion(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
model_name
=
self
.
_get_
model_name
(
request
.
model
,
lora_request
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
num_prompts
=
len
(
engine_prompts
)
# Similar to the OpenAI API, when n != best_of, we do not stream the
...
...
@@ -249,7 +246,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
stream
:
return
self
.
completion_stream_generator
(
request
,
request
_prompts
,
engine
_prompts
,
result_generator
,
request_id
,
created_time
,
...
...
@@ -273,11 +270,9 @@ class OpenAIServingCompletion(OpenAIServing):
# We did not pass it into vLLM engine to avoid being redundant
# with the inputs token IDs
if
final_res
.
prompt
is
None
:
request_prompt
=
request_prompts
[
i
]
if
is_text_tokens_prompt
(
request_prompt
):
final_res
.
prompt
=
request_prompt
[
"prompt"
]
else
:
final_res
.
prompt
=
None
engine_prompt
=
engine_prompts
[
i
]
final_res
.
prompt
=
None
if
is_embeds_prompt
(
engine_prompt
)
else
engine_prompt
.
get
(
"prompt"
)
final_res_batch_checked
=
cast
(
list
[
RequestOutput
],
final_res_batch
)
...
...
@@ -313,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
completion_stream_generator
(
self
,
request
:
CompletionRequest
,
request_prompts
:
list
[
Union
[
TextTokensPrompt
,
ServingEngineEmbedsPrompt
]],
engine_prompts
:
list
[
Union
[
TokensPrompt
,
EmbedsPrompt
]],
result_generator
:
AsyncIterator
[
tuple
[
int
,
RequestOutput
]],
request_id
:
str
,
created_time
:
int
,
...
...
@@ -350,14 +344,11 @@ class OpenAIServingCompletion(OpenAIServing):
num_cached_tokens
=
res
.
num_cached_tokens
first_iteration
=
False
if
res
.
prompt
is
not
None
:
prompt_text
=
res
.
prompt
else
:
request_prompt
=
request_prompts
[
prompt_idx
]
if
is_text_tokens_prompt
(
request_prompt
):
prompt_text
=
request_prompt
[
"prompt"
]
else
:
prompt_text
=
None
prompt_text
=
res
.
prompt
if
prompt_text
is
None
:
engine_prompt
=
engine_prompts
[
prompt_idx
]
prompt_text
=
None
if
is_embeds_prompt
(
engine_prompt
)
else
engine_prompt
.
get
(
"prompt"
)
# Prompt details are excluded from later streamed outputs
if
prompt_token_ids
is
not
None
:
...
...
@@ -378,6 +369,8 @@ class OpenAIServingCompletion(OpenAIServing):
assert
request
.
max_tokens
is
not
None
if
request
.
echo
and
not
has_echoed
[
i
]:
assert
prompt_token_ids
is
not
None
if
request
.
return_token_ids
:
prompt_text
=
""
assert
prompt_text
is
not
None
if
request
.
max_tokens
==
0
:
# only return the prompt
...
...
@@ -525,6 +518,8 @@ class OpenAIServingCompletion(OpenAIServing):
for
output
in
final_res
.
outputs
:
assert
request
.
max_tokens
is
not
None
if
request
.
echo
:
if
request
.
return_token_ids
:
prompt_text
=
""
assert
prompt_text
is
not
None
if
request
.
max_tokens
==
0
:
token_ids
=
prompt_token_ids
...
...
@@ -676,3 +671,18 @@ class OpenAIServingCompletion(OpenAIServing):
tokens
=
out_tokens
,
top_logprobs
=
out_top_logprobs
,
)
def
_build_render_config
(
self
,
request
:
CompletionRequest
,
max_input_length
:
Optional
[
int
]
=
None
,
)
->
RenderConfig
:
max_input_tokens_len
=
self
.
max_model_len
-
(
request
.
max_tokens
or
0
)
return
RenderConfig
(
max_length
=
max_input_tokens_len
,
truncate_prompt_tokens
=
request
.
truncate_prompt_tokens
,
add_special_tokens
=
request
.
add_special_tokens
,
cache_salt
=
request
.
cache_salt
,
needs_detokenization
=
bool
(
request
.
echo
and
not
request
.
return_token_ids
),
)
vllm/entrypoints/openai/serving_embedding.py
View file @
38d80967
...
...
@@ -24,12 +24,11 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
ErrorResponse
,
UsageInfo
)
from
vllm.entrypoints.openai.serving_engine
import
(
EmbeddingServeContext
,
OpenAIServing
,
RequestPrompt
,
ServeContext
,
TextTokensPrompt
)
# yapf: enable
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.
inputs.data
import
EmbedsPrompt
as
EngineEmbedsPrompt
from
vllm.
entrypoints.renderer
import
RenderConfig
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.logger
import
init_logger
from
vllm.outputs
import
(
EmbeddingOutput
,
EmbeddingRequestOutput
,
...
...
@@ -79,11 +78,12 @@ class EmbeddingMixin(OpenAIServing):
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
ctx
.
lora_request
)
renderer
=
self
.
_get_renderer
(
tokenizer
)
if
isinstance
(
ctx
.
request
,
EmbeddingChatRequest
):
(
_
,
ctx
.
request_prompts
,
_
,
ctx
.
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
ctx
.
request
,
...
...
@@ -93,25 +93,33 @@ class EmbeddingMixin(OpenAIServing):
or
ctx
.
chat_template
,
chat_template_content_format
=
ctx
.
chat_template_content_format
,
# In embedding requests, we are not generating tokens,
# so there is no need to append extra tokens to the input
add_generation_prompt
=
False
,
add_generation_prompt
=
ctx
.
request
.
add_generation_prompt
,
continue_final_message
=
False
,
add_special_tokens
=
ctx
.
request
.
add_special_tokens
,
)
else
:
(
ctx
.
request_prompts
,
ctx
.
engine_prompts
)
=
await
self
.
_preprocess_completion
(
ctx
.
request
,
tokenizer
,
ctx
.
request
.
input
,
add_special_tokens
=
ctx
.
request
.
add_special_tokens
,
)
ctx
.
engine_prompts
=
await
renderer
.
render_prompt
(
prompt_or_prompts
=
ctx
.
request
.
input
,
config
=
self
.
_build_render_config
(
ctx
.
request
),
)
return
None
except
(
ValueError
,
TypeError
)
as
e
:
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
return
self
.
create_error_response
(
str
(
e
))
def
_build_render_config
(
self
,
request
:
EmbeddingCompletionRequest
)
->
RenderConfig
:
# Set max_length based on chunked processing capability
if
self
.
_should_use_chunked_processing
(
request
):
max_length
=
None
else
:
max_length
=
self
.
max_embed_len
or
self
.
max_model_len
return
RenderConfig
(
max_length
=
max_length
,
truncate_prompt_tokens
=
request
.
truncate_prompt_tokens
,
add_special_tokens
=
request
.
add_special_tokens
)
@
override
def
_build_response
(
self
,
...
...
@@ -287,8 +295,7 @@ class EmbeddingMixin(OpenAIServing):
async
def
_create_single_prompt_generator
(
self
,
ctx
:
EmbeddingServeContext
,
engine_prompt
:
Union
[
EngineTokensPrompt
,
EngineEmbedsPrompt
],
request_prompt
:
RequestPrompt
,
engine_prompt
:
EngineTokensPrompt
,
pooling_params
:
PoolingParams
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]],
prompt_index
:
int
,
...
...
@@ -297,16 +304,10 @@ class EmbeddingMixin(OpenAIServing):
request_id_item
=
f
"
{
ctx
.
request_id
}
-
{
prompt_index
}
"
self
.
_log_inputs
(
request_id_item
,
request
_prompt
,
engine
_prompt
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
)
# Mypy has an existing bug related to inferring the variance
# of TypedDicts with `builtins.enumerate`:
# https://github.com/python/mypy/issues/8586#issuecomment-2867698435
engine_prompt
=
cast
(
Union
[
EngineTokensPrompt
,
EngineEmbedsPrompt
],
engine_prompt
)
# Return the original generator without wrapping
return
self
.
engine_client
.
encode
(
engine_prompt
,
...
...
@@ -355,20 +356,14 @@ class EmbeddingMixin(OpenAIServing):
return
self
.
create_error_response
(
"Engine prompts not available"
)
if
ctx
.
request_prompts
is
None
:
return
self
.
create_error_response
(
"Request prompts not available"
)
max_pos_embeddings
=
self
.
_get_max_position_embeddings
()
for
i
,
engine_prompt
in
enumerate
(
ctx
.
engine_prompts
):
request_prompt
=
ctx
.
request_prompts
[
i
]
# Check if this specific prompt needs chunked processing
if
self
.
_is_text_tokens_prompt
(
request
_prompt
):
if
self
.
_is_text_tokens_prompt
(
engine
_prompt
):
# Cast to TextTokensPrompt since we've verified
# prompt_token_ids
text_tokens_prompt
=
cast
(
TextTokensPrompt
,
request
_prompt
)
text_tokens_prompt
=
cast
(
TextTokensPrompt
,
engine
_prompt
)
if
(
len
(
text_tokens_prompt
[
"prompt_token_ids"
])
>
max_pos_embeddings
):
# Use chunked processing for this prompt
...
...
@@ -379,13 +374,8 @@ class EmbeddingMixin(OpenAIServing):
continue
# Normal processing for short prompts or non-token prompts
# Cast engine_prompt to the expected type for mypy
engine_prompt_typed
=
cast
(
Union
[
EngineTokensPrompt
,
EngineEmbedsPrompt
],
engine_prompt
)
generator
=
await
self
.
_create_single_prompt_generator
(
ctx
,
engine_prompt_typed
,
request_prompt
,
pooling_params
,
trace_headers
,
i
)
ctx
,
engine_prompt
,
pooling_params
,
trace_headers
,
i
)
generators
.
append
(
generator
)
from
vllm.utils
import
merge_async_iterators
...
...
@@ -421,10 +411,6 @@ class EmbeddingMixin(OpenAIServing):
if
not
use_chunked
:
return
await
super
().
_collect_batch
(
ctx
=
ctx
)
if
ctx
.
request_prompts
is
None
:
return
self
.
create_error_response
(
"Request prompts not available"
)
if
ctx
.
result_generator
is
None
:
return
self
.
create_error_response
(
"Result generator not available"
)
...
...
@@ -540,7 +526,7 @@ class EmbeddingMixin(OpenAIServing):
data
=
final_embedding
)
# Get original prompt token IDs for this prompt
original_prompt
=
ctx
.
request
_prompts
[
prompt_idx
]
original_prompt
=
ctx
.
engine
_prompts
[
prompt_idx
]
if
not
self
.
_is_text_tokens_prompt
(
original_prompt
):
return
self
.
create_error_response
(
f
"Chunked prompt
{
prompt_idx
}
is not a "
...
...
@@ -613,7 +599,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
See https://platform.openai.com/docs/api-reference/embeddings/create
for the API specification. This API mimics the OpenAI Embedding API.
"""
model_name
=
self
.
_get_
model_name
(
request
.
model
)
model_name
=
self
.
models
.
model_name
()
request_id
=
(
f
"
{
self
.
request_id_prefix
}
-"
f
"
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
)
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
io
import
json
import
sys
import
time
...
...
@@ -9,10 +7,8 @@ import traceback
from
collections.abc
import
AsyncGenerator
,
Iterable
,
Mapping
,
Sequence
from
concurrent.futures
import
ThreadPoolExecutor
from
http
import
HTTPStatus
from
typing
import
(
Annotated
,
Any
,
Callable
,
ClassVar
,
Generic
,
Optional
,
TypeVar
,
Union
,
cast
,
overload
)
from
typing
import
Any
,
Callable
,
ClassVar
,
Generic
,
Optional
,
TypeVar
,
Union
import
pybase64
import
torch
from
fastapi
import
Request
from
pydantic
import
BaseModel
,
ConfigDict
,
Field
...
...
@@ -62,18 +58,19 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
TranslationRequest
)
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
from
vllm.entrypoints.renderer
import
(
BaseRenderer
,
CompletionRenderer
,
RenderConfig
)
# yapf: enable
from
vllm.inputs.data
import
EmbedsPrompt
as
EngineEmbeds
Prompt
from
vllm.inputs.data
import
Prompt
Type
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.parse
import
parse_and_batch_prompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
(
# noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
MultiModalDataDict
,
MultiModalUUIDDict
)
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.sequence
import
Logprob
,
PromptLogprobs
from
vllm.tracing
import
(
contains_trace_headers
,
extract_trace_headers
,
log_tracing_disabled_warning
)
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
MistralTokenizer
...
...
@@ -82,16 +79,26 @@ from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
logger
=
init_logger
(
__name__
)
CompletionLikeRequest
=
Union
[
CompletionRequest
,
DetokenizeRequest
,
EmbeddingCompletionRequest
,
RerankRequest
,
ClassificationRequest
,
ScoreRequest
,
TokenizeCompletionRequest
]
CompletionLikeRequest
=
Union
[
CompletionRequest
,
DetokenizeRequest
,
EmbeddingCompletionRequest
,
RerankRequest
,
ClassificationRequest
,
ScoreRequest
,
TokenizeCompletionRequest
,
]
ChatLikeRequest
=
Union
[
ChatCompletionRequest
,
EmbeddingChatRequest
,
TokenizeChatRequest
]
SpeechToTextRequest
=
Union
[
TranscriptionRequest
,
TranslationRequest
]
AnyRequest
=
Union
[
CompletionLikeRequest
,
ChatLikeRequest
,
SpeechToTextRequest
,
ResponsesRequest
,
IOProcessorRequest
]
AnyRequest
=
Union
[
CompletionLikeRequest
,
ChatLikeRequest
,
SpeechToTextRequest
,
ResponsesRequest
,
IOProcessorRequest
,
]
AnyResponse
=
Union
[
CompletionResponse
,
...
...
@@ -135,9 +142,9 @@ class RequestProcessingMixin(BaseModel):
Mixin for request processing,
handling prompt preparation and engine input.
"""
request_prompts
:
Optional
[
Sequence
[
RequestPrompt
]]
=
[]
engine_prompts
:
Optional
[
Union
[
list
[
EngineTokensPrompt
],
list
[
EngineEmbedsPrompt
]]]
=
[]
engine_prompts
:
Optional
[
list
[
EngineTokensPrompt
]]
=
[]
model_config
=
ConfigDict
(
arbitrary_types_allowed
=
True
)
...
...
@@ -147,6 +154,7 @@ class ResponseGenerationMixin(BaseModel):
Mixin for response generation,
managing result generators and final batch results.
"""
result_generator
:
Optional
[
AsyncGenerator
[
tuple
[
int
,
Union
[
RequestOutput
,
PoolingRequestOutput
]],
None
]]
=
None
final_res_batch
:
list
[
Union
[
RequestOutput
,
PoolingRequestOutput
]]
=
Field
(
...
...
@@ -155,8 +163,12 @@ class ResponseGenerationMixin(BaseModel):
model_config
=
ConfigDict
(
arbitrary_types_allowed
=
True
)
class
ServeContext
(
RequestProcessingMixin
,
ResponseGenerationMixin
,
BaseModel
,
Generic
[
RequestT
]):
class
ServeContext
(
RequestProcessingMixin
,
ResponseGenerationMixin
,
BaseModel
,
Generic
[
RequestT
],
):
# Shared across all requests
request
:
RequestT
raw_request
:
Optional
[
Request
]
=
None
...
...
@@ -227,6 +239,29 @@ class OpenAIServing:
AsyncMicrobatchTokenizer
]
=
{}
self
.
log_error_stack
=
log_error_stack
def
_get_renderer
(
self
,
tokenizer
:
Optional
[
AnyTokenizer
])
->
BaseRenderer
:
"""
Get a Renderer instance with the provided tokenizer.
Uses shared async tokenizer pool for efficiency.
"""
return
CompletionRenderer
(
model_config
=
self
.
model_config
,
tokenizer
=
tokenizer
,
async_tokenizer_pool
=
self
.
_async_tokenizer_pool
)
def
_build_render_config
(
self
,
request
:
Any
,
)
->
RenderConfig
:
"""
Build and return a `RenderConfig` for an endpoint.
Used by the renderer to control how prompts are prepared
(e.g., tokenization and length handling). Endpoints should
implement this with logic appropriate to their request type.
"""
raise
NotImplementedError
def
_get_async_tokenizer
(
self
,
tokenizer
)
->
AsyncMicrobatchTokenizer
:
"""
Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
...
...
@@ -298,8 +333,8 @@ class OpenAIServing:
truncate_prompt_tokens
=
getattr
(
ctx
.
request
,
"truncate_prompt_tokens"
,
None
)
if
truncate_prompt_tokens
is
not
None
and
\
truncate_prompt_tokens
>
self
.
max_model_len
:
if
(
truncate_prompt_tokens
is
not
None
and
truncate_prompt_tokens
>
self
.
max_model_len
)
:
return
self
.
create_error_response
(
"truncate_prompt_tokens value is "
"greater than max_model_len."
...
...
@@ -340,21 +375,13 @@ class OpenAIServing:
for
i
,
engine_prompt
in
enumerate
(
ctx
.
engine_prompts
):
request_id_item
=
f
"
{
ctx
.
request_id
}
-
{
i
}
"
if
ctx
.
request_prompts
is
None
:
return
self
.
create_error_response
(
"Request prompts not available"
)
self
.
_log_inputs
(
request_id_item
,
ctx
.
request_prompts
[
i
],
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
)
# Mypy has an existing bug related to inferring the variance of
# TypedDicts with `builtins.enumerate`:
# https://github.com/python/mypy/issues/8586#issuecomment-2867698435
engine_prompt
=
cast
(
Union
[
EngineTokensPrompt
,
EngineEmbedsPrompt
],
engine_prompt
)
self
.
_log_inputs
(
request_id_item
,
engine_prompt
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
,
)
generator
=
self
.
engine_client
.
encode
(
engine_prompt
,
pooling_params
,
...
...
@@ -410,10 +437,11 @@ class OpenAIServing:
return
self
.
create_error_response
(
str
(
e
))
def
create_error_response
(
self
,
message
:
str
,
err_type
:
str
=
"BadRequestError"
,
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
)
->
ErrorResponse
:
self
,
message
:
str
,
err_type
:
str
=
"BadRequestError"
,
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
,
)
->
ErrorResponse
:
if
self
.
log_error_stack
:
exc_type
,
_
,
_
=
sys
.
exc_info
()
if
exc_type
is
not
None
:
...
...
@@ -424,10 +452,11 @@ class OpenAIServing:
message
=
message
,
type
=
err_type
,
code
=
status_code
.
value
))
def
create_streaming_error_response
(
self
,
message
:
str
,
err_type
:
str
=
"BadRequestError"
,
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
)
->
str
:
self
,
message
:
str
,
err_type
:
str
=
"BadRequestError"
,
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
,
)
->
str
:
json_str
=
json
.
dumps
(
self
.
create_error_response
(
message
=
message
,
err_type
=
err_type
,
...
...
@@ -438,25 +467,25 @@ class OpenAIServing:
self
,
request
:
AnyRequest
,
)
->
Optional
[
ErrorResponse
]:
error_response
=
None
if
self
.
_is_model_supported
(
request
.
model
):
return
None
if
request
.
model
in
self
.
models
.
lora_requests
:
return
None
if
envs
.
VLLM_ALLOW_RUNTIME_LORA_UPDATING
and
request
.
model
and
(
load_result
:
=
await
self
.
models
.
resolve_lora
(
request
.
model
)):
if
(
envs
.
VLLM_ALLOW_RUNTIME_LORA_UPDATING
and
request
.
model
and
(
load_result
:
=
await
self
.
models
.
resolve_lora
(
request
.
model
))
)
:
if
isinstance
(
load_result
,
LoRARequest
):
return
None
if
isinstance
(
load_result
,
ErrorResponse
)
and
\
load_result
.
error
.
code
==
HTTPStatus
.
BAD_REQUEST
.
value
:
if
(
isinstance
(
load_result
,
ErrorResponse
)
and
load_result
.
error
.
code
==
HTTPStatus
.
BAD_REQUEST
.
value
)
:
error_response
=
load_result
return
error_response
or
self
.
create_error_response
(
message
=
f
"The model `
{
request
.
model
}
` does not exist."
,
err_type
=
"NotFoundError"
,
status_code
=
HTTPStatus
.
NOT_FOUND
)
status_code
=
HTTPStatus
.
NOT_FOUND
,
)
def
_get_active_default_mm_loras
(
self
,
request
:
AnyRequest
)
->
Optional
[
LoRARequest
]:
...
...
@@ -487,7 +516,6 @@ class OpenAIServing:
request
:
AnyRequest
,
supports_default_mm_loras
:
bool
=
False
,
)
->
Optional
[
LoRARequest
]:
if
request
.
model
in
self
.
models
.
lora_requests
:
return
self
.
models
.
lora_requests
[
request
.
model
]
...
...
@@ -548,13 +576,15 @@ class OpenAIServing:
prompt
,
add_special_tokens
=
add_special_tokens
,
truncation
=
True
,
max_length
=
self
.
max_model_len
)
max_length
=
self
.
max_model_len
,
)
else
:
encoded
=
await
async_tokenizer
(
prompt
,
add_special_tokens
=
add_special_tokens
,
truncation
=
True
,
max_length
=
truncate_prompt_tokens
)
max_length
=
truncate_prompt_tokens
,
)
input_ids
=
encoded
.
input_ids
input_text
=
prompt
...
...
@@ -595,16 +625,22 @@ class OpenAIServing:
# Note: EmbeddingRequest, ClassificationRequest,
# and ScoreRequest don't have max_tokens
if
isinstance
(
request
,
(
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
ScoreRequest
,
RerankRequest
,
ClassificationRequest
)):
if
isinstance
(
request
,
(
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
ScoreRequest
,
RerankRequest
,
ClassificationRequest
,
),
):
# Note: input length can be up to the entire model context length
# since these requests don't generate tokens.
if
token_num
>
self
.
max_model_len
:
operations
:
dict
[
type
[
AnyRequest
],
str
]
=
{
ScoreRequest
:
"score"
,
ClassificationRequest
:
"classification"
ClassificationRequest
:
"classification"
,
}
operation
=
operations
.
get
(
type
(
request
),
"embedding generation"
)
...
...
@@ -618,8 +654,11 @@ class OpenAIServing:
# Note: TokenizeRequest and DetokenizeRequest don't have max_tokens
# and do not require model context length validation
if
isinstance
(
request
,
(
TokenizeCompletionRequest
,
TokenizeChatRequest
,
DetokenizeRequest
)):
if
isinstance
(
request
,
(
TokenizeCompletionRequest
,
TokenizeChatRequest
,
DetokenizeRequest
),
):
return
TextTokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
...
...
@@ -639,8 +678,8 @@ class OpenAIServing:
f
"
{
token_num
}
input tokens. Please reduce the length of "
"the input messages."
)
if
max_tokens
is
not
None
and
\
token_num
+
max_tokens
>
self
.
max_model_len
:
if
(
max_tokens
is
not
None
and
token_num
+
max_tokens
>
self
.
max_model_len
)
:
raise
ValueError
(
"'max_tokens' or 'max_completion_tokens' is too large: "
f
"
{
max_tokens
}
. This model's maximum context length is "
...
...
@@ -698,156 +737,6 @@ class OpenAIServing:
tokenizer
=
tokenizer
,
)
async def _tokenize_prompt_input_or_inputs_async(
    self,
    request: AnyRequest,
    tokenizer: Optional[AnyTokenizer],
    input_or_inputs: Optional[Union[str, list[str], list[int],
                                    list[list[int]]]],
    add_special_tokens: bool = True,
) -> tuple[list[TextTokensPrompt], list[EmbedsPrompt]]:
    """
    Normalize every input of a request into text/token prompts.

    Following the `OpenAI API <https://platform.openai.com/docs/api-reference/embeddings/create>`_,
    each input may be a string or an array of tokens, and a single request
    may carry one or several inputs.

    Returns a pair ``(text_prompts, embeds_prompts)``. The second list is
    only populated for a ``CompletionRequest`` carrying ``prompt_embeds``.
    """
    # A negative truncation size is replaced by the model context length.
    truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
                                     None)
    if (truncate_prompt_tokens or 0) < 0:
        truncate_prompt_tokens = self.max_model_len

    embeds_prompts = list[EmbedsPrompt]()
    if (isinstance(request, CompletionRequest)
            and request.prompt_embeds is not None):
        embeds_prompts.extend(
            self._load_prompt_embeds(request.prompt_embeds,
                                     truncate_prompt_tokens))

    # Empty prompts are okay as long as there are prompt embeddings
    if input_or_inputs is None or (embeds_prompts
                                   and input_or_inputs == ""):
        return [], embeds_prompts

    # Although our type checking is based on mypy,
    # VSCode Pyright extension should still work properly
    # "is False" is required for Pyright to perform type narrowing
    # See: https://github.com/microsoft/pyright/issues/7672

    # Build one awaitable per parsed input; they are awaited together below.
    pending = []
    for parsed in parse_and_batch_prompt(input_or_inputs):
        if parsed["is_tokens"] is False:
            assert tokenizer is not None, \
                "Tokenizer is required for text prompts"
            pending.append(
                self._normalize_prompt_text_to_input(
                    request,
                    parsed["content"],
                    tokenizer=tokenizer,
                    add_special_tokens=add_special_tokens))
        else:
            pending.append(
                self._normalize_prompt_tokens_to_input(
                    request, parsed["content"], tokenizer=tokenizer))

    # Wait for all tokenization work to finish, preserving input order.
    text_prompts = list[TextTokensPrompt]()
    text_prompts.extend(await asyncio.gather(*pending))
    return text_prompts, embeds_prompts
@overload
async def _preprocess_completion(
    self,
    request: Union[DetokenizeRequest, EmbeddingCompletionRequest,
                   RerankRequest, ClassificationRequest, ScoreRequest,
                   TokenizeCompletionRequest],
    tokenizer: Optional[AnyTokenizer],
    input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
    add_special_tokens: bool = ...,
) -> tuple[list[TextTokensPrompt], list[EngineTokensPrompt]]:
    ...

@overload
async def _preprocess_completion(
    self,
    request: CompletionRequest,
    tokenizer: Optional[AnyTokenizer],
    input_or_inputs: Optional[Union[str, list[str], list[int],
                                    list[list[int]]]],
    add_special_tokens: bool = ...,
) -> tuple[list[Union[TextTokensPrompt, EmbedsPrompt]],
           list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]]:
    ...

async def _preprocess_completion(
    self,
    request: CompletionLikeRequest,
    tokenizer: Optional[AnyTokenizer],
    input_or_inputs: Optional[Union[str, list[str], list[int],
                                    list[list[int]]]],
    add_special_tokens: bool = True,
) -> tuple[Union[list[TextTokensPrompt],
                 list[Union[TextTokensPrompt, EmbedsPrompt]]],
           Union[list[EngineTokensPrompt],
                 list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]]]:
    """
    Turn the raw inputs of a completion-like request into prompts.

    Returns ``(request_prompts, engine_prompts)``. For anything other than
    a ``CompletionRequest`` only text/token prompts are returned; for a
    ``CompletionRequest`` the embeds prompts come first, followed by the
    text prompts.

    Raises:
        ValueError: if a non-completion request supplies no
            ``input_or_inputs``.
    """
    is_completion = isinstance(request, CompletionRequest)
    if not is_completion and input_or_inputs is None:
        raise ValueError(
            "Prompt embeds with non-completion requests is not"
            " currently supported.")

    text_prompts, embeds_prompts = (
        await self._tokenize_prompt_input_or_inputs_async(
            request,
            tokenizer,
            input_or_inputs,
            add_special_tokens=add_special_tokens,
        ))

    # Requests without a `cache_salt` attribute are treated as unsalted.
    cache_salt = getattr(request, "cache_salt", None)

    engine_text_prompts = []
    for text_prompt in text_prompts:
        engine_prompt = EngineTokensPrompt(
            prompt_token_ids=text_prompt["prompt_token_ids"])
        if cache_salt:
            engine_prompt["cache_salt"] = cache_salt
        engine_text_prompts.append(engine_prompt)

    # This check is equivalent to simply checking if
    # `embeds_prompts` is empty, but it's difficult to propagate
    # overloads to the private helper functions to enable this check.
    # This overload is needed because only TextPrompts are allowed for
    # non-completion requests and if we don't add the overload here,
    # everywhere this function is used outside of serving_completion will
    # need logic asserting that only text prompts are in the request.
    if not is_completion and input_or_inputs is not None:
        return text_prompts, engine_text_prompts

    engine_embeds_prompts = []
    for embeds_prompt in embeds_prompts:
        engine_prompt = EngineEmbedsPrompt(
            prompt_embeds=embeds_prompt["prompt_embeds"])
        if cache_salt:
            engine_prompt["cache_salt"] = cache_salt
        engine_embeds_prompts.append(engine_prompt)

    # Embeds prompts are placed ahead of text prompts in both lists.
    return (embeds_prompts + text_prompts,
            engine_embeds_prompts + engine_text_prompts)
async
def
_preprocess_chat
(
self
,
request
:
Union
[
ChatLikeRequest
,
ResponsesRequest
],
...
...
@@ -862,8 +751,11 @@ class OpenAIServing:
chat_template_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
tool_parser
:
Optional
[
Callable
[[
AnyTokenizer
],
ToolParser
]]
=
None
,
add_special_tokens
:
bool
=
False
,
)
->
tuple
[
list
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
list
[
EngineTokensPrompt
]]:
)
->
tuple
[
list
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
list
[
EngineTokensPrompt
],
]:
model_config
=
self
.
model_config
resolved_content_format
=
resolve_chat_template_content_format
(
...
...
@@ -873,7 +765,7 @@ class OpenAIServing:
tokenizer
,
model_config
=
model_config
,
)
conversation
,
mm_data_future
=
parse_chat_messages_futures
(
conversation
,
mm_data_future
,
mm_uuids
=
parse_chat_messages_futures
(
messages
,
model_config
,
tokenizer
,
...
...
@@ -925,8 +817,8 @@ class OpenAIServing:
if
tokenizer
is
None
:
assert
isinstance
(
request_prompt
,
str
),
(
"Prompt has to be a string"
,
\
"when the tokenizer is not initialised"
"Prompt has to be a string"
,
"when the tokenizer is not initialised"
,
)
prompt_inputs
=
TextTokensPrompt
(
prompt
=
request_prompt
,
prompt_token_ids
=
[
1
])
...
...
@@ -943,12 +835,17 @@ class OpenAIServing:
"Prompt has to be either a string or a list of token ids"
)
prompt_inputs
=
TextTokensPrompt
(
prompt
=
tokenizer
.
decode
(
request_prompt
),
prompt_token_ids
=
request_prompt
)
prompt_token_ids
=
request_prompt
,
)
engine_prompt
=
EngineTokensPrompt
(
prompt_token_ids
=
prompt_inputs
[
"prompt_token_ids"
])
if
mm_data
is
not
None
:
engine_prompt
[
"multi_modal_data"
]
=
mm_data
if
mm_uuids
is
not
None
:
engine_prompt
[
"multi_modal_uuids"
]
=
mm_uuids
if
request
.
mm_processor_kwargs
is
not
None
:
engine_prompt
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
...
...
@@ -1007,49 +904,15 @@ class OpenAIServing:
prompt_token_ids
=
prompt_token_ids
)
request_prompt
=
prompt_token_ids
# Update the sampling params.
sampling_params
.
max_tokens
=
(
self
.
max_model_len
-
len
(
prompt_token_ids
)
)
sampling_params
.
max_tokens
=
self
.
max_model_len
-
len
(
prompt_token_ids
)
# OPTIMIZATION
priority
=
orig_priority
-
1
@staticmethod
def _load_prompt_embeds(
    prompt_embeds: Optional[Union[bytes, list[bytes]]],
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
) -> list[EmbedsPrompt]:
    """
    Decode base64-encoded, torch-serialized prompt embeddings.

    Accepts a single payload or a list of payloads and returns one
    ``{"prompt_embeds": tensor}`` entry per payload; a falsy
    ``prompt_embeds`` yields an empty list. When ``truncate_prompt_tokens``
    is given, only the trailing that-many rows of each tensor are kept.
    """
    accepted_dtypes = (
        torch.float32,
        torch.bfloat16,
        torch.float16,
    )

    def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
        raw = pybase64.b64decode(embed, validate=True)
        # weights_only=True restricts what torch.load may unpickle.
        tensor = torch.load(
            io.BytesIO(raw),
            weights_only=True,
            map_location=torch.device("cpu"),
        )
        assert isinstance(tensor,
                          torch.Tensor) and tensor.dtype in accepted_dtypes
        tensor = tensor.to_dense()
        # NOTE(review): the dim()==2 assertion is assumed to belong to this
        # branch (after dropping a leading axis) — confirm against the
        # original indentation.
        if tensor.dim() > 2:
            tensor = tensor.squeeze(0)
            assert tensor.dim() == 2
        if truncate_prompt_tokens is not None:
            tensor = tensor[-truncate_prompt_tokens:]
        return {"prompt_embeds": tensor}

    if not prompt_embeds:
        return []
    if isinstance(prompt_embeds, list):
        return [_load_and_validate_embed(item) for item in prompt_embeds]
    return [_load_and_validate_embed(prompt_embeds)]
def
_log_inputs
(
self
,
request_id
:
str
,
inputs
:
RequestPrompt
,
inputs
:
Union
[
RequestPrompt
,
PromptType
],
params
:
Optional
[
Union
[
SamplingParams
,
PoolingParams
,
BeamSearchParams
]],
lora_request
:
Optional
[
LoRARequest
],
...
...
@@ -1061,11 +924,9 @@ class OpenAIServing:
prompt
=
inputs
elif
isinstance
(
inputs
,
list
):
prompt_token_ids
=
inputs
elif
'prompt_embeds'
in
inputs
:
prompt_embeds
=
inputs
.
get
(
"prompt_embeds"
)
else
:
prompt
=
inputs
[
"
prompt
"
]
prompt_token_ids
=
inputs
[
"
prompt_token_ids
"
]
prompt
=
getattr
(
inputs
,
'
prompt
'
,
None
)
prompt_token_ids
=
getattr
(
inputs
,
'
prompt_token_ids
'
,
None
)
self
.
request_logger
.
log_inputs
(
request_id
,
...
...
@@ -1101,10 +962,12 @@ class OpenAIServing:
return
raw_request
.
headers
.
get
(
"X-Request-Id"
,
default
)
@
staticmethod
def
_get_decoded_token
(
logprob
:
Logprob
,
token_id
:
int
,
tokenizer
:
AnyTokenizer
,
return_as_token_id
:
bool
=
False
)
->
str
:
def
_get_decoded_token
(
logprob
:
Logprob
,
token_id
:
int
,
tokenizer
:
AnyTokenizer
,
return_as_token_id
:
bool
=
False
,
)
->
str
:
if
return_as_token_id
:
return
f
"token_id:
{
token_id
}
"
...
...
@@ -1117,19 +980,10 @@ class OpenAIServing:
return
True
return
self
.
models
.
is_base_model
(
model_name
)
def _get_model_name(self,
                    model_name: Optional[str] = None,
                    lora_request: Optional[LoRARequest] = None) -> str:
    """
    Pick the model name to report for a request.

    A truthy ``lora_request`` wins and its ``lora_name`` is returned.
    Otherwise ``model_name`` is returned when it is non-empty, falling
    back to the name of the first entry in ``self.models.base_model_paths``.
    """
    if lora_request:
        return lora_request.lora_name
    return model_name or self.models.base_model_paths[0].name
def
clamp_prompt_logprobs
(
prompt_logprobs
:
Union
[
PromptLogprobs
,
None
])
->
Union
[
PromptLogprobs
,
None
]:
None
]
,
)
->
Union
[
PromptLogprobs
,
None
]:
if
prompt_logprobs
is
None
:
return
prompt_logprobs
...
...
@@ -1137,6 +991,6 @@ def clamp_prompt_logprobs(
if
logprob_dict
is
None
:
continue
for
logprob_values
in
logprob_dict
.
values
():
if
logprob_values
.
logprob
==
float
(
'
-inf
'
):
if
logprob_values
.
logprob
==
float
(
"
-inf
"
):
logprob_values
.
logprob
=
-
9999.0
return
prompt_logprobs
vllm/entrypoints/openai/serving_pooling.py
View file @
38d80967
...
...
@@ -4,7 +4,7 @@
import
asyncio
import
base64
import
time
from
collections.abc
import
AsyncGenerator
,
Sequence
from
collections.abc
import
AsyncGenerator
from
typing
import
Final
,
Literal
,
Optional
,
Union
,
cast
import
jinja2
...
...
@@ -26,8 +26,9 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
PoolingRequest
,
PoolingResponse
,
PoolingResponseData
,
UsageInfo
)
# yapf: enable
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
,
RequestPrompt
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.entrypoints.utils
import
_validate_truncation_size
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingOutput
,
PoolingRequestOutput
...
...
@@ -90,7 +91,7 @@ class OpenAIServingPooling(OpenAIServing):
if
error_check_ret
is
not
None
:
return
error_check_ret
model_name
=
self
.
_get_
model_name
(
request
.
model
)
model_name
=
self
.
models
.
model_name
()
request_id
=
f
"pool-
{
self
.
_base_request_id
(
raw_request
)
}
"
created_time
=
int
(
time
.
time
())
...
...
@@ -104,6 +105,7 @@ class OpenAIServingPooling(OpenAIServing):
else
:
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
lora_request
)
renderer
=
self
.
_get_renderer
(
tokenizer
)
if
getattr
(
request
,
"dimensions"
,
None
)
is
not
None
:
return
self
.
create_error_response
(
...
...
@@ -126,14 +128,11 @@ class OpenAIServingPooling(OpenAIServing):
engine_prompts
=
await
self
.
io_processor
.
pre_process_async
(
prompt
=
validated_prompt
,
request_id
=
request_id
)
request_prompts
:
Sequence
[
RequestPrompt
]
=
[
""
]
*
len
(
engine_prompts
)
elif
isinstance
(
request
,
PoolingChatRequest
):
(
_
,
request_prompts
,
_
,
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
request
,
...
...
@@ -149,13 +148,10 @@ class OpenAIServingPooling(OpenAIServing):
add_special_tokens
=
request
.
add_special_tokens
,
)
elif
isinstance
(
request
,
PoolingCompletionRequest
):
(
request_prompts
,
engine_prompts
)
=
await
self
.
_preprocess_completion
(
request
,
tokenizer
,
request
.
input
,
add_special_tokens
=
request
.
add_special_tokens
,
)
engine_prompts
=
await
renderer
.
render_prompt
(
prompt_or_prompts
=
request
.
input
,
config
=
self
.
_build_render_config
(
request
),
)
else
:
raise
ValueError
(
f
"Unsupported request of type
{
type
(
request
)
}
"
)
...
...
@@ -177,7 +173,7 @@ class OpenAIServingPooling(OpenAIServing):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
request_id_item
,
request
_prompt
s
[
i
]
,
engine
_prompt
,
params
=
pooling_params
,
lora_request
=
lora_request
)
...
...
@@ -272,3 +268,10 @@ class OpenAIServingPooling(OpenAIServing):
data
=
items
,
usage
=
usage
,
)
def _build_render_config(
        self, request: PoolingCompletionRequest) -> RenderConfig:
    """
    Build the ``RenderConfig`` for a pooling completion request.

    ``max_length`` is taken from ``self.max_model_len``; the truncation
    size and special-token flag are copied from the request.
    """
    render_options = {
        "max_length": self.max_model_len,
        "truncate_prompt_tokens": request.truncate_prompt_tokens,
        "add_special_tokens": request.add_special_tokens,
    }
    return RenderConfig(**render_options)
vllm/entrypoints/openai/serving_responses.py
View file @
38d80967
...
...
@@ -4,6 +4,8 @@
import
asyncio
import
json
import
time
import
uuid
from
collections
import
deque
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
,
Sequence
from
contextlib
import
AsyncExitStack
from
copy
import
copy
...
...
@@ -24,7 +26,8 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseOutputMessage
,
ResponseOutputText
,
ResponseReasoningItem
,
ResponseReasoningTextDeltaEvent
,
ResponseReasoningTextDoneEvent
)
ResponseReasoningTextDoneEvent
,
response_text_delta_event
)
from
openai.types.responses.response_output_text
import
(
Logprob
,
LogprobTopLogprob
)
# yapf: enable
...
...
@@ -41,12 +44,13 @@ from vllm.entrypoints.context import (ConversationContext, HarmonyContext,
SimpleContext
,
StreamingHarmonyContext
)
from
vllm.entrypoints.harmony_utils
import
(
get_developer_message
,
get_stop_tokens_for_assistant_actions
,
get_system_message
,
get_user_message
,
parse_output_message
,
parse_remaining_state
,
parse_response_input
,
render_for_completion
)
get_system_message
,
get_user_message
,
has_custom_tools
,
parse_output_message
,
parse_remaining_state
,
parse_response_input
,
render_for_completion
)
from
vllm.entrypoints.logger
import
RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
from
vllm.entrypoints.openai.protocol
import
(
DeltaMessage
,
ErrorResponse
,
InputTokensDetails
,
OutputTokensDetails
,
RequestResponseMetadata
,
...
...
@@ -55,14 +59,14 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
# yapf: enable
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.tool_server
import
MCPToolServer
,
ToolServer
from
vllm.entrypoints.tool_server
import
ToolServer
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
SampleLogprobs
from
vllm.outputs
import
CompletionOutput
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
Logprob
as
SampleLogprob
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
random_uuid
...
...
@@ -168,6 +172,11 @@ class OpenAIServingResponses(OpenAIServing):
# never remove messages from the store.
self
.
msg_store
:
dict
[
str
,
list
[
ChatCompletionMessageParam
]]
=
{}
# HACK(wuhang): This is a hack. We should use a better store.
# FIXME: If enable_store=True, this may cause a memory leak since we
# never remove events from the store.
self
.
event_store
:
dict
[
str
,
tuple
[
deque
[
str
],
asyncio
.
Event
]]
=
{}
self
.
background_tasks
:
dict
[
str
,
asyncio
.
Task
]
=
{}
self
.
tool_server
=
tool_server
...
...
@@ -228,7 +237,7 @@ class OpenAIServingResponses(OpenAIServing):
try
:
lora_request
=
self
.
_maybe_get_adapters
(
request
)
model_name
=
self
.
_get_
model_name
(
request
.
model
,
lora_request
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
lora_request
)
if
self
.
use_harmony
:
...
...
@@ -249,15 +258,6 @@ class OpenAIServingResponses(OpenAIServing):
if
raw_request
:
raw_request
.
state
.
request_metadata
=
request_metadata
if
self
.
tool_server
is
not
None
and
isinstance
(
self
.
tool_server
,
MCPToolServer
)
and
request
.
stream
and
request
.
tools
and
any
(
tool
.
type
in
[
"web_search_preview"
,
"code_interpreter"
]
for
tool
in
request
.
tools
):
return
self
.
create_error_response
(
"MCP tool server is not supported in background mode and "
"streaming mode"
)
# Schedule the request and get the result generator.
generators
:
list
[
AsyncGenerator
[
ConversationContext
,
None
]]
=
[]
...
...
@@ -267,6 +267,8 @@ class OpenAIServingResponses(OpenAIServing):
builtin_tool_list
.
append
(
"browser"
)
if
self
.
tool_server
.
has_tool
(
"python"
):
builtin_tool_list
.
append
(
"python"
)
if
self
.
tool_server
.
has_tool
(
"container"
):
builtin_tool_list
.
append
(
"container"
)
if
self
.
tool_server
is
not
None
:
available_tools
=
builtin_tool_list
...
...
@@ -329,25 +331,44 @@ class OpenAIServingResponses(OpenAIServing):
self
.
response_store
[
response
.
id
]
=
response
# Run the request in the background.
task
=
asyncio
.
create_task
(
self
.
_run_background_request
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
,
),
name
=
f
"create_
{
response
.
id
}
"
,
)
if
request
.
stream
:
task
=
asyncio
.
create_task
(
self
.
_run_background_request_stream
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
,
),
name
=
f
"create_
{
request
.
request_id
}
"
,
)
else
:
task
=
asyncio
.
create_task
(
self
.
_run_background_request
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
,
),
name
=
f
"create_
{
response
.
id
}
"
,
)
# For cleanup.
response_id
=
response
.
id
self
.
background_tasks
[
response_id
]
=
task
task
.
add_done_callback
(
lambda
_
:
self
.
background_tasks
.
pop
(
response_id
,
None
))
if
request
.
stream
:
return
self
.
responses_background_stream_generator
(
request
.
request_id
)
return
response
if
request
.
stream
:
...
...
@@ -430,7 +451,8 @@ class OpenAIServingResponses(OpenAIServing):
async
with
AsyncExitStack
()
as
exit_stack
:
try
:
await
context
.
init_tool_sessions
(
self
.
tool_server
,
exit_stack
)
await
context
.
init_tool_sessions
(
self
.
tool_server
,
exit_stack
,
request
.
request_id
)
async
for
_
in
result_generator
:
pass
except
asyncio
.
CancelledError
:
...
...
@@ -442,11 +464,7 @@ class OpenAIServingResponses(OpenAIServing):
if
self
.
use_harmony
:
assert
isinstance
(
context
,
HarmonyContext
)
output
=
self
.
_make_response_output_items_with_harmony
(
context
)
# TODO: these are all 0 for now!
num_prompt_tokens
=
context
.
num_prompt_tokens
num_generated_tokens
=
context
.
num_output_tokens
num_cached_tokens
=
context
.
num_cached_tokens
num_reasoning_tokens
=
context
.
num_reasoning_tokens
num_tool_output_tokens
=
context
.
num_tool_output_tokens
else
:
assert
isinstance
(
context
,
SimpleContext
)
final_res
=
context
.
last_output
...
...
@@ -459,10 +477,13 @@ class OpenAIServingResponses(OpenAIServing):
# Calculate usage.
assert
final_res
.
prompt_token_ids
is
not
None
num_prompt_tokens
=
len
(
final_res
.
prompt_token_ids
)
num_generated_tokens
=
len
(
final_output
.
token_ids
)
num_cached_tokens
=
final_res
.
num_cached_tokens
num_reasoning_tokens
=
0
num_tool_output_tokens
=
0
assert
isinstance
(
context
,
(
SimpleContext
,
HarmonyContext
))
num_prompt_tokens
=
context
.
num_prompt_tokens
num_generated_tokens
=
context
.
num_output_tokens
num_cached_tokens
=
context
.
num_cached_tokens
num_reasoning_tokens
=
context
.
num_reasoning_tokens
usage
=
ResponseUsage
(
input_tokens
=
num_prompt_tokens
,
...
...
@@ -471,7 +492,8 @@ class OpenAIServingResponses(OpenAIServing):
input_tokens_details
=
InputTokensDetails
(
cached_tokens
=
num_cached_tokens
),
output_tokens_details
=
OutputTokensDetails
(
reasoning_tokens
=
num_reasoning_tokens
),
reasoning_tokens
=
num_reasoning_tokens
,
tool_output_tokens
=
num_tool_output_tokens
),
)
response
=
ResponsesResponse
.
from_request
(
request
,
...
...
@@ -537,6 +559,28 @@ class OpenAIServingResponses(OpenAIServing):
))
return
out
def _create_stream_response_logprobs(
    self,
    token_ids: Sequence[int],
    logprobs: Optional[SampleLogprobs],
    tokenizer: AnyTokenizer,
    top_logprobs: Optional[int] = None
) -> list[response_text_delta_event.Logprob]:
    """
    Build logprob entries in the streaming (text-delta event) schema.

    Delegates to ``self._create_response_logprobs`` and re-wraps each
    result, field by field, in the ``response_text_delta_event`` types.
    """
    entries = self._create_response_logprobs(
        token_ids=token_ids,
        logprobs=logprobs,
        tokenizer=tokenizer,
        top_logprobs=top_logprobs,
    )

    stream_entries = []
    for entry in entries:
        alternatives = [
            response_text_delta_event.LogprobTopLogprob(
                token=alt.token, logprob=alt.logprob)
            for alt in entry.top_logprobs
        ]
        stream_entries.append(
            response_text_delta_event.Logprob(
                token=entry.token,
                logprob=entry.logprob,
                top_logprobs=alternatives,
            ))
    return stream_entries
def
_make_response_output_items
(
self
,
request
:
ResponsesRequest
,
...
...
@@ -670,13 +714,21 @@ class OpenAIServingResponses(OpenAIServing):
# New conversation.
reasoning_effort
=
(
request
.
reasoning
.
effort
if
request
.
reasoning
else
None
)
# Temporary: OpenAI types doesn't have container tool
# so we used MCP to cover that, up for change
tool_types
=
[
tool
.
type
for
tool
in
request
.
tools
]
if
envs
.
VLLM_GPT_OSS_USE_CONTAINER_TOOL
:
tool_types
.
append
(
"container"
)
enable_browser
=
(
"web_search_preview"
in
tool_types
and
self
.
tool_server
is
not
None
and
self
.
tool_server
.
has_tool
(
"browser"
))
enable_code_interpreter
=
(
"code_interpreter"
in
tool_types
and
self
.
tool_server
is
not
None
and
self
.
tool_server
.
has_tool
(
"python"
))
enable_container
=
(
"container"
in
tool_types
and
self
.
tool_server
is
not
None
and
self
.
tool_server
.
has_tool
(
"container"
))
with_custom_tools
=
has_custom_tools
(
tool_types
)
sys_msg
=
get_system_message
(
reasoning_effort
=
reasoning_effort
,
browser_description
=
self
.
tool_server
.
get_tool_description
(
...
...
@@ -685,11 +737,17 @@ class OpenAIServingResponses(OpenAIServing):
python_description
=
self
.
tool_server
.
get_tool_description
(
"python"
)
if
enable_code_interpreter
and
self
.
tool_server
is
not
None
else
None
,
container_description
=
self
.
tool_server
.
get_tool_description
(
"container"
)
if
enable_container
and
self
.
tool_server
is
not
None
else
None
,
instructions
=
request
.
instructions
,
with_custom_tools
=
with_custom_tools
,
)
messages
.
append
(
sys_msg
)
dev_msg
=
get_developer_message
(
request
.
instructions
,
request
.
tools
)
messages
.
append
(
dev_msg
)
if
with_custom_tools
:
dev_msg
=
get_developer_message
(
instructions
=
request
.
instructions
,
tools
=
request
.
tools
)
messages
.
append
(
dev_msg
)
else
:
# Continue the previous conversation.
# FIXME(woosuk): Currently, request params like reasoning and
...
...
@@ -717,7 +775,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_msgs
.
append
(
msg
)
messages
.
extend
(
prev_msgs
)
# Append the new input.
# Reponses API supports simple text inputs without chat format.
# Re
s
ponses API supports simple text inputs without chat format.
if
isinstance
(
request
.
input
,
str
):
messages
.
append
(
get_user_message
(
request
.
input
))
else
:
...
...
@@ -728,7 +786,7 @@ class OpenAIServingResponses(OpenAIServing):
for
response_msg
in
request
.
input
:
messages
.
append
(
parse_response_input
(
response_msg
,
prev_outputs
))
# User passes in a
a
tool call request and its output. We need
# User passes in a tool call request and its output. We need
# to add the tool call request to prev_outputs so that the
# parse_response_input can find the tool call request when
# parsing the tool call output.
...
...
@@ -736,6 +794,40 @@ class OpenAIServingResponses(OpenAIServing):
prev_outputs
.
append
(
response_msg
)
return
messages
async def _run_background_request_stream(
    self,
    request: ResponsesRequest,
    *args,
    **kwargs,
):
    """
    Run a streaming request in the background, buffering its events.

    Registers an ``(event buffer, asyncio.Event)`` pair in
    ``self.event_store`` under ``request.request_id``, then appends every
    event produced by ``self.responses_stream_generator`` to the buffer
    and sets the signal after each one. The ``"__STREAM_END__"`` marker is
    always appended last. If the generator raised an ``Exception``, the
    stored response is marked ``"failed"`` unless it is already
    ``"completed"`` or ``"cancelled"``.
    """
    buffered_events: deque[str] = deque()
    events_ready = asyncio.Event()
    self.event_store[request.request_id] = (buffered_events, events_ready)

    failure = None
    try:
        stream = self.responses_stream_generator(request, *args, **kwargs)
        async for item in stream:
            buffered_events.append(item)
            # Signal new event available
            events_ready.set()
    except Exception as exc:
        logger.exception("Background request failed for %s",
                         request.request_id)
        failure = self.create_error_response(str(exc))
    finally:
        # Mark as finished with a special marker
        buffered_events.append("__STREAM_END__")
        events_ready.set()

    # `failure` is only set on the exception path above.
    if isinstance(failure, ErrorResponse):
        # If the request has failed, update the status to "failed".
        failed_id = request.request_id
        async with self.response_store_lock:
            stored = self.response_store.get(failed_id)
            assert stored is not None
            if stored.status not in ("completed", "cancelled"):
                stored.status = "failed"
async
def
_run_background_request
(
self
,
request
:
ResponsesRequest
,
...
...
@@ -759,9 +851,36 @@ class OpenAIServingResponses(OpenAIServing):
if
stored_response
.
status
not
in
(
"completed"
,
"cancelled"
):
stored_response
.
status
=
"failed"
async def responses_background_stream_generator(
    self,
    response_id: str,
    starting_after: Optional[int] = None,
) -> AsyncGenerator[str, None]:
    """
    Replay, then follow, the buffered events of a background response.

    Args:
        response_id: Key into ``self.event_store``.
        starting_after: Index of the last event the caller already has;
            streaming resumes at the next one. ``None`` starts from the
            first event. Negative values are treated as "start from the
            first event" rather than indexing the buffer from its tail.

    Yields:
        Each buffered event string in order, until the
        ``"__STREAM_END__"`` marker is reached.

    Raises:
        ValueError: if ``response_id`` has no entry in ``self.event_store``.
    """
    if response_id not in self.event_store:
        raise ValueError(f"Unknown response_id: {response_id}")

    event_deque, new_event_signal = self.event_store[response_id]
    # Clamp to 0: a negative index would read the deque from its end and
    # replay events out of order.
    start_index = 0 if starting_after is None else max(
        0, starting_after + 1)
    current_index = start_index

    while True:
        # Clear before scanning so that a set() issued while we are
        # yielding is still observed by the wait() below.
        new_event_signal.clear()

        # Yield existing events from start_index
        while current_index < len(event_deque):
            event = event_deque[current_index]
            if event == "__STREAM_END__":
                return
            yield event
            current_index += 1

        await new_event_signal.wait()
async
def
retrieve_responses
(
self
,
response_id
:
str
,
starting_after
:
Optional
[
int
],
stream
:
Optional
[
bool
],
)
->
Union
[
ErrorResponse
,
ResponsesResponse
]:
if
not
response_id
.
startswith
(
"resp_"
):
return
self
.
_make_invalid_id_error
(
response_id
)
...
...
@@ -771,6 +890,12 @@ class OpenAIServingResponses(OpenAIServing):
if
response
is
None
:
return
self
.
_make_not_found_error
(
response_id
)
if
stream
:
return
self
.
responses_background_stream_generator
(
response_id
,
starting_after
,
)
return
response
async
def
cancel_responses
(
...
...
@@ -829,7 +954,7 @@ class OpenAIServingResponses(OpenAIServing):
status_code
=
HTTPStatus
.
BAD_REQUEST
,
)
async
def
_process_streaming_events
(
async
def
_process_
simple_
streaming_events
(
self
,
request
:
ResponsesRequest
,
sampling_params
:
SamplingParams
,
...
...
@@ -839,47 +964,292 @@ class OpenAIServingResponses(OpenAIServing):
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
created_time
:
int
,
_send_event
:
Callable
[[
BaseModel
],
str
],
)
->
AsyncGenerator
[
str
,
None
]:
sequence_number
=
0
def
_send_event
(
event
:
BaseModel
):
nonlocal
sequence_number
# Set sequence_number if the event has this attribute
if
hasattr
(
event
,
'sequence_number'
):
event
.
sequence_number
=
sequence_number
sequence_number
+=
1
# Get event type from the event's type field if it exists
event_type
=
getattr
(
event
,
'type'
,
'unknown'
)
return
(
f
"event:
{
event_type
}
\n
"
f
"data:
{
event
.
model_dump_json
(
indent
=
None
)
}
\n\n
"
)
current_content_index
=
0
current_output_index
=
0
current_item_id
=
""
reasoning_parser
=
None
if
self
.
reasoning_parser
:
reasoning_parser
=
self
.
reasoning_parser
(
tokenizer
)
previous_text
=
""
previous_token_ids
:
list
[
int
]
=
[]
first_delta_sent
=
False
previous_delta_messages
:
list
[
DeltaMessage
]
=
[]
async
for
ctx
in
result_generator
:
assert
isinstance
(
ctx
,
SimpleContext
)
if
ctx
.
last_output
is
None
:
continue
if
ctx
.
last_output
.
outputs
:
output
=
ctx
.
last_output
.
outputs
[
0
]
if
reasoning_parser
:
delta_message
=
\
reasoning_parser
.
extract_reasoning_content_streaming
(
previous_text
=
previous_text
,
current_text
=
previous_text
+
output
.
text
,
delta_text
=
output
.
text
,
previous_token_ids
=
previous_token_ids
,
current_token_ids
=
previous_token_ids
+
output
.
token_ids
,
delta_token_ids
=
output
.
token_ids
,
)
else
:
delta_message
=
DeltaMessage
(
content
=
output
.
text
,
)
previous_text
+=
output
.
text
previous_token_ids
+=
output
.
token_ids
if
not
delta_message
:
continue
if
not
first_delta_sent
:
current_item_id
=
str
(
uuid
.
uuid4
())
if
delta_message
.
reasoning_content
:
yield
_send_event
(
openai_responses_types
.
ResponseOutputItemAddedEvent
(
type
=
"response.output_item.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
openai_responses_types
.
ResponseReasoningItem
(
type
=
"reasoning"
,
id
=
current_item_id
,
summary
=
[],
status
=
"in_progress"
,
),
))
else
:
yield
_send_event
(
openai_responses_types
.
ResponseOutputItemAddedEvent
(
type
=
"response.output_item.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
openai_responses_types
.
ResponseOutputMessage
(
id
=
current_item_id
,
type
=
"message"
,
role
=
"assistant"
,
content
=
[],
status
=
"in_progress"
,
),
))
yield
_send_event
(
openai_responses_types
.
ResponseContentPartAddedEvent
(
type
=
"response.content_part.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
content_index
=
current_content_index
,
part
=
openai_responses_types
.
ResponseOutputText
(
type
=
"output_text"
,
text
=
""
,
annotations
=
[],
logprobs
=
[],
),
))
current_content_index
+=
1
first_delta_sent
=
True
# todo(kebe7jun) tool call support
# check delta message and previous delta message are
# same as content or reasoning content
if
(
previous_delta_messages
and
previous_delta_messages
[
-
1
].
reasoning_content
is
not
None
and
delta_message
.
content
is
not
None
):
# from reasoning to normal content, send done
# event for reasoning
reason_content
=
''
.
join
(
pm
.
reasoning_content
for
pm
in
previous_delta_messages
if
pm
.
reasoning_content
is
not
None
)
yield
_send_event
(
ResponseReasoningTextDoneEvent
(
type
=
"response.reasoning_text.done"
,
item_id
=
current_item_id
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
content_index
=
current_content_index
,
text
=
reason_content
,
))
current_content_index
=
0
reasoning_item
=
ResponseReasoningItem
(
type
=
"reasoning"
,
content
=
[
ResponseReasoningTextContent
(
text
=
reason_content
,
type
=
"reasoning_text"
,
),
],
status
=
"completed"
,
id
=
current_item_id
,
summary
=
[],
)
yield
_send_event
(
ResponseOutputItemDoneEvent
(
type
=
"response.output_item.done"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
reasoning_item
,
))
yield
_send_event
(
openai_responses_types
.
ResponseOutputItemAddedEvent
(
type
=
"response.output_item.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
openai_responses_types
.
ResponseOutputMessage
(
id
=
current_item_id
,
type
=
"message"
,
role
=
"assistant"
,
content
=
[],
status
=
"in_progress"
,
),
))
current_output_index
+=
1
current_item_id
=
str
(
uuid
.
uuid4
())
yield
_send_event
(
openai_responses_types
.
ResponseContentPartAddedEvent
(
type
=
"response.content_part.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
content_index
=
current_content_index
,
part
=
openai_responses_types
.
ResponseOutputText
(
type
=
"output_text"
,
text
=
""
,
annotations
=
[],
logprobs
=
[],
),
))
current_content_index
+=
1
# reset previous delta messages
previous_delta_messages
=
[]
if
delta_message
.
reasoning_content
is
not
None
:
yield
_send_event
(
ResponseReasoningTextDeltaEvent
(
type
=
"response.reasoning_text.delta"
,
sequence_number
=-
1
,
content_index
=
current_content_index
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
delta
=
delta_message
.
reasoning_content
,
))
elif
delta_message
.
content
is
not
None
:
yield
_send_event
(
openai_responses_types
.
ResponseTextDeltaEvent
(
type
=
"response.output_text.delta"
,
sequence_number
=-
1
,
content_index
=
current_content_index
,
output_index
=
current_output_index
,
item_id
=
current_item_id
,
delta
=
delta_message
.
content
,
logprobs
=
self
.
_create_stream_response_logprobs
(
token_ids
=
output
.
token_ids
,
logprobs
=
output
.
logprobs
,
tokenizer
=
tokenizer
,
top_logprobs
=
request
.
top_logprobs
,
)
if
request
.
is_include_output_logprobs
()
else
[],
))
current_content_index
+=
1
previous_delta_messages
.
append
(
delta_message
)
if
previous_delta_messages
:
if
previous_delta_messages
[
-
1
].
reasoning_content
is
not
None
:
reason_content
=
''
.
join
(
pm
.
reasoning_content
for
pm
in
previous_delta_messages
if
pm
.
reasoning_content
is
not
None
)
yield
_send_event
(
ResponseReasoningTextDoneEvent
(
type
=
"response.reasoning_text.done"
,
item_id
=
current_item_id
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
content_index
=
current_content_index
,
text
=
reason_content
,
))
current_content_index
+=
1
reasoning_item
=
ResponseReasoningItem
(
type
=
"reasoning"
,
content
=
[
ResponseReasoningTextContent
(
text
=
reason_content
,
type
=
"reasoning_text"
,
),
],
status
=
"completed"
,
id
=
current_item_id
,
summary
=
[],
)
yield
_send_event
(
ResponseOutputItemDoneEvent
(
type
=
"response.output_item.done"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
reasoning_item
,
))
elif
previous_delta_messages
[
-
1
].
content
is
not
None
:
final_content
=
''
.
join
(
pm
.
content
for
pm
in
previous_delta_messages
if
pm
.
content
is
not
None
)
yield
_send_event
(
openai_responses_types
.
ResponseTextDoneEvent
(
type
=
"response.output_text.done"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
content_index
=
current_content_index
,
text
=
final_content
,
logprobs
=
[],
item_id
=
current_item_id
,
))
current_content_index
+=
1
part
=
ResponseOutputText
(
text
=
final_content
,
type
=
"output_text"
,
annotations
=
[],
)
yield
_send_event
(
openai_responses_types
.
ResponseContentPartDoneEvent
(
type
=
"response.content_part.done"
,
sequence_number
=-
1
,
item_id
=
current_item_id
,
output_index
=
current_output_index
,
content_index
=
current_content_index
,
part
=
part
,
))
current_content_index
+=
1
item
=
ResponseOutputMessage
(
type
=
"message"
,
role
=
"assistant"
,
content
=
[
part
,
],
status
=
"completed"
,
id
=
current_item_id
,
summary
=
[],
)
yield
_send_event
(
ResponseOutputItemDoneEvent
(
type
=
"response.output_item.done"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
item
,
))
async
def
_process_harmony_streaming_events
(
self
,
request
:
ResponsesRequest
,
sampling_params
:
SamplingParams
,
result_generator
:
AsyncIterator
[
Optional
[
ConversationContext
]],
context
:
ConversationContext
,
model_name
:
str
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
created_time
:
int
,
_send_event
:
Callable
[[
BaseModel
],
str
],
)
->
AsyncGenerator
[
str
,
None
]:
current_content_index
=
0
# FIXME: this number is never changed
current_output_index
=
0
current_item_id
=
""
# FIXME: this number is never changed
sent_output_item_added
=
False
initial_response
=
ResponsesResponse
.
from_request
(
request
,
sampling_params
,
model_name
=
model_name
,
created_time
=
created_time
,
output
=
[],
status
=
"in_progress"
,
usage
=
None
,
).
model_dump
()
yield
_send_event
(
ResponseCreatedEvent
(
type
=
"response.created"
,
sequence_number
=-
1
,
response
=
initial_response
,
))
yield
_send_event
(
ResponseInProgressEvent
(
type
=
"response.in_progress"
,
sequence_number
=-
1
,
response
=
initial_response
,
))
async
for
ctx
in
result_generator
:
assert
isinstance
(
ctx
,
StreamingHarmonyContext
)
...
...
@@ -1229,29 +1599,6 @@ class OpenAIServingResponses(OpenAIServing):
),
))
async
def
empty_async_generator
():
# A hack to trick Python to think this is a generator but in fact
# it immediately returns.
if
False
:
yield
final_response
=
await
self
.
responses_full_generator
(
request
,
sampling_params
,
empty_async_generator
(),
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
=
created_time
,
)
yield
_send_event
(
openai_responses_types
.
ResponseCompletedEvent
(
type
=
"response.completed"
,
sequence_number
=-
1
,
response
=
final_response
.
model_dump
(),
))
async
def
responses_stream_generator
(
self
,
request
:
ResponsesRequest
,
...
...
@@ -1266,16 +1613,78 @@ class OpenAIServingResponses(OpenAIServing):
# TODO:
# 1. Handle disconnect
if
not
isinstance
(
context
,
StreamingHarmonyContext
):
raise
NotImplementedError
(
"Streaming is not supported for responses API without Harmony."
)
created_time
=
created_time
or
int
(
time
.
time
())
sequence_number
=
0
def
_send_event
(
event
:
BaseModel
):
nonlocal
sequence_number
# Set sequence_number if the event has this attribute
if
hasattr
(
event
,
'sequence_number'
):
event
.
sequence_number
=
sequence_number
sequence_number
+=
1
# Get event type from the event's type field if it exists
event_type
=
getattr
(
event
,
'type'
,
'unknown'
)
return
(
f
"event:
{
event_type
}
\n
"
f
"data:
{
event
.
model_dump_json
(
indent
=
None
)
}
\n\n
"
)
async
with
AsyncExitStack
()
as
exit_stack
:
await
context
.
init_tool_sessions
(
self
.
tool_server
,
exit_stack
)
async
for
event_data
in
self
.
_process_streaming_events
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
):
processer
=
None
if
self
.
use_harmony
:
await
context
.
init_tool_sessions
(
self
.
tool_server
,
exit_stack
,
request
.
request_id
)
processer
=
self
.
_process_harmony_streaming_events
else
:
processer
=
self
.
_process_simple_streaming_events
initial_response
=
ResponsesResponse
.
from_request
(
request
,
sampling_params
,
model_name
=
model_name
,
created_time
=
created_time
,
output
=
[],
status
=
"in_progress"
,
usage
=
None
,
).
model_dump
()
yield
_send_event
(
ResponseCreatedEvent
(
type
=
"response.created"
,
sequence_number
=-
1
,
response
=
initial_response
,
))
yield
_send_event
(
ResponseInProgressEvent
(
type
=
"response.in_progress"
,
sequence_number
=-
1
,
response
=
initial_response
,
))
async
for
event_data
in
processer
(
request
,
sampling_params
,
result_generator
,
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
,
_send_event
):
yield
event_data
async
def
empty_async_generator
():
# A hack to trick Python to think this is a generator but
# in fact it immediately returns.
if
False
:
yield
final_response
=
await
self
.
responses_full_generator
(
request
,
sampling_params
,
empty_async_generator
(),
context
,
model_name
,
tokenizer
,
request_metadata
,
created_time
=
created_time
,
)
yield
_send_event
(
openai_responses_types
.
ResponseCompletedEvent
(
type
=
"response.completed"
,
sequence_number
=-
1
,
response
=
final_response
.
model_dump
(),
))
vllm/entrypoints/openai/serving_score.py
View file @
38d80967
...
...
@@ -353,7 +353,7 @@ class ServingScores(OpenAIServing):
final_res_batch
,
request_id
,
created_time
,
self
.
_get_
model_name
(
request
.
model
),
self
.
models
.
model_name
(),
)
except
asyncio
.
CancelledError
:
return
self
.
create_error_response
(
"Client disconnected"
)
...
...
@@ -399,7 +399,7 @@ class ServingScores(OpenAIServing):
return
self
.
request_output_to_rerank_response
(
final_res_batch
,
request_id
,
self
.
_get_
model_name
(
request
.
model
),
self
.
models
.
model_name
(),
documents
,
top_n
,
)
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
38d80967
...
...
@@ -22,6 +22,7 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
# yapf: enable
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
...
...
@@ -65,13 +66,14 @@ class OpenAIServingTokenization(OpenAIServing):
lora_request
=
self
.
_maybe_get_adapters
(
request
)
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
(
lora_request
)
renderer
=
self
.
_get_renderer
(
tokenizer
)
if
isinstance
(
request
,
TokenizeChatRequest
):
tool_dicts
=
(
None
if
request
.
tools
is
None
else
[
tool
.
model_dump
()
for
tool
in
request
.
tools
])
(
_
,
request_prompts
,
_
,
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
request
,
...
...
@@ -87,21 +89,18 @@ class OpenAIServingTokenization(OpenAIServing):
add_special_tokens
=
request
.
add_special_tokens
,
)
else
:
(
request_prompts
,
engine_prompts
)
=
await
self
.
_preprocess_completion
(
request
,
tokenizer
,
request
.
prompt
,
add_special_tokens
=
request
.
add_special_tokens
,
)
engine_prompts
=
await
renderer
.
render_prompt
(
prompt_or_prompts
=
request
.
prompt
,
config
=
self
.
_build_render_config
(
request
),
)
except
(
ValueError
,
TypeError
,
jinja2
.
TemplateError
)
as
e
:
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
return
self
.
create_error_response
(
f
"
{
e
}
{
e
.
__cause__
}
"
)
input_ids
:
list
[
int
]
=
[]
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
)
:
for
engine_prompt
in
engine_prompts
:
self
.
_log_inputs
(
request_id
,
request
_prompt
s
[
i
]
,
engine
_prompt
,
params
=
None
,
lora_request
=
lora_request
)
...
...
@@ -158,6 +157,9 @@ class OpenAIServingTokenization(OpenAIServing):
return
self
.
create_error_response
(
f
"Failed to get tokenizer info:
{
str
(
e
)
}
"
)
def _build_render_config(self, request: TokenizeRequest) -> RenderConfig:
    """Create the ``RenderConfig`` used to render a tokenize request.

    Only ``add_special_tokens`` is copied from the request; every other
    ``RenderConfig`` field keeps its default value.
    """
    special_tokens_flag = request.add_special_tokens
    render_config = RenderConfig(add_special_tokens=special_tokens_flag)
    return render_config
@
dataclass
class
TokenizerInfo
:
...
...
vllm/entrypoints/openai/speech_to_text.py
View file @
38d80967
...
...
@@ -89,6 +89,9 @@ class OpenAISpeechToText(OpenAIServing):
)
->
tuple
[
list
[
PromptType
],
float
]:
# Validate request
language
=
self
.
model_cls
.
validate_language
(
request
.
language
)
# Skip to_language validation to avoid extra logging for Whisper.
to_language
=
self
.
model_cls
.
validate_language
(
request
.
to_language
)
\
if
request
.
to_language
else
None
if
len
(
audio_data
)
/
1024
**
2
>
self
.
max_audio_filesize_mb
:
raise
ValueError
(
"Maximum file size exceeded."
)
...
...
@@ -112,7 +115,9 @@ class OpenAISpeechToText(OpenAIServing):
model_config
=
self
.
model_config
,
language
=
language
,
task_type
=
self
.
task_type
,
request_prompt
=
request
.
prompt
)
request_prompt
=
request
.
prompt
,
to_language
=
to_language
,
)
prompts
.
append
(
prompt
)
return
prompts
,
duration
...
...
vllm/entrypoints/openai/tool_parsers/__init__.py
View file @
38d80967
...
...
@@ -16,6 +16,7 @@ from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
from
.llama_tool_parser
import
Llama3JsonToolParser
from
.minimax_tool_parser
import
MinimaxToolParser
from
.mistral_tool_parser
import
MistralToolParser
from
.openai_tool_parser
import
OpenAIToolParser
from
.phi4mini_tool_parser
import
Phi4MiniJsonToolParser
from
.pythonic_tool_parser
import
PythonicToolParser
from
.qwen3coder_tool_parser
import
Qwen3CoderToolParser
...
...
@@ -46,4 +47,5 @@ __all__ = [
"Qwen3CoderToolParser"
,
"SeedOssToolParser"
,
"Step3ToolParser"
,
"OpenAIToolParser"
,
]
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
View file @
38d80967
...
...
@@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
self
,
request
:
ChatCompletionRequest
)
->
ChatCompletionRequest
:
if
request
.
tools
and
request
.
tool_choice
!=
'none'
:
# do not skip special tokens because internlm use the special
# tokens to indicate
d
the start and end of the tool calls
# tokens to indicate the start and end of the tool calls
# information.
request
.
skip_special_tokens
=
False
return
request
...
...
@@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
if
'<|action_start|>'
not
in
current_text
:
self
.
position
=
len
(
current_text
)
return
DeltaMessage
(
content
=
delta_text
)
# if the tool call is sen
ded
, return a empty delta message
# to make sure the finish_reason will be sen
d
correctly.
# if the tool call is sen
t
, return a
n
empty delta message
# to make sure the finish_reason will be sen
t
correctly.
if
self
.
current_tool_id
>
0
:
return
DeltaMessage
(
content
=
''
)
...
...
@@ -89,7 +89,7 @@ class Internlm2ToolParser(ToolParser):
try
:
parsable_arr
=
action
# tool calls are generated in an object in inernlm2
# tool calls are generated in an object in in
t
ernlm2
# parallel tool calls are not supported
try
:
tool_call_arr
:
dict
=
partial_json_parser
.
loads
(
...
...
vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
View file @
38d80967
...
...
@@ -176,7 +176,7 @@ class Llama4PythonicToolParser(ToolParser):
index
]
+=
delta
.
function
.
arguments
# HACK: serving_chat.py inspects the internal state of tool parsers
# when determining it
'
s final streaming delta, automatically
# when determining its final streaming delta, automatically
# adding autocompleted JSON.
# These two lines avoid that nonsense while ensuring finish_reason
# is set to tool_calls when at least one tool is called.
...
...
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
View file @
38d80967
...
...
@@ -143,7 +143,7 @@ class MistralToolParser(ToolParser):
except
json
.
JSONDecodeError
:
# use a regex to find the part corresponding to the tool call.
# NOTE: This use case should not happen if the model is trained
# correctly. It's a easy possible fix so it's included, but
# correctly. It's a
n
easy possible fix so it's included, but
# can be brittle for very complex / highly nested tool calls
raw_tool_call
=
self
.
tool_call_regex
.
findall
(
tool_content
)[
0
]
function_call_arr
=
json
.
loads
(
raw_tool_call
)
...
...
vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
from
collections.abc
import
Sequence
from
typing
import
TYPE_CHECKING
from
vllm.entrypoints.harmony_utils
import
parse_output_into_messages
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
,
ExtractedToolCallInformation
,
FunctionCall
,
ToolCall
)
from
vllm.entrypoints.openai.tool_parsers.abstract_tool_parser
import
(
ToolParser
,
ToolParserManager
)
if
TYPE_CHECKING
:
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
@ToolParserManager.register_module("openai")
class OpenAIToolParser(ToolParser):
    """Tool parser registered under the name ``"openai"``.

    Tool calls are recovered from the generated *token IDs* via
    ``parse_output_into_messages``; plain-text extraction and the generic
    streaming hook are intentionally not implemented.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
        token_ids: Sequence[int] | None = None,
    ) -> ExtractedToolCallInformation:
        """Extract function tool calls and final content from token IDs.

        Every parsed message whose recipient starts with ``"functions."``
        becomes a ``ToolCall`` whose name is the recipient without that
        prefix and whose arguments are the text of the message's first
        content part. The text of a message on the ``"final"`` channel is
        returned as ``content`` (the last such message wins).

        Args:
            model_output: Unused; extraction is token based.
            request: Unused by this parser.
            token_ids: Generated token IDs to parse. Required.

        Returns:
            The extracted tool calls (possibly none) and final content.

        Raises:
            NotImplementedError: If ``token_ids`` is ``None``.
        """
        if token_ids is None:
            raise NotImplementedError(
                "OpenAIToolParser requires token IDs and does not support text-based extraction."  # noqa: E501
            )

        function_prefix = "functions."
        parser = parse_output_into_messages(token_ids)
        tool_calls = []
        final_content = None

        for msg in parser.messages:
            # A message without any content part carries neither arguments
            # nor final text; indexing ``content[0]`` would raise IndexError.
            if not msg.content:
                continue
            msg_text = msg.content[0].text

            recipient = msg.recipient
            if recipient and recipient.startswith(function_prefix):
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            # Strip only the leading prefix so a function
                            # name that itself contains "functions." is
                            # kept intact.
                            name=recipient[len(function_prefix):],
                            arguments=msg_text,
                        ),
                    ))
            elif msg.channel == "final":
                final_content = msg_text

        return ExtractedToolCallInformation(
            tools_called=len(tool_calls) > 0,
            tool_calls=tool_calls,
            content=final_content,
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Not supported; always raises ``NotImplementedError``.

        Per the error message, streaming output is parsed manually in
        ``serving_chat.py`` rather than through this hook.
        """
        raise NotImplementedError(
            "Not being used, manual parsing in serving_chat.py"  # noqa: E501
        )
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
View file @
38d80967
...
...
@@ -165,7 +165,7 @@ class PythonicToolParser(ToolParser):
index
]
+=
delta
.
function
.
arguments
# HACK: serving_chat.py inspects the internal state of tool parsers
# when determining it
'
s final streaming delta, automatically
# when determining its final streaming delta, automatically
# adding autocompleted JSON.
# These two lines avoid that nonsense while ensuring finish_reason
# is set to tool_calls when at least one tool is called.
...
...
vllm/entrypoints/renderer.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
io
from
abc
import
ABC
,
abstractmethod
from
dataclasses
import
dataclass
from
typing
import
Annotated
,
Optional
,
Union
import
pybase64
import
torch
from
pydantic
import
Field
from
vllm.config
import
ModelConfig
from
vllm.inputs.data
import
EmbedsPrompt
as
EngineEmbedsPrompt
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.parse
import
parse_and_batch_prompt
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
AsyncMicrobatchTokenizer
@dataclass(frozen=True)
class RenderConfig:
    """Configuration to control how prompts are prepared."""

    # Upper bound checked against the final token count of each prompt.
    max_length: Optional[int] = None
    """Maximum allowable total input token length. If provided,
    token inputs longer than this raise ``ValueError``."""

    # NOTE: ``CompletionRenderer`` in this file maps every negative value
    # (not only ``-1``) to ``model_config.max_model_len``.
    truncate_prompt_tokens: Optional[int] = None
    """Number of tokens to keep. ``None`` means no truncation.
    ``0`` yields an empty list (and skips embeds).
    ``-1`` maps to ``model_config.max_model_len``."""

    # Only consulted when the input is text that has to be tokenized.
    add_special_tokens: Optional[bool] = True
    """Whether to add model-specific special tokens during tokenization."""

    # Copied onto each produced prompt when it is not ``None``.
    cache_salt: Optional[str] = None
    """String to disambiguate prefix cache entries."""

    # Only consulted when the input is already a token sequence.
    needs_detokenization: Optional[bool] = False
    """If True, detokenize IDs back to text for inclusion in outputs."""
class BaseRenderer(ABC):
    """
    Base class for unified input processing and rendering.

    The Renderer serves as a unified input processor that consolidates
    tokenization, chat template formatting, and multimodal input handling
    into a single component.
    It converts high-level API requests (OpenAI-style JSON) into token IDs and
    multimodal features ready for engine consumption.

    Key responsibilities:
    - Convert text prompts to token sequences with proper special tokens
    - Apply chat templates and format conversations
    - Handle multimodal inputs (images, audio, etc.) when applicable
    - Manage prompt truncation and length validation
    - Provide clean separation between API layer and engine core
    """

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[AnyTokenizer] = None,
    ):
        super().__init__()
        self.model_config = model_config
        self.tokenizer = tokenizer

    @abstractmethod
    async def render_prompt(
        self,
        *,
        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
        config: "RenderConfig",
    ) -> list[EngineTokensPrompt]:
        """
        Convert text or token inputs into engine-ready TokensPrompt objects.

        This method accepts text or token inputs and produces a
        list of [`TokensPrompt`][vllm.inputs.data.TokensPrompt] objects
        for the engine.

        Args:
            prompt_or_prompts: One of:
                - ``str``: Single text prompt.
                - ``list[str]``: Batch of text prompts.
                - ``list[int]``: Single pre-tokenized sequence.
                - ``list[list[int]]``: Batch of pre-tokenized sequences.
            config: Render configuration controlling how prompts are prepared
                (e.g., tokenization and length handling).

        Returns:
            list[EngineTokensPrompt]: Engine-ready token prompts.

        Raises:
            ValueError: If input formats are invalid or length limits exceeded.
        """
        raise NotImplementedError

    @abstractmethod
    async def render_prompt_and_embeds(
        self,
        *,
        prompt_or_prompts: Optional[Union[str, list[str], list[int],
                                          list[list[int]]]] = None,
        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
        config: "RenderConfig",
    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
        """
        Convert text/token and/or base64-encoded embeddings inputs into
        engine-ready prompt objects using a unified RenderConfig.

        At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be
        provided and non-empty. If both are omitted or empty (e.g., empty
        string and empty list), a ``ValueError`` is raised.

        Args:
            prompt_or_prompts: Text or token inputs to include.
            prompt_embeds: Base64-encoded bytes (or list thereof) containing a
                torch-saved tensor to be used as prompt embeddings.
            config: Render configuration controlling how prompts are prepared
                (e.g., tokenization and length handling).

        Returns:
            list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
                Engine-ready prompt objects.

        Raises:
            ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds``
                are omitted or empty (decoder prompt cannot be empty), or if
                length limits are exceeded.
        """
        raise NotImplementedError

    @classmethod
    def load_prompt_embeds(
        cls,
        prompt_embeds: Union[bytes, list[bytes]],
        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None,
        cache_salt: Optional[str] = None,
    ) -> list[EngineEmbedsPrompt]:
        """Load and validate base64-encoded embeddings into prompt objects.

        Each payload is base64-decoded, deserialized with ``torch.load``
        (``weights_only=True``, mapped to CPU), densified, and, when it has
        more than two dimensions, squeezed on dim 0 and required to be 2-D.

        Args:
            prompt_embeds: One base64 payload or a list of payloads.
            truncate_prompt_tokens: If not ``None``, keep only this many
                trailing rows of each tensor; ``0`` keeps none.
            cache_salt: If not ``None``, stored on every returned prompt
                under ``"cache_salt"``.

        Returns:
            One ``EngineEmbedsPrompt`` per payload, in input order.

        Raises:
            ValueError: If a payload does not decode to a floating-point
                ``torch.Tensor`` (float32 / bfloat16 / float16), or is still
                not 2-D after squeezing.
        """
        allowed_dtypes = (torch.float32, torch.bfloat16, torch.float16)

        def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt:
            tensor = torch.load(
                io.BytesIO(pybase64.b64decode(embed, validate=True)),
                weights_only=True,
                map_location=torch.device("cpu"),
            )
            # The payload comes from the request, so validate with real
            # exceptions: ``assert`` is stripped under ``python -O`` and
            # would surface as AssertionError rather than ValueError.
            if not isinstance(tensor, torch.Tensor):
                raise ValueError(
                    "prompt_embeds must decode to a torch.Tensor, got "
                    f"{type(tensor).__name__}.")
            if tensor.dtype not in allowed_dtypes:
                raise ValueError(
                    "prompt_embeds must have dtype float32, bfloat16 or "
                    f"float16, got {tensor.dtype}.")

            tensor = tensor.to_dense()
            if tensor.dim() > 2:
                tensor = tensor.squeeze(0)
                if tensor.dim() != 2:
                    raise ValueError(
                        "prompt_embeds must be 2-dimensional after removing "
                        f"a leading batch dimension, got {tensor.dim()} "
                        "dimensions.")

            if truncate_prompt_tokens is not None:
                # ``tensor[-0:]`` is the whole tensor, so "keep 0 rows" has
                # to be spelled out explicitly.
                if truncate_prompt_tokens == 0:
                    tensor = tensor[:0]
                else:
                    tensor = tensor[-truncate_prompt_tokens:]

            embeds_prompt = EngineEmbedsPrompt(prompt_embeds=tensor)
            if cache_salt is not None:
                embeds_prompt["cache_salt"] = cache_salt
            return embeds_prompt

        if isinstance(prompt_embeds, list):
            return [_load_and_validate_embed(embed) for embed in prompt_embeds]
        return [_load_and_validate_embed(prompt_embeds)]
class CompletionRenderer(BaseRenderer):
    """Renderer for completion-style prompts given as text or token IDs.

    Text is tokenized through an ``AsyncMicrobatchTokenizer`` that is created
    on first use and, when a pool was supplied, looked up in / stored into
    that pool keyed by the tokenizer.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[AnyTokenizer] = None,
        async_tokenizer_pool: Optional[dict[AnyTokenizer,
                                            AsyncMicrobatchTokenizer]] = None,
    ):
        super().__init__(model_config, tokenizer)
        # Populated lazily by ``_get_async_tokenizer``.
        self.async_tokenizer: Optional[AsyncMicrobatchTokenizer] = None
        self.async_tokenizer_pool = async_tokenizer_pool

    async def render_prompt(
        self,
        *,
        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
        config: "RenderConfig",
    ) -> list[EngineTokensPrompt]:
        """Render completion inputs into token prompts.

        Every parsed input becomes one coroutine (tokenize for text,
        optional detokenize for token IDs); all of them are awaited
        together. A normalized truncation size of ``0`` short-circuits to an
        empty list.
        """
        keep_last = self._validate_and_normalize_truncate_tokens(
            config.truncate_prompt_tokens, config.max_length)
        if keep_last == 0:
            return []

        def _schedule(parsed):
            # Token input: detokenization is only needed when the caller
            # wants the decoded text back (e.g. echo).
            if parsed["is_tokens"] is True:
                return self._maybe_detokenize(
                    parsed["content"],
                    config.max_length,
                    keep_last,
                    config.cache_salt,
                    config.needs_detokenization,
                )
            # Text input.
            return self._tokenize(
                parsed["content"],
                config.max_length,
                keep_last,
                config.add_special_tokens,
                config.cache_salt,
            )

        pending = [
            _schedule(parsed)
            for parsed in parse_and_batch_prompt(prompt_or_prompts)
        ]
        if not pending:
            return []

        # Run all per-prompt coroutines concurrently, preserving order.
        rendered_prompts = await asyncio.gather(*pending)
        return rendered_prompts

    async def render_prompt_and_embeds(
        self,
        *,
        prompt_or_prompts: Optional[Union[str, list[str], list[int],
                                          list[list[int]]]] = None,
        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
        config: "RenderConfig",
    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
        """Render embedding prompts first, then text/token prompts.

        Embedding prompts (if any) come first in the result, followed by the
        prompts produced by ``render_prompt``. When ``prompt_or_prompts`` is
        ``None`` or the empty string, only the embedding prompts are
        returned.
        """
        keep_last = self._validate_and_normalize_truncate_tokens(
            config.truncate_prompt_tokens, config.max_length)
        if keep_last == 0:
            return []

        combined: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]] = []

        if prompt_embeds is not None:
            embed_prompts = self.load_prompt_embeds(prompt_embeds, keep_last,
                                                    config.cache_salt)
            combined.extend(embed_prompts)

        has_prompts = not (prompt_or_prompts is None
                           or prompt_or_prompts == "")
        if has_prompts:
            combined.extend(await self.render_prompt(
                prompt_or_prompts=prompt_or_prompts,
                config=config,
            ))

        return combined

    def _validate_and_normalize_truncate_tokens(
        self,
        truncate_prompt_tokens: Optional[int],
        max_length: Optional[int],
    ) -> Optional[int]:
        """Normalize ``truncate_prompt_tokens`` and check it against the limit.

        ``None`` and ``0`` pass through unchanged, negative values are
        replaced by ``model_config.max_model_len``, and a value larger than
        ``max_length`` raises ``ValueError``.
        """
        if truncate_prompt_tokens is None:
            return None
        if truncate_prompt_tokens == 0:
            return 0

        effective = (self.model_config.max_model_len
                     if truncate_prompt_tokens < 0 else truncate_prompt_tokens)

        exceeds_limit = max_length is not None and effective > max_length
        if exceeds_limit:
            raise ValueError(
                f"truncate_prompt_tokens ({effective}) "
                f"cannot be greater than max_length ({max_length}). "
                f"Please select a smaller truncation size.")

        return effective

    def _maybe_apply_truncation(
            self, token_ids: list[int],
            truncate_prompt_tokens: Optional[int]) -> list[int]:
        """Return the trailing ``truncate_prompt_tokens`` IDs, if needed."""
        no_truncation = (truncate_prompt_tokens is None
                         or truncate_prompt_tokens >= len(token_ids))
        if no_truncation:
            return token_ids
        return token_ids[-truncate_prompt_tokens:]

    async def _tokenize(
        self,
        text: str,
        max_length: Optional[int],
        truncate_prompt_tokens: Optional[int],
        add_special_tokens: Optional[bool],
        cache_salt: Optional[str],
    ) -> EngineTokensPrompt:
        """Tokenize one text prompt and wrap it into a tokens prompt."""
        tokenize = self._get_async_tokenizer()

        # Encoder configs may request lower-casing before tokenization.
        encoder_config = self.model_config.encoder_config
        if encoder_config is not None and encoder_config.get(
                "do_lower_case", False):
            text = text.lower()

        # Truncation arguments are only passed when truncation is requested.
        call_kwargs = {"add_special_tokens": add_special_tokens}
        if truncate_prompt_tokens is not None:
            call_kwargs["truncation"] = True
            call_kwargs["max_length"] = truncate_prompt_tokens
        encoded = await tokenize(text, **call_kwargs)

        return self._create_tokens_prompt(encoded.input_ids, max_length,
                                          cache_salt, text)

    async def _maybe_detokenize(
        self,
        token_ids: list[int],
        max_length: Optional[int],
        truncate_prompt_tokens: Optional[int],
        cache_salt: Optional[str],
        needs_detokenization: Optional[bool] = False,
    ) -> EngineTokensPrompt:
        """Truncate token IDs, optionally decode them, and build a prompt."""
        kept_ids = self._maybe_apply_truncation(token_ids,
                                                truncate_prompt_tokens)

        decoded_text = None
        if needs_detokenization is True:
            decoded_text = await self._get_async_tokenizer().decode(kept_ids)

        return self._create_tokens_prompt(
            token_ids=kept_ids,
            max_length=max_length,
            cache_salt=cache_salt,
            prompt=decoded_text,
        )

    def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
        """Return the cached async tokenizer, creating it on first use."""
        cached = self.async_tokenizer
        if cached is not None:
            return cached

        base_tokenizer = self.tokenizer
        if base_tokenizer is None:
            raise ValueError(
                "No tokenizer available for text input processing")

        pool = self.async_tokenizer_pool
        if pool is None:
            # No shared pool: keep a private wrapper.
            wrapper = AsyncMicrobatchTokenizer(base_tokenizer)
        else:
            wrapper = pool.get(base_tokenizer)
            if wrapper is None:
                # First user of this tokenizer registers the wrapper.
                wrapper = AsyncMicrobatchTokenizer(base_tokenizer)
                pool[base_tokenizer] = wrapper

        self.async_tokenizer = wrapper
        return wrapper

    def _create_tokens_prompt(
        self,
        token_ids: list[int],
        max_length: Optional[int] = None,
        cache_salt: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> EngineTokensPrompt:
        """Build an ``EngineTokensPrompt`` after enforcing ``max_length``."""
        too_long = max_length is not None and len(token_ids) > max_length
        if too_long:
            raise ValueError(
                f"This maximum context length is {max_length} tokens. "
                f"However, your request has {len(token_ids)} input tokens. "
                "Please reduce the length of the input messages.")

        result = EngineTokensPrompt(prompt_token_ids=token_ids)
        # Optional keys are only set when a value was supplied.
        for key, value in (("cache_salt", cache_salt), ("prompt", prompt)):
            if value is not None:
                result[key] = value
        return result
vllm/entrypoints/tool.py
View file @
38d80967
...
...
@@ -4,6 +4,8 @@ import os
from
abc
import
ABC
,
abstractmethod
from
typing
import
TYPE_CHECKING
,
Any
from
openai_harmony
import
Author
,
Message
,
Role
,
TextContent
from
vllm.logger
import
init_logger
if
TYPE_CHECKING
:
...
...
@@ -99,6 +101,28 @@ class HarmonyPythonTool(Tool):
return
self
.
python_tool
=
PythonTool
()
async def validate(self):
    """Smoke-test the Python tool and disable it if the test fails.

    Does nothing when the tool is already disabled. Otherwise sends a
    ``print('Hello, world!')`` code message to ``self.python_tool`` and
    checks that the first returned message's first content item has the
    text ``"Hello, world!\\n"``.

    Any exception raised during the test, including an unexpected or
    missing result, sets ``self.enabled = False`` and logs a warning; this
    method does not propagate it.
    """
    if not self.enabled:
        return

    expected_output = "Hello, world!\n"
    try:
        message = Message(
            author=Author(role=Role.ASSISTANT),
            content=[TextContent(text="print('Hello, world!')")],
            channel="analysis",
            recipient="python",
            content_type="code",
        )
        msgs = [msg async for msg in self.python_tool.process(message)]

        # Explicit checks instead of `assert`: asserts are removed under
        # `python -O`, which would let a wrong output pass validation.
        if not msgs:
            raise RuntimeError("python tool returned no messages")
        if not msgs[0].content:
            raise RuntimeError("python tool returned a message without content")
        output = msgs[0].content[0].text
        if output != expected_output:
            raise RuntimeError(
                f"unexpected output {output!r}, expected {expected_output!r}")
    except Exception as e:
        # Deliberately broad: any failure here only disables the tool.
        self.enabled = False
        logger.warning_once(
            "Code interpreter tool failed to initialize (%s), code "
            "interpreter is disabled", e)
        return

    logger.info_once("Code interpreter tool initialized")
async
def
get_result
(
self
,
context
:
"ConversationContext"
)
->
Any
:
...
...
vllm/entrypoints/tool_server.py
View file @
38d80967
...
...
@@ -86,7 +86,8 @@ class ToolServer(ABC):
pass
@
abstractmethod
def
new_session
(
self
,
tool_name
:
str
)
->
AbstractAsyncContextManager
[
Any
]:
def
new_session
(
self
,
tool_name
:
str
,
session_id
:
str
)
->
AbstractAsyncContextManager
[
Any
]:
"""
Create a session for the tool.
"""
...
...
@@ -124,7 +125,8 @@ class MCPToolServer(ToolServer):
description
=
tool
.
description
,
parameters
=
tool
.
inputSchema
)
for
tool
in
list_tools_response
.
tools
])
],
)
self
.
harmony_tool_descriptions
[
tool_from_mcp
.
name
]
=
tool_from_mcp
if
tool_from_mcp
.
name
not
in
self
.
urls
:
self
.
urls
[
tool_from_mcp
.
name
]
=
url
...
...
@@ -142,14 +144,16 @@ class MCPToolServer(ToolServer):
return
self
.
harmony_tool_descriptions
.
get
(
tool_name
)
@
asynccontextmanager
async
def
new_session
(
self
,
tool_name
:
str
):
async
def
new_session
(
self
,
tool_name
:
str
,
session_id
:
str
):
from
mcp
import
ClientSession
from
mcp.client.sse
import
sse_client
url
=
self
.
urls
.
get
(
tool_name
)
headers
=
{
"x-session-id"
:
session_id
}
if
not
url
:
raise
KeyError
(
f
"Tool '
{
tool_name
}
' is not supported"
)
async
with
sse_client
(
url
=
url
)
as
streams
,
ClientSession
(
*
streams
)
as
session
:
async
with
sse_client
(
url
=
url
,
headers
=
headers
)
as
streams
,
ClientSession
(
*
streams
)
as
session
:
await
session
.
initialize
()
yield
session
...
...
@@ -158,10 +162,13 @@ class DemoToolServer(ToolServer):
def
__init__
(
self
):
self
.
tools
:
dict
[
str
,
Tool
]
=
{}
async
def
init_and_validate
(
self
):
browser_tool
=
HarmonyBrowserTool
()
python_tool
=
HarmonyPythonTool
()
await
python_tool
.
validate
()
if
browser_tool
.
enabled
:
self
.
tools
[
"browser"
]
=
browser_tool
python_tool
=
HarmonyPythonTool
()
if
python_tool
.
enabled
:
self
.
tools
[
"python"
]
=
python_tool
logger
.
info
(
"DemoToolServer initialized with tools: %s"
,
...
...
@@ -182,7 +189,7 @@ class DemoToolServer(ToolServer):
raise
ValueError
(
f
"Unknown tool
{
tool_name
}
"
)
@
asynccontextmanager
async
def
new_session
(
self
,
tool_name
:
str
):
async
def
new_session
(
self
,
tool_name
:
str
,
session_id
:
str
):
if
tool_name
not
in
self
.
tools
:
raise
KeyError
(
f
"Tool '
{
tool_name
}
' is not supported"
)
yield
self
.
tools
[
tool_name
]
vllm/env_override.py
View file @
38d80967
...
...
@@ -13,24 +13,6 @@ logger = init_logger(__name__)
# that interact with vllm workers.
# they are executed whenever `import vllm` is called.
if
os
.
environ
.
get
(
'NCCL_CUMEM_ENABLE'
,
'0'
)
!=
'0'
:
logger
.
warning
(
"NCCL_CUMEM_ENABLE is set to %s, skipping override. "
"This may increase memory overhead with cudagraph+allreduce: "
"https://github.com/NVIDIA/nccl/issues/1234"
,
os
.
environ
[
'NCCL_CUMEM_ENABLE'
])
elif
not
os
.
path
.
exists
(
'/dev/nvidia-caps-imex-channels'
):
# NCCL requires NCCL_CUMEM_ENABLE to work with
# multi-node NVLink, typically on GB200-NVL72 systems.
# The ultimate way to detect multi-node NVLink is to use
# NVML APIs, which are too expensive to call here.
# As an approximation, we check the existence of
# /dev/nvidia-caps-imex-channels, used by
# multi-node NVLink to communicate across nodes.
# This will still cost some GPU memory, but it is worthwhile
# because we can get very fast cross-node bandwidth with NVLink.
os
.
environ
[
'NCCL_CUMEM_ENABLE'
]
=
'0'
# see https://github.com/vllm-project/vllm/pull/15951
# it avoids unintentional cuda initialization from torch.cuda.is_available()
os
.
environ
[
'PYTORCH_NVML_BASED_CUDA_CHECK'
]
=
'1'
...
...
Prev
1
…
19
20
21
22
23
24
25
26
27
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment