Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4497431d
Unverified
Commit
4497431d
authored
Mar 08, 2026
by
Sage
Committed by
GitHub
Mar 08, 2026
Browse files
[Frontend] Add GPU-less render serving path (`vllm launch render`) (#36166)
parent
b7332b05
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
712 additions
and
273 deletions
+712
-273
vllm/entrypoints/cli/launch.py
vllm/entrypoints/cli/launch.py
+5
-7
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+113
-3
vllm/entrypoints/openai/chat_completion/api_router.py
vllm/entrypoints/openai/chat_completion/api_router.py
+0
-29
vllm/entrypoints/openai/completion/api_router.py
vllm/entrypoints/openai/completion/api_router.py
+0
-27
vllm/entrypoints/openai/generate/api_router.py
vllm/entrypoints/openai/generate/api_router.py
+25
-2
vllm/entrypoints/serve/instrumentator/health.py
vllm/entrypoints/serve/instrumentator/health.py
+5
-1
vllm/entrypoints/serve/render/__init__.py
vllm/entrypoints/serve/render/__init__.py
+2
-0
vllm/entrypoints/serve/render/api_router.py
vllm/entrypoints/serve/render/api_router.py
+87
-0
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/render/serving.py
+475
-0
vllm/v1/engine/launch.py
vllm/v1/engine/launch.py
+0
-204
No files found.
vllm/entrypoints/cli/launch.py
View file @
4497431d
...
...
@@ -8,7 +8,7 @@ import uvloop
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.entrypoints.cli.types
import
CLISubcommand
from
vllm.entrypoints.openai.api_server
import
(
build_and_serve
,
build_and_serve
_renderer
,
setup_server
,
)
from
vllm.entrypoints.openai.cli_args
import
(
...
...
@@ -109,19 +109,17 @@ def cmd_init() -> list[CLISubcommand]:
async
def
run_launch_fastapi
(
args
:
argparse
.
Namespace
)
->
None
:
"""Run the online serving layer with FastAPI (no GPU inference)."""
from
vllm.config
import
VllmConfig
from
vllm.v1.engine.launch
import
LaunchEngineClient
# 1. Socket binding
listen_address
,
sock
=
setup_server
(
args
)
# 2.
Create LaunchEngineClient (no GPU)
# 2.
Build and serve the API server
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
model_config
=
engine_args
.
create_model_config
()
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
engine_client
=
LaunchEngineClient
.
from_vllm_config
(
vllm_config
)
# 3. Build app, initialize state, and start serving
shutdown_task
=
await
build_and_serve
(
engine_client
,
listen_address
,
sock
,
args
)
shutdown_task
=
await
build_and_serve_renderer
(
vllm_config
,
listen_address
,
sock
,
args
)
try
:
await
shutdown_task
finally
:
...
...
vllm/entrypoints/openai/api_server.py
View file @
4497431d
...
...
@@ -22,6 +22,7 @@ from fastapi.middleware.cors import CORSMiddleware
from
starlette.datastructures
import
State
import
vllm.envs
as
envs
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
load_chat_template
...
...
@@ -198,7 +199,7 @@ def build_app(
register_sagemaker_api_router
(
app
,
supported_tasks
)
if
any
(
task
in
supported_tasks
for
task
in
(
"generate"
,
"render"
))
:
if
"generate"
in
supported_tasks
:
from
vllm.entrypoints.openai.generate.api_router
import
(
register_generate_api_routers
,
)
...
...
@@ -223,6 +224,13 @@ def build_app(
elastic_ep_attach_router
(
app
)
if
"generate"
in
supported_tasks
or
"render"
in
supported_tasks
:
from
vllm.entrypoints.serve.render.api_router
import
(
attach_router
as
attach_render_router
,
)
attach_render_router
(
app
)
if
"transcription"
in
supported_tasks
:
from
vllm.entrypoints.openai.speech_to_text.api_router
import
(
attach_router
as
register_speech_to_text_api_router
,
...
...
@@ -363,7 +371,7 @@ async def init_app_state(
trust_request_chat_template
=
args
.
trust_request_chat_template
,
)
if
any
(
task
in
supported_tasks
for
task
in
(
"generate"
,
"render"
))
:
if
"generate"
in
supported_tasks
:
from
vllm.entrypoints.openai.generate.api_router
import
init_generate_state
await
init_generate_state
(
...
...
@@ -393,6 +401,64 @@ async def init_app_state(
state
.
server_load_metrics
=
0
async def init_render_app_state(
    vllm_config: VllmConfig,
    state: State,
    args: Namespace,
) -> None:
    """Populate FastAPI app state for a render-only server (no engine).

    Counterpart of :func:`init_app_state` for servers that run without an
    :class:`~vllm.engine.protocol.EngineClient`: the renderer and the IO
    processor are created directly from ``vllm_config`` and wrapped in an
    ``OpenAIServingRender`` handler.

    Args:
        vllm_config: Configuration used to build the renderer and IO processor.
        state: The FastAPI application state object to populate.
        args: Parsed CLI arguments (model names, chat template and tool
            options, logging flags).
    """
    from vllm.entrypoints.chat_utils import load_chat_template
    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
    from vllm.plugins.io_processors import get_io_processor
    from vllm.renderers import renderer_from_config

    # Fall back to the model path when no explicit served names were given.
    model_names = args.served_model_name or [args.model]

    request_logger = (
        RequestLogger(max_log_len=args.max_log_len)
        if args.enable_log_requests
        else None
    )

    renderer = renderer_from_config(vllm_config)
    io_processor = get_io_processor(
        vllm_config,
        renderer,
        vllm_config.model_config.io_processor_plugin,
    )
    chat_template = load_chat_template(args.chat_template)

    render_handler = OpenAIServingRender(
        model_config=vllm_config.model_config,
        renderer=renderer,
        io_processor=io_processor,
        served_model_names=model_names,
        request_logger=request_logger,
        chat_template=chat_template,
        chat_template_content_format=args.chat_template_content_format,
        trust_request_chat_template=args.trust_request_chat_template,
        enable_auto_tools=args.enable_auto_tool_choice,
        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
        tool_parser=args.tool_call_parser,
        default_chat_template_kwargs=args.default_chat_template_kwargs,
        log_error_stack=args.log_error_stack,
    )
    state.openai_serving_render = render_handler
    # The render handler doubles as the models handler so the models
    # endpoint can be served from it.
    state.openai_serving_models = render_handler

    state.vllm_config = vllm_config
    # No engine exists, so there is nothing to poll for stats and no client.
    state.log_stats = False
    state.engine_client = None
    state.args = args
    state.enable_server_load_tracking = False
    state.server_load_metrics = 0
def
create_server_socket
(
addr
:
tuple
[
str
,
int
])
->
socket
.
socket
:
family
=
socket
.
AF_INET
if
is_valid_ipv6_address
(
addr
[
0
]):
...
...
@@ -494,7 +560,6 @@ async def build_and_serve(
supported_tasks
=
await
engine_client
.
get_supported_tasks
()
logger
.
info
(
"Supported tasks: %s"
,
supported_tasks
)
app
=
build_app
(
args
,
supported_tasks
)
await
init_app_state
(
engine_client
,
app
.
state
,
args
,
supported_tasks
)
...
...
@@ -522,6 +587,51 @@ async def build_and_serve(
)
async def build_and_serve_renderer(
    vllm_config: VllmConfig,
    listen_address: str,
    sock: socket.socket,
    args: Namespace,
    **uvicorn_kwargs,
) -> asyncio.Task:
    """Build the render-only FastAPI app, initialise its state and serve it.

    Args:
        vllm_config: Configuration handed to :func:`init_render_app_state`.
        listen_address: Address string used only for the startup log line.
        sock: Pre-bound listening socket passed through to ``serve_http``.
        args: Parsed CLI arguments providing the HTTP/SSL server options.
        **uvicorn_kwargs: Extra keyword arguments forwarded to ``serve_http``.

    Returns:
        The value produced by awaiting ``serve_http`` (the shutdown task the
        caller is expected to await).
    """
    # Uvicorn log config comes from a file or carries an endpoint filter.
    if (log_config := get_uvicorn_log_config(args)) is not None:
        uvicorn_kwargs["log_config"] = log_config

    app = build_app(args, ("render",))
    await init_render_app_state(vllm_config, app.state, args)

    logger.info("Starting vLLM server on %s", listen_address)

    server_options = {
        "enable_ssl_refresh": args.enable_ssl_refresh,
        "host": args.host,
        "port": args.port,
        "log_level": args.uvicorn_log_level,
        # NOTE: When the 'disable_uvicorn_access_log' value is True,
        # no access log will be output.
        "access_log": not args.disable_uvicorn_access_log,
        "timeout_keep_alive": envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
        "ssl_keyfile": args.ssl_keyfile,
        "ssl_certfile": args.ssl_certfile,
        "ssl_ca_certs": args.ssl_ca_certs,
        "ssl_cert_reqs": args.ssl_cert_reqs,
        "ssl_ciphers": args.ssl_ciphers,
        "h11_max_incomplete_event_size": args.h11_max_incomplete_event_size,
        "h11_max_header_count": args.h11_max_header_count,
    }
    # Two separate ** unpackings (not a merged dict) so that a key duplicated
    # in uvicorn_kwargs still raises TypeError instead of silently overriding.
    return await serve_http(app, sock=sock, **server_options, **uvicorn_kwargs)
async
def
run_server
(
args
,
**
uvicorn_kwargs
)
->
None
:
"""Run a single-worker API server."""
...
...
vllm/entrypoints/openai/chat_completion/api_router.py
View file @
4497431d
...
...
@@ -71,34 +71,5 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
return
StreamingResponse
(
content
=
generator
,
media_type
=
"text/event-stream"
)
@router.post(
    "/v1/chat/completions/render",
    dependencies=[Depends(validate_json_request)],
    response_model=list,
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
    },
)
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    """Render chat completion request and return conversation and engine
    prompts without generating.

    Every error path returns an explicit ``JSONResponse`` carrying the
    error's own status code; successful renders are returned as JSON too.
    """
    handler = chat(raw_request)
    if handler is None:
        base_server = raw_request.app.state.openai_serving_tokenization
        error = base_server.create_error_response(
            message="The model does not support Chat Completions API"
        )
        # Wrap the error explicitly: the route declares response_model=list,
        # so a bare ErrorResponse model would not match the declared shape.
        return JSONResponse(content=error.model_dump(), status_code=error.error.code)

    result = await handler.render_chat_request(request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)

    return JSONResponse(content=result)
def attach_router(app: FastAPI):
    """Register this module's ``router`` on the given FastAPI application."""
    app.include_router(router)
vllm/entrypoints/openai/completion/api_router.py
View file @
4497431d
...
...
@@ -69,32 +69,5 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
return
StreamingResponse
(
content
=
generator
,
media_type
=
"text/event-stream"
)
@router.post(
    "/v1/completions/render",
    dependencies=[Depends(validate_json_request)],
    response_model=list,
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
async def render_completion(request: CompletionRequest, raw_request: Request):
    """Render completion request and return engine prompts without generating.

    Every error path returns an explicit ``JSONResponse`` carrying the
    error's own status code; successful renders are returned as JSON too.
    """
    handler = completion(raw_request)
    if handler is None:
        base_server = raw_request.app.state.openai_serving_tokenization
        error = base_server.create_error_response(
            message="The model does not support Completions API"
        )
        # Wrap the error explicitly: the route declares response_model=list,
        # so a bare ErrorResponse model would not match the declared shape.
        return JSONResponse(content=error.model_dump(), status_code=error.error.code)

    result = await handler.render_completion_request(request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)

    return JSONResponse(content=result)
def attach_router(app: FastAPI):
    """Register this module's ``router`` on the given FastAPI application."""
    app.include_router(router)
vllm/entrypoints/openai/generate/api_router.py
View file @
4497431d
...
...
@@ -111,7 +111,7 @@ async def init_generate_state(
enable_log_outputs
=
args
.
enable_log_outputs
,
enable_log_deltas
=
args
.
enable_log_deltas
,
)
if
any
(
task
in
supported_tasks
for
task
in
(
"generate"
,
"render"
))
if
"generate"
in
supported_tasks
else
None
)
# Warm up chat template processing to avoid first-request latency
...
...
@@ -126,7 +126,7 @@ async def init_generate_state(
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
enable_force_include_usage
=
args
.
enable_force_include_usage
,
)
if
any
(
task
in
supported_tasks
for
task
in
(
"generate"
,
"render"
))
if
"generate"
in
supported_tasks
else
None
)
state
.
anthropic_serving_messages
=
(
...
...
@@ -160,3 +160,26 @@ async def init_generate_state(
if
"generate"
in
supported_tasks
else
None
)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
state
.
openai_serving_render
=
OpenAIServingRender
(
model_config
=
engine_client
.
model_config
,
renderer
=
engine_client
.
renderer
,
io_processor
=
engine_client
.
io_processor
,
served_model_names
=
[
mp
.
name
for
mp
in
state
.
openai_serving_models
.
base_model_paths
],
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
enable_auto_tools
=
args
.
enable_auto_tool_choice
,
exclude_tools_when_tool_choice_none
=
args
.
exclude_tools_when_tool_choice_none
,
tool_parser
=
args
.
tool_call_parser
,
default_chat_template_kwargs
=
args
.
default_chat_template_kwargs
,
log_error_stack
=
args
.
log_error_stack
,
)
vllm/entrypoints/serve/instrumentator/health.py
View file @
4497431d
...
...
@@ -22,8 +22,12 @@ def engine_client(request: Request) -> EngineClient:
@router.get("/health", response_class=Response)
async def health(raw_request: Request) -> Response:
    """Health check.

    Returns 200 when there is no engine client or when the engine's health
    check succeeds, and 503 when the check raises ``EngineDeadError``.
    """
    client = engine_client(raw_request)
    if client is None:
        # Render-only servers have no engine; they are always healthy.
        return Response(status_code=200)
    try:
        # Probe the engine exactly once, through the client resolved above.
        await client.check_health()
        return Response(status_code=200)
    except EngineDeadError:
        return Response(status_code=503)
vllm/entrypoints/serve/render/__init__.py
0 → 100644
View file @
4497431d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
vllm/entrypoints/serve/render/api_router.py
0 → 100644
View file @
4497431d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
FastAPI
,
Request
from
fastapi.responses
import
JSONResponse
from
vllm.entrypoints.openai.chat_completion.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.completion.protocol
import
CompletionRequest
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
router
=
APIRouter
()
def render(request: Request) -> OpenAIServingRender | None:
    """Return the render handler stored on the app state, or ``None`` if unset."""
    app_state = request.app.state
    return getattr(app_state, "openai_serving_render", None)
@router.post(
    "/v1/chat/completions/render",
    dependencies=[Depends(validate_json_request)],
    response_model=list,
    responses={
        status.value: {"model": ErrorResponse}
        for status in (
            HTTPStatus.BAD_REQUEST,
            HTTPStatus.NOT_FOUND,
            HTTPStatus.NOT_IMPLEMENTED,
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    },
)
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    """Render a chat completion request without generating.

    Responds with 404 when no render handler is registered on the app state,
    with the handler's own error code when rendering yields an
    ``ErrorResponse``, and otherwise with the rendered result as JSON.
    """
    serving_render = render(raw_request)
    if serving_render is None:
        not_found = HTTPStatus.NOT_FOUND
        missing = create_error_response(
            message="The model does not support Chat Completions Render API",
            err_type="NotFoundError",
            status_code=not_found,
        )
        return JSONResponse(status_code=not_found, content=missing.model_dump())

    rendered = await serving_render.render_chat_request(request)

    if not isinstance(rendered, ErrorResponse):
        return JSONResponse(content=rendered)

    return JSONResponse(
        content=rendered.model_dump(),
        status_code=rendered.error.code,
    )
@router.post(
    "/v1/completions/render",
    dependencies=[Depends(validate_json_request)],
    response_model=list,
    responses={
        status.value: {"model": ErrorResponse}
        for status in (
            HTTPStatus.BAD_REQUEST,
            HTTPStatus.NOT_FOUND,
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    },
)
async def render_completion(request: CompletionRequest, raw_request: Request):
    """Render a completion request without generating.

    Responds with 404 when no render handler is registered on the app state,
    with the handler's own error code when rendering yields an
    ``ErrorResponse``, and otherwise with the rendered result as JSON.
    """
    serving_render = render(raw_request)
    if serving_render is None:
        not_found = HTTPStatus.NOT_FOUND
        missing = create_error_response(
            message="The model does not support Completions Render API",
            err_type="NotFoundError",
            status_code=not_found,
        )
        return JSONResponse(status_code=not_found, content=missing.model_dump())

    rendered = await serving_render.render_completion_request(request)

    if not isinstance(rendered, ErrorResponse):
        return JSONResponse(content=rendered)

    return JSONResponse(
        content=rendered.model_dump(),
        status_code=rendered.error.code,
    )
def attach_router(app: FastAPI) -> None:
    """Register the render endpoints' ``router`` on the given FastAPI app."""
    app.include_router(router)
vllm/entrypoints/serve/render/serving.py
0 → 100644
View file @
4497431d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
sys
import
traceback
from
collections.abc
import
Callable
,
Sequence
from
http
import
HTTPStatus
from
typing
import
Any
import
jinja2
from
openai_harmony
import
Message
as
OpenAIMessage
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
ConversationMessage
,
)
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.chat_completion.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.completion.protocol
import
CompletionRequest
from
vllm.entrypoints.openai.engine.protocol
import
(
ErrorInfo
,
ErrorResponse
,
ModelCard
,
ModelList
,
ModelPermission
,
)
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
get_developer_message
,
get_system_message
,
parse_chat_inputs_to_harmony_messages
,
render_for_completion
,
)
from
vllm.entrypoints.utils
import
sanitize_message
from
vllm.inputs.data
import
ProcessorInputs
,
PromptType
,
SingletonPrompt
,
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.parser
import
ParserManager
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
,
prompt_to_seq
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
from
vllm.utils.mistral
import
is_mistral_tokenizer
from
vllm.utils.mistral
import
mt
as
_mt
logger
=
init_logger
(
__name__
)
class OpenAIServingRender:
    """Serving handler that renders requests into prompts without an engine.

    Holds a renderer, model configuration and chat-template/tool settings,
    and turns chat-completion and completion requests into engine prompts.
    It also lists the served models and builds ``ErrorResponse`` objects.
    Most methods are copies of their ``OpenAIServing*`` counterparts with the
    engine-client parts removed (see the individual docstrings).
    """

    def __init__(
        self,
        model_config: ModelConfig,
        renderer: BaseRenderer,
        io_processor: Any,
        served_model_names: list[str],
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        trust_request_chat_template: bool = False,
        enable_auto_tools: bool = False,
        exclude_tools_when_tool_choice_none: bool = False,
        tool_parser: str | None = None,
        default_chat_template_kwargs: dict[str, Any] | None = None,
        log_error_stack: bool = False,
    ) -> None:
        """Store the configuration and resolve the tool parser.

        Args:
            model_config: Model configuration; also decides ``use_harmony``.
            renderer: Renderer used for chat and completion preprocessing.
            io_processor: IO processor instance (stored, not used here).
            served_model_names: Model names accepted in ``request.model``.
            request_logger: Optional request logger (stored, not used here).
            chat_template: Default chat template, if any.
            chat_template_content_format: Default template content format.
            trust_request_chat_template: Allow templates passed per request.
            enable_auto_tools: Whether ``tool_choice="auto"`` is enabled.
            exclude_tools_when_tool_choice_none: Drop tools from the prompt
                when the request sets ``tool_choice="none"``.
            tool_parser: Name of the tool parser to look up.
            default_chat_template_kwargs: Default template keyword arguments.
            log_error_stack: Print a stack trace when building error responses.
        """
        self.model_config = model_config
        self.renderer = renderer
        self.io_processor = io_processor
        self.served_model_names = served_model_names
        self.request_logger = request_logger
        self.chat_template = chat_template
        self.chat_template_content_format: ChatTemplateContentFormatOption = (
            chat_template_content_format
        )
        self.trust_request_chat_template = trust_request_chat_template
        self.enable_auto_tools = enable_auto_tools
        self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
        self.tool_parser: Callable[[TokenizerLike], ToolParser] | None = (
            ParserManager.get_tool_parser(
                tool_parser_name=tool_parser,
                enable_auto_tools=enable_auto_tools,
                model_name=model_config.model,
            )
        )
        self.default_chat_template_kwargs: dict[str, Any] = (
            default_chat_template_kwargs or {}
        )
        self.log_error_stack = log_error_stack
        # GPT-OSS models are rendered through the harmony message path.
        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
        self.supports_browsing = False
        self.supports_code_interpreter = False

    async def render_chat_request(
        self,
        request: ChatCompletionRequest,
    ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
        """Copied from OpenAIServingChat.render_chat_request.

        Differences: engine_client.errored check removed (no engine client).

        Returns:
            ``(conversation, engine_prompts)`` on success, or an
            ``ErrorResponse`` when the model is unknown, the tool/template
            settings are rejected, or preprocessing raises one of the
            handled exception types.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            logger.error("Error with model %s", error_check_ret)
            return error_check_ret

        try:
            tokenizer = self.renderer.tokenizer

            tool_parser = self.tool_parser

            if is_mistral_tokenizer(tokenizer):
                # because of issues with pydantic we need to potentially
                # re-serialize the tool_calls field of the request
                # for more info: see comment in `maybe_serialize_tool_calls`
                _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
                _mt.truncate_tool_call_ids(request)  # type: ignore[arg-type]
                _mt.validate_request_params(request)

            # Check if tool parsing is unavailable (common condition)
            tool_parsing_unavailable = (
                tool_parser is None
                and not is_mistral_tokenizer(tokenizer)
                and not self.use_harmony
            )

            # Validate tool_choice when tool parsing is required but unavailable
            if tool_parsing_unavailable and request.tool_choice not in (
                None,
                "none",
            ):
                if request.tool_choice == "auto" and not self.enable_auto_tools:
                    # for hf tokenizers, "auto" tools requires
                    # --enable-auto-tool-choice and --tool-call-parser
                    return self.create_error_response(
                        '"auto" tool choice requires '
                        "--enable-auto-tool-choice and --tool-call-parser to be set"
                    )
                elif request.tool_choice != "auto":
                    # "required" or named tool requires tool parser
                    return self.create_error_response(
                        f'tool_choice="{request.tool_choice}" requires '
                        "--tool-call-parser to be set"
                    )

            if request.tools is None or (
                request.tool_choice == "none"
                and self.exclude_tools_when_tool_choice_none
            ):
                tool_dicts = None
            else:
                tool_dicts = [tool.model_dump() for tool in request.tools]

            if not self.use_harmony:
                # Common case.
                error_check_ret = self._validate_chat_template(
                    request_chat_template=request.chat_template,
                    chat_template_kwargs=request.chat_template_kwargs,
                    trust_request_chat_template=self.trust_request_chat_template,
                )
                if error_check_ret is not None:
                    return error_check_ret

                conversation, engine_prompts = await self._preprocess_chat(
                    request,
                    request.messages,
                    default_template=self.chat_template,
                    default_template_content_format=self.chat_template_content_format,
                    default_template_kwargs=self.default_chat_template_kwargs,
                    tool_dicts=tool_dicts,
                    tool_parser=tool_parser,
                )
            else:
                # For GPT-OSS.
                should_include_tools = tool_dicts is not None
                conversation, engine_prompts = self._make_request_with_harmony(
                    request, should_include_tools
                )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        return conversation, engine_prompts

    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[ProcessorInputs] | ErrorResponse:
        """Copied from OpenAIServingCompletion.render_completion_request.

        Differences: engine_client.errored check removed (no engine client).

        Returns:
            The rendered engine prompts, or an ``ErrorResponse`` when the
            model is unknown, an unsupported option combination is used, or
            preprocessing raises one of the handled exception types.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        try:
            engine_prompts = await self._preprocess_completion(
                request,
                prompt_input=request.prompt,
                prompt_embeds=request.prompt_embeds,
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        return engine_prompts

    def _make_request_with_harmony(
        self,
        request: ChatCompletionRequest,
        should_include_tools: bool = True,
    ):
        """Copied from OpenAIServingChat._make_request_with_harmony.

        Builds the harmony message list (system, optional developer, then the
        request's chat messages), renders it to prompt token ids and returns
        ``(messages, [engine_prompt])``.
        """
        messages: list[OpenAIMessage] = []

        # because of issues with pydantic we need to potentially
        # re-serialize the tool_calls field of the request
        # for more info: see comment in `maybe_serialize_tool_calls`
        _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]

        # Add system message.
        # NOTE: In Chat Completion API, browsing is enabled by default
        # if the model supports it. TODO: Support browsing.
        assert not self.supports_browsing
        assert not self.supports_code_interpreter
        sys_msg = get_system_message(
            reasoning_effort=request.reasoning_effort,
            browser_description=None,
            python_description=None,
            with_custom_tools=should_include_tools,
        )
        messages.append(sys_msg)

        # Add developer message.
        if request.tools:
            dev_msg = get_developer_message(
                tools=request.tools if should_include_tools else None  # type: ignore[arg-type]
            )
            messages.append(dev_msg)

        # Add user message.
        messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))

        # Render prompt token ids.
        prompt_token_ids = render_for_completion(messages)
        engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

        # Add cache_salt if provided in the request
        if request.cache_salt is not None:
            engine_prompt["cache_salt"] = request.cache_salt

        return messages, [engine_prompt]

    async def show_available_models(self) -> ModelList:
        """Returns the models served by this render server."""
        max_model_len = self.model_config.max_model_len
        return ModelList(
            data=[
                ModelCard(
                    id=name,
                    max_model_len=max_model_len,
                    root=self.model_config.model,
                    permission=[ModelPermission()],
                )
                for name in self.served_model_names
            ]
        )

    def create_error_response(
        self,
        message: str | Exception,
        err_type: str = "BadRequestError",
        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
        param: str | None = None,
    ) -> ErrorResponse:
        """Copied from OpenAIServing.create_error_response.

        Differences: Jinja template errors are matched with ``isinstance``
        so that subclasses of ``jinja2.TemplateError`` are also reported as
        bad requests instead of internal server errors.

        Args:
            message: Error text, or an exception to classify. When an
                exception is given, ``err_type``, ``status_code`` and
                ``param`` are derived from its type and the passed values
                are ignored.
            err_type: Error type string used for plain-text messages.
            status_code: HTTP status used for plain-text messages.
            param: Name of the offending request parameter, if any.

        Returns:
            An ``ErrorResponse`` with a sanitized message.
        """
        exc: Exception | None = None

        if isinstance(message, Exception):
            exc = message

            from vllm.exceptions import VLLMValidationError

            if isinstance(exc, VLLMValidationError):
                err_type = "BadRequestError"
                status_code = HTTPStatus.BAD_REQUEST
                param = exc.parameter
            elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
                # Common validation errors from user input
                err_type = "BadRequestError"
                status_code = HTTPStatus.BAD_REQUEST
                param = None
            elif isinstance(exc, NotImplementedError):
                err_type = "NotImplementedError"
                status_code = HTTPStatus.NOT_IMPLEMENTED
                param = None
            elif isinstance(exc, jinja2.TemplateError):
                # Covers jinja2.TemplateError and all of its subclasses;
                # callers in this class catch jinja2.TemplateError, so a
                # subclass instance must map to the same 400 response.
                err_type = "BadRequestError"
                status_code = HTTPStatus.BAD_REQUEST
                param = None
            else:
                err_type = "InternalServerError"
                status_code = HTTPStatus.INTERNAL_SERVER_ERROR
                param = None

            message = str(exc)

        if self.log_error_stack:
            # Print the active exception's traceback when there is one,
            # otherwise the current call stack.
            exc_type, _, _ = sys.exc_info()
            if exc_type is not None:
                traceback.print_exc()
            else:
                traceback.print_stack()

        return ErrorResponse(
            error=ErrorInfo(
                message=sanitize_message(message),
                type=err_type,
                code=status_code.value,
                param=param,
            )
        )

    def _is_model_supported(self, model_name: str) -> bool:
        """Simplified from OpenAIServing._is_model_supported (no LoRA support)."""
        return model_name in self.served_model_names

    async def _check_model(
        self,
        request: Any,
    ) -> ErrorResponse | None:
        """Simplified from OpenAIServing._check_model (no LoRA support).

        Returns ``None`` when ``request.model`` is one of the served model
        names, otherwise a 404 ``ErrorResponse``.
        """
        if self._is_model_supported(request.model):
            return None
        return self.create_error_response(
            message=f"The model `{request.model}` does not exist.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND,
            param="model",
        )

    def _validate_chat_template(
        self,
        request_chat_template: str | None,
        chat_template_kwargs: dict[str, Any] | None,
        trust_request_chat_template: bool,
    ) -> ErrorResponse | None:
        """Copied from OpenAIServing._validate_chat_template.

        Rejects a request-supplied chat template (given directly or through
        the ``chat_template`` key of ``chat_template_kwargs``) unless request
        templates are trusted. Returns ``None`` when the request is allowed.
        """
        if not trust_request_chat_template and (
            request_chat_template is not None
            or (
                chat_template_kwargs
                and chat_template_kwargs.get("chat_template") is not None
            )
        ):
            return self.create_error_response(
                "Chat template is passed with request, but "
                "--trust-request-chat-template is not set. "
                "Refused request with untrusted chat template."
            )
        return None

    async def _preprocess_completion(
        self,
        request: Any,
        prompt_input: str | list[str] | list[int] | list[list[int]] | None,
        prompt_embeds: bytes | list[bytes] | None,
    ) -> list[ProcessorInputs]:
        """Copied from OpenAIServing._preprocess_completion.

        Collects the prompts (embeds first, then text/token prompts) and
        renders them through :meth:`_preprocess_cmpl`.
        """
        prompts = list[SingletonPrompt | bytes]()
        if prompt_embeds is not None:  # embeds take higher priority
            prompts.extend(prompt_to_seq(prompt_embeds))
        if prompt_input is not None:
            prompts.extend(prompt_to_seq(prompt_input))
        return await self._preprocess_cmpl(request, prompts)

    async def _preprocess_cmpl(
        self,
        request: Any,
        prompts: Sequence[PromptType | bytes],
    ) -> list[ProcessorInputs]:
        """Copied from OpenAIServing._preprocess_cmpl.

        ``bytes`` prompts are passed through unchanged; all others are parsed
        with ``parse_model_prompt`` before being rendered.
        """
        renderer = self.renderer
        model_config = self.model_config

        parsed_prompts = [
            (
                prompt
                if isinstance(prompt, bytes)
                else parse_model_prompt(model_config, prompt)
            )
            for prompt in prompts
        ]
        tok_params = request.build_tok_params(model_config)

        return await renderer.render_cmpl_async(
            parsed_prompts,
            tok_params,
            # Forward only the optional extras that are set on the request.
            prompt_extras={
                k: v
                for k in ("mm_processor_kwargs", "cache_salt")
                if (v := getattr(request, k, None)) is not None
            },
        )

    async def _preprocess_chat(
        self,
        request: Any,
        messages: list[Any],
        default_template: str | None,
        default_template_content_format: ChatTemplateContentFormatOption,
        default_template_kwargs: dict[str, Any] | None,
        tool_dicts: list[dict[str, Any]] | None = None,
        tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
    ) -> tuple[list[ConversationMessage], list[ProcessorInputs]]:
        """Copied from OpenAIServing._preprocess_chat.

        Differences: isinstance check is ChatCompletionRequest-only
        (ResponsesRequest not supported here); TODO comment dropped accordingly.

        Returns:
            ``(conversation, [engine_prompt])`` for the single rendered chat.

        Raises:
            NotImplementedError: If a tool parser is active (``tool_choice``
                is not ``"none"``) and ``request`` is not a
                ``ChatCompletionRequest``.
        """
        renderer = self.renderer

        default_template_kwargs = merge_kwargs(
            default_template_kwargs,
            dict(
                tools=tool_dicts,
                tokenize=is_mistral_tokenizer(renderer.tokenizer),
            ),
        )

        tok_params = request.build_tok_params(self.model_config)
        chat_params = request.build_chat_params(
            default_template, default_template_content_format
        ).with_defaults(default_template_kwargs)

        # Exactly one conversation is rendered, so unpack the 1-element results.
        (conversation,), (engine_prompt,) = await renderer.render_chat_async(
            [messages],
            chat_params,
            tok_params,
            prompt_extras={
                k: v
                for k in ("mm_processor_kwargs", "cache_salt")
                if (v := getattr(request, k, None)) is not None
            },
        )

        # tool parsing is done only if a tool_parser has been set and if
        # tool_choice is not "none" (if tool_choice is "none" but a tool_parser
        # is set, we want to prevent parsing a tool_call hallucinated by the LLM
        if tool_parser is not None:
            tool_choice = getattr(request, "tool_choice", "none")
            if tool_choice != "none":
                if not isinstance(request, ChatCompletionRequest):
                    msg = (
                        "Tool usage is only supported "
                        " for ChatCompletionRequest, but got "
                        f"{type(request).__name__}"
                    )
                    raise NotImplementedError(msg)
                tokenizer = renderer.get_tokenizer()
                request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore[arg-type]

        return conversation, [engine_prompt]
vllm/v1/engine/launch.py
deleted
100644 → 0
View file @
b7332b05
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LaunchEngineClient: A lightweight EngineClient for GPU-less online serving.
This implements the EngineClient protocol without AsyncLLM or EngineCore,
enabling preprocessing (tokenization, rendering) and postprocessing
(detokenization) without GPU inference.
"""
from
collections.abc
import
AsyncGenerator
,
Iterable
,
Mapping
from
typing
import
Any
from
vllm.config
import
VllmConfig
from
vllm.engine.protocol
import
EngineClient
,
StreamingInput
from
vllm.inputs
import
ProcessorInputs
,
PromptType
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.plugins.io_processors
import
get_io_processor
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers
import
renderer_from_config
from
vllm.sampling_params
import
SamplingParams
from
vllm.tasks
import
SupportedTask
from
vllm.v1.engine
import
EngineCoreRequest
,
PauseMode
from
vllm.v1.engine.input_processor
import
InputProcessor
logger
=
init_logger
(
__name__
)
class LaunchEngineClient(EngineClient):
    """EngineClient for GPU-less serving: preprocessing/postprocessing only.

    Acts as a Null Object at the EngineClient level and bypasses AsyncLLM
    entirely. The constructor builds a renderer, an io_processor and an
    input_processor for tokenization and rendering. Inference entry points
    (``generate`` and ``encode``) raise ``NotImplementedError``; the
    remaining control, cache, power and observability hooks are no-ops or
    return fixed values.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
    ) -> None:
        """Store the config and build the rendering/preprocessing helpers.

        Args:
            vllm_config: Configuration used to construct the renderer, the
                IO processor and the input processor.
        """
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config

        active_renderer = renderer_from_config(self.vllm_config)
        self.renderer = active_renderer

        plugin_name = self.model_config.io_processor_plugin
        self.io_processor = get_io_processor(
            self.vllm_config,
            self.renderer,
            plugin_name,
        )

        # Convert TokPrompt --> EngineCoreRequest.
        self.input_processor = InputProcessor(self.vllm_config, active_renderer)

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
    ) -> "LaunchEngineClient":
        """Alternate constructor: build a client from a VllmConfig without GPU."""
        client = cls(vllm_config=vllm_config)
        return client

    # -- Task support --

    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        """Return the single supported task, ``"render"``."""
        supported = ("render",)
        return supported

    # -- Inference (not supported) --

    async def generate(
        self,
        prompt: EngineCoreRequest
        | PromptType
        | ProcessorInputs
        | AsyncGenerator[StreamingInput, None],
        sampling_params: SamplingParams,
        request_id: str,
        *,
        prompt_text: str | None = None,
        lora_request: LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        data_parallel_rank: int | None = None,
        reasoning_ended: bool | None = None,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Reject generation requests.

        All arguments are accepted for interface compatibility and ignored.

        Raises:
            NotImplementedError: Always, on the first iteration step.
        """
        message = (
            "LaunchEngineClient does not support inference. "
            "Use vllm serve for generation requests."
        )
        raise NotImplementedError(message)
        # The unreachable yield is what makes this an async generator.
        yield  # type: ignore[misc]  # pragma: no cover

    # -- Request management (no-op) --

    async def abort(
        self, request_id: str | Iterable[str], internal: bool = False
    ) -> None:
        """Do nothing; the arguments are ignored."""

    # -- Generation control (no-op) --

    async def pause_generation(
        self,
        *,
        mode: PauseMode = "abort",
        wait_for_inflight_requests: bool | None = None,
        clear_cache: bool = True,
    ) -> None:
        """Do nothing; the arguments are ignored."""

    async def resume_generation(self) -> None:
        """Do nothing."""

    async def is_paused(self) -> bool:
        """Always report ``False``."""
        return False

    def shutdown(self, timeout: float | None = None) -> None:
        """Do nothing; ``timeout`` is ignored."""

    async def encode(
        self,
        prompt: PromptType | ProcessorInputs,
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        tokenization_kwargs: dict[str, Any] | None = None,
        reasoning_ended: bool | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Reject encoding requests.

        All arguments are accepted for interface compatibility and ignored.

        Raises:
            NotImplementedError: Always, on the first iteration step.
        """
        message = (
            "LaunchEngineClient does not support inference. "
            "Use vllm serve for encoding requests."
        )
        raise NotImplementedError(message)
        # The unreachable yield is what makes this an async generator.
        yield  # type: ignore[misc]  # pragma: no cover

    # -- Observability (no-op / defaults) --

    async def is_tracing_enabled(self) -> bool:
        """Always report ``False``."""
        return False

    async def do_log_stats(self) -> None:
        """Do nothing."""

    async def check_health(self) -> None:
        """Do nothing; never raises."""

    async def start_profile(self) -> None:
        """Do nothing."""

    async def stop_profile(self) -> None:
        """Do nothing."""

    # -- Cache management (no-op) --

    async def reset_mm_cache(self) -> None:
        """Do nothing."""

    async def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        """Always return ``True``; the arguments are ignored."""
        return True

    async def reset_encoder_cache(self) -> None:
        """Do nothing."""

    # -- Power management (no-op) --

    async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        """Do nothing; the arguments are ignored."""

    async def wake_up(self, tags: list[str] | None = None) -> None:
        """Do nothing; ``tags`` is ignored."""

    async def is_sleeping(self) -> bool:
        """Always report ``False``."""
        return False

    # -- LoRA (not supported) --

    async def add_lora(self, lora_request: LoRARequest) -> bool:
        """Always return ``False``; ``lora_request`` is ignored."""
        return False

    # -- Status properties --

    @property
    def is_running(self) -> bool:
        """Always ``True``."""
        return True

    @property
    def is_stopped(self) -> bool:
        """Always ``False``."""
        return False

    @property
    def errored(self) -> bool:
        """Always ``False``."""
        return False

    @property
    def dead_error(self) -> BaseException:
        """Return (not raise) a fresh ``RuntimeError`` on each access."""
        failure = RuntimeError("LaunchEngineClient does not support inference")
        return failure
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment