Commit 7a5df8f7 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.1.1' into v0.10.1.1-ori

parents 5876ee95 1da94e67
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared constants for vLLM entrypoints.
"""
# HTTP header limits for h11 parser
# These constants help mitigate header abuse attacks
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
H11_MAX_HEADER_COUNT_DEFAULT = 256
...@@ -14,6 +14,8 @@ from vllm import envs ...@@ -14,6 +14,8 @@ from vllm import envs
from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.async_llm_engine import AsyncEngineDeadError
from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
from vllm.entrypoints.ssl import SSLCertRefresher from vllm.entrypoints.ssl import SSLCertRefresher
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import find_process_using_port from vllm.utils import find_process_using_port
...@@ -26,6 +28,11 @@ async def serve_http(app: FastAPI, ...@@ -26,6 +28,11 @@ async def serve_http(app: FastAPI,
sock: Optional[socket.socket], sock: Optional[socket.socket],
enable_ssl_refresh: bool = False, enable_ssl_refresh: bool = False,
**uvicorn_kwargs: Any): **uvicorn_kwargs: Any):
"""
Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
options. Supports http header limits via h11_max_incomplete_event_size and
h11_max_header_count.
"""
logger.info("Available routes are:") logger.info("Available routes are:")
for route in app.routes: for route in app.routes:
methods = getattr(route, "methods", None) methods = getattr(route, "methods", None)
...@@ -36,7 +43,21 @@ async def serve_http(app: FastAPI, ...@@ -36,7 +43,21 @@ async def serve_http(app: FastAPI,
logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) logger.info("Route: %s, Methods: %s", path, ', '.join(methods))
# Extract header limit options if present
h11_max_incomplete_event_size = uvicorn_kwargs.pop(
"h11_max_incomplete_event_size", None)
h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)
# Set safe defaults if not provided
if h11_max_incomplete_event_size is None:
h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
if h11_max_header_count is None:
h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT
config = uvicorn.Config(app, **uvicorn_kwargs) config = uvicorn.Config(app, **uvicorn_kwargs)
# Set header limits
config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
config.h11_max_header_count = h11_max_header_count
config.load() config.load()
server = uvicorn.Server(config) server = uvicorn.Server(config)
_add_shutdown_handlers(app, server) _add_shutdown_handlers(app, server)
......
...@@ -1894,6 +1894,8 @@ async def run_server_worker(listen_address, ...@@ -1894,6 +1894,8 @@ async def run_server_worker(listen_address,
ssl_certfile=args.ssl_certfile, ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs, ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs, ssl_cert_reqs=args.ssl_cert_reqs,
h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
h11_max_header_count=args.h11_max_header_count,
**uvicorn_kwargs, **uvicorn_kwargs,
) )
......
...@@ -20,6 +20,8 @@ from vllm.config import config ...@@ -20,6 +20,8 @@ from vllm.config import config
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
validate_chat_template) validate_chat_template)
from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" ...@@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
enable_log_outputs: bool = False enable_log_outputs: bool = False
"""If set to True, enable logging of model outputs (generations) """If set to True, enable logging of model outputs (generations)
in addition to the input logging that is enabled by default.""" in addition to the input logging that is enabled by default."""
h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
"""Maximum number of HTTP headers allowed in a request for h11 parser.
Helps mitigate header abuse. Default: 256."""
@staticmethod @staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
......
...@@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser): ...@@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser):
"valid JSON object in tool '%s', will try other " "valid JSON object in tool '%s', will try other "
"methods to parse it.", param_value, param_name, "methods to parse it.", param_value, param_name,
func_name) func_name)
try: logger.warning(
converted_value = eval(param_value) "Parameter '%s' has unknown type '%s'. "
return converted_value "The value will be treated as a string.", param_name,
except Exception: param_type)
logger.warning(
"Parsed value '%s' of parameter '%s' cannot be "
"converted via Python `eval()` in tool '%s', "
"degenerating to string.", param_value, param_name,
func_name)
return param_value return param_value
# Extract function name # Extract function name
......
...@@ -21,7 +21,7 @@ logger = init_logger(__name__) ...@@ -21,7 +21,7 @@ logger = init_logger(__name__)
class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
# enable full CUDA Graph support for decode-only capture # enable full CUDA Graph support for decode-only capture
attn_cudagraph_support: ClassVar[ cudagraph_support: ClassVar[
AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment