Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a5df8f7
Commit
7a5df8f7
authored
Aug 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.1.1' into v0.10.1.1-ori
parents
5876ee95
1da94e67
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
46 additions
and
10 deletions
+46
-10
vllm/entrypoints/constants.py
vllm/entrypoints/constants.py
+10
-0
vllm/entrypoints/launcher.py
vllm/entrypoints/launcher.py
+21
-0
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+2
-0
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+8
-0
vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
...entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+4
-9
vllm/v1/attention/backends/mla/cutlass_mla.py
vllm/v1/attention/backends/mla/cutlass_mla.py
+1
-1
No files found.
vllm/entrypoints/constants.py
0 → 100644
View file @
7a5df8f7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared constants for vLLM entrypoints.
"""
# HTTP header limits for h11 parser
# These constants help mitigate header abuse attacks
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
=
4194304
# 4 MB
H11_MAX_HEADER_COUNT_DEFAULT
=
256
vllm/entrypoints/launcher.py
View file @
7a5df8f7
...
@@ -14,6 +14,8 @@ from vllm import envs
...
@@ -14,6 +14,8 @@ from vllm import envs
from
vllm.engine.async_llm_engine
import
AsyncEngineDeadError
from
vllm.engine.async_llm_engine
import
AsyncEngineDeadError
from
vllm.engine.multiprocessing
import
MQEngineDeadError
from
vllm.engine.multiprocessing
import
MQEngineDeadError
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.constants
import
(
H11_MAX_HEADER_COUNT_DEFAULT
,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
)
from
vllm.entrypoints.ssl
import
SSLCertRefresher
from
vllm.entrypoints.ssl
import
SSLCertRefresher
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
find_process_using_port
from
vllm.utils
import
find_process_using_port
...
@@ -26,6 +28,11 @@ async def serve_http(app: FastAPI,
...
@@ -26,6 +28,11 @@ async def serve_http(app: FastAPI,
sock
:
Optional
[
socket
.
socket
],
sock
:
Optional
[
socket
.
socket
],
enable_ssl_refresh
:
bool
=
False
,
enable_ssl_refresh
:
bool
=
False
,
**
uvicorn_kwargs
:
Any
):
**
uvicorn_kwargs
:
Any
):
"""
Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
options. Supports http header limits via h11_max_incomplete_event_size and
h11_max_header_count.
"""
logger
.
info
(
"Available routes are:"
)
logger
.
info
(
"Available routes are:"
)
for
route
in
app
.
routes
:
for
route
in
app
.
routes
:
methods
=
getattr
(
route
,
"methods"
,
None
)
methods
=
getattr
(
route
,
"methods"
,
None
)
...
@@ -36,7 +43,21 @@ async def serve_http(app: FastAPI,
...
@@ -36,7 +43,21 @@ async def serve_http(app: FastAPI,
logger
.
info
(
"Route: %s, Methods: %s"
,
path
,
', '
.
join
(
methods
))
logger
.
info
(
"Route: %s, Methods: %s"
,
path
,
', '
.
join
(
methods
))
# Extract header limit options if present
h11_max_incomplete_event_size
=
uvicorn_kwargs
.
pop
(
"h11_max_incomplete_event_size"
,
None
)
h11_max_header_count
=
uvicorn_kwargs
.
pop
(
"h11_max_header_count"
,
None
)
# Set safe defaults if not provided
if
h11_max_incomplete_event_size
is
None
:
h11_max_incomplete_event_size
=
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
if
h11_max_header_count
is
None
:
h11_max_header_count
=
H11_MAX_HEADER_COUNT_DEFAULT
config
=
uvicorn
.
Config
(
app
,
**
uvicorn_kwargs
)
config
=
uvicorn
.
Config
(
app
,
**
uvicorn_kwargs
)
# Set header limits
config
.
h11_max_incomplete_event_size
=
h11_max_incomplete_event_size
config
.
h11_max_header_count
=
h11_max_header_count
config
.
load
()
config
.
load
()
server
=
uvicorn
.
Server
(
config
)
server
=
uvicorn
.
Server
(
config
)
_add_shutdown_handlers
(
app
,
server
)
_add_shutdown_handlers
(
app
,
server
)
...
...
vllm/entrypoints/openai/api_server.py
View file @
7a5df8f7
...
@@ -1894,6 +1894,8 @@ async def run_server_worker(listen_address,
...
@@ -1894,6 +1894,8 @@ async def run_server_worker(listen_address,
ssl_certfile
=
args
.
ssl_certfile
,
ssl_certfile
=
args
.
ssl_certfile
,
ssl_ca_certs
=
args
.
ssl_ca_certs
,
ssl_ca_certs
=
args
.
ssl_ca_certs
,
ssl_cert_reqs
=
args
.
ssl_cert_reqs
,
ssl_cert_reqs
=
args
.
ssl_cert_reqs
,
h11_max_incomplete_event_size
=
args
.
h11_max_incomplete_event_size
,
h11_max_header_count
=
args
.
h11_max_header_count
,
**
uvicorn_kwargs
,
**
uvicorn_kwargs
,
)
)
...
...
vllm/entrypoints/openai/cli_args.py
View file @
7a5df8f7
...
@@ -20,6 +20,8 @@ from vllm.config import config
...
@@ -20,6 +20,8 @@ from vllm.config import config
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
optional_type
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
optional_type
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
validate_chat_template
)
validate_chat_template
)
from
vllm.entrypoints.constants
import
(
H11_MAX_HEADER_COUNT_DEFAULT
,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
)
from
vllm.entrypoints.openai.serving_models
import
LoRAModulePath
from
vllm.entrypoints.openai.serving_models
import
LoRAModulePath
from
vllm.entrypoints.openai.tool_parsers
import
ToolParserManager
from
vllm.entrypoints.openai.tool_parsers
import
ToolParserManager
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
...
@@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
enable_log_outputs
:
bool
=
False
enable_log_outputs
:
bool
=
False
"""If set to True, enable logging of model outputs (generations)
"""If set to True, enable logging of model outputs (generations)
in addition to the input logging that is enabled by default."""
in addition to the input logging that is enabled by default."""
h11_max_incomplete_event_size
:
int
=
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
h11_max_header_count
:
int
=
H11_MAX_HEADER_COUNT_DEFAULT
"""Maximum number of HTTP headers allowed in a request for h11 parser.
Helps mitigate header abuse. Default: 256."""
@
staticmethod
@
staticmethod
def
add_cli_args
(
parser
:
FlexibleArgumentParser
)
->
FlexibleArgumentParser
:
def
add_cli_args
(
parser
:
FlexibleArgumentParser
)
->
FlexibleArgumentParser
:
...
...
vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
View file @
7a5df8f7
...
@@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser):
...
@@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser):
"valid JSON object in tool '%s', will try other "
"valid JSON object in tool '%s', will try other "
"methods to parse it."
,
param_value
,
param_name
,
"methods to parse it."
,
param_value
,
param_name
,
func_name
)
func_name
)
try
:
logger
.
warning
(
converted_value
=
eval
(
param_value
)
"Parameter '%s' has unknown type '%s'. "
return
converted_value
"The value will be treated as a string."
,
param_name
,
except
Exception
:
param_type
)
logger
.
warning
(
"Parsed value '%s' of parameter '%s' cannot be "
"converted via Python `eval()` in tool '%s', "
"degenerating to string."
,
param_value
,
param_name
,
func_name
)
return
param_value
return
param_value
# Extract function name
# Extract function name
...
...
vllm/v1/attention/backends/mla/cutlass_mla.py
View file @
7a5df8f7
...
@@ -21,7 +21,7 @@ logger = init_logger(__name__)
...
@@ -21,7 +21,7 @@ logger = init_logger(__name__)
class
CutlassMLAMetadataBuilder
(
MLACommonMetadataBuilder
[
MLACommonMetadata
]):
class
CutlassMLAMetadataBuilder
(
MLACommonMetadataBuilder
[
MLACommonMetadata
]):
# enable full CUDA Graph support for decode-only capture
# enable full CUDA Graph support for decode-only capture
attn_
cudagraph_support
:
ClassVar
[
cudagraph_support
:
ClassVar
[
AttentionCGSupport
]
=
AttentionCGSupport
.
UNIFORM_SINGLE_TOKEN_DECODE
AttentionCGSupport
]
=
AttentionCGSupport
.
UNIFORM_SINGLE_TOKEN_DECODE
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment