Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9dbb4c8d
Commit
9dbb4c8d
authored
Aug 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.1.1' into v0.10.1.1-dev
parents
034aae03
1da94e67
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
106 additions
and
30 deletions
+106
-30
vllm/entrypoints/constants.py
vllm/entrypoints/constants.py
+10
-0
vllm/entrypoints/launcher.py
vllm/entrypoints/launcher.py
+21
-0
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+2
-0
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+8
-0
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+9
-1
vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
...entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+4
-9
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+5
-5
vllm/model_executor/layers/quantization/mxfp4.py
vllm/model_executor/layers/quantization/mxfp4.py
+46
-14
vllm/v1/attention/backends/mla/cutlass_mla.py
vllm/v1/attention/backends/mla/cutlass_mla.py
+1
-1
No files found.
vllm/entrypoints/constants.py
0 → 100644
View file @
9dbb4c8d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared constants for vLLM entrypoints.
"""
# HTTP header limits for h11 parser
# These constants help mitigate header abuse attacks
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
=
4194304
# 4 MB
H11_MAX_HEADER_COUNT_DEFAULT
=
256
vllm/entrypoints/launcher.py
View file @
9dbb4c8d
...
...
@@ -14,6 +14,8 @@ from vllm import envs
from
vllm.engine.async_llm_engine
import
AsyncEngineDeadError
from
vllm.engine.multiprocessing
import
MQEngineDeadError
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.constants
import
(
H11_MAX_HEADER_COUNT_DEFAULT
,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
)
from
vllm.entrypoints.ssl
import
SSLCertRefresher
from
vllm.logger
import
init_logger
from
vllm.utils
import
find_process_using_port
...
...
@@ -26,6 +28,11 @@ async def serve_http(app: FastAPI,
sock
:
Optional
[
socket
.
socket
],
enable_ssl_refresh
:
bool
=
False
,
**
uvicorn_kwargs
:
Any
):
"""
Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
options. Supports http header limits via h11_max_incomplete_event_size and
h11_max_header_count.
"""
logger
.
info
(
"Available routes are:"
)
for
route
in
app
.
routes
:
methods
=
getattr
(
route
,
"methods"
,
None
)
...
...
@@ -36,7 +43,21 @@ async def serve_http(app: FastAPI,
logger
.
info
(
"Route: %s, Methods: %s"
,
path
,
', '
.
join
(
methods
))
# Extract header limit options if present
h11_max_incomplete_event_size
=
uvicorn_kwargs
.
pop
(
"h11_max_incomplete_event_size"
,
None
)
h11_max_header_count
=
uvicorn_kwargs
.
pop
(
"h11_max_header_count"
,
None
)
# Set safe defaults if not provided
if
h11_max_incomplete_event_size
is
None
:
h11_max_incomplete_event_size
=
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
if
h11_max_header_count
is
None
:
h11_max_header_count
=
H11_MAX_HEADER_COUNT_DEFAULT
config
=
uvicorn
.
Config
(
app
,
**
uvicorn_kwargs
)
# Set header limits
config
.
h11_max_incomplete_event_size
=
h11_max_incomplete_event_size
config
.
h11_max_header_count
=
h11_max_header_count
config
.
load
()
server
=
uvicorn
.
Server
(
config
)
_add_shutdown_handlers
(
app
,
server
)
...
...
vllm/entrypoints/openai/api_server.py
View file @
9dbb4c8d
...
...
@@ -1894,6 +1894,8 @@ async def run_server_worker(listen_address,
ssl_certfile
=
args
.
ssl_certfile
,
ssl_ca_certs
=
args
.
ssl_ca_certs
,
ssl_cert_reqs
=
args
.
ssl_cert_reqs
,
h11_max_incomplete_event_size
=
args
.
h11_max_incomplete_event_size
,
h11_max_header_count
=
args
.
h11_max_header_count
,
**
uvicorn_kwargs
,
)
...
...
vllm/entrypoints/openai/cli_args.py
View file @
9dbb4c8d
...
...
@@ -20,6 +20,8 @@ from vllm.config import config
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
optional_type
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
validate_chat_template
)
from
vllm.entrypoints.constants
import
(
H11_MAX_HEADER_COUNT_DEFAULT
,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
)
from
vllm.entrypoints.openai.serving_models
import
LoRAModulePath
from
vllm.entrypoints.openai.tool_parsers
import
ToolParserManager
from
vllm.logger
import
init_logger
...
...
@@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
enable_log_outputs
:
bool
=
False
"""If set to True, enable logging of model outputs (generations)
in addition to the input logging that is enabled by default."""
h11_max_incomplete_event_size
:
int
=
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
h11_max_header_count
:
int
=
H11_MAX_HEADER_COUNT_DEFAULT
"""Maximum number of HTTP headers allowed in a request for h11 parser.
Helps mitigate header abuse. Default: 256."""
@
staticmethod
def
add_cli_args
(
parser
:
FlexibleArgumentParser
)
->
FlexibleArgumentParser
:
...
...
vllm/entrypoints/openai/protocol.py
View file @
9dbb4c8d
...
...
@@ -20,7 +20,15 @@ from openai.types.chat.chat_completion_message import (
from
openai.types.responses
import
(
ResponseFunctionToolCall
,
ResponseInputItemParam
,
ResponseOutputItem
,
ResponsePrompt
,
ResponseReasoningItem
,
ResponseStatus
,
ResponseTextConfig
)
ResponseStatus
)
# Backward compatibility for OpenAI client versions
try
:
# For older openai versions (< 1.100.0)
from
openai.types.responses
import
ResponseTextConfig
except
ImportError
:
# For newer openai versions (>= 1.100.0)
from
openai.types.responses
import
(
ResponseFormatTextConfig
as
ResponseTextConfig
)
from
openai.types.responses.response
import
ToolChoice
from
openai.types.responses.tool
import
Tool
from
openai.types.shared
import
Metadata
,
Reasoning
...
...
vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
View file @
9dbb4c8d
...
...
@@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser):
"valid JSON object in tool '%s', will try other "
"methods to parse it."
,
param_value
,
param_name
,
func_name
)
try
:
converted_value
=
eval
(
param_value
)
return
converted_value
except
Exception
:
logger
.
warning
(
"Parsed value '%s' of parameter '%s' cannot be "
"converted via Python `eval()` in tool '%s', "
"degenerating to string."
,
param_value
,
param_name
,
func_name
)
"Parameter '%s' has unknown type '%s'. "
"The value will be treated as a string."
,
param_name
,
param_type
)
return
param_value
# Extract function name
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
9dbb4c8d
...
...
@@ -805,10 +805,10 @@ class FusedMoE(CustomOp):
self
.
global_num_experts
=
num_experts
+
num_redundant_experts
# we padding globally so EP buffer allocation works
if
(
quant_config
and
quant_config
.
get_name
()
==
"mxfp4"
and
(
current_platform
.
is_rocm
()
or
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
or
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
)
):
if
quant_config
and
quant_config
.
get_name
()
==
"mxfp4"
:
from
vllm.model_executor.layers.quantization.mxfp4
import
(
# noqa: E501
should_use_flashinfer_mxfp4
)
if
current_platform
.
is_rocm
()
or
should_use_flashinfer_mxfp4
(
):
hidden_size
=
round_up
(
hidden_size
,
256
)
# For smuggling this layer into the fused moe custom op
...
...
vllm/model_executor/layers/quantization/mxfp4.py
View file @
9dbb4c8d
...
...
@@ -6,6 +6,7 @@ import torch
from
torch.nn.parameter
import
Parameter
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
(
FusedMoE
,
FusedMoEConfig
,
FusedMoEMethodBase
)
from
vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe
import
(
...
...
@@ -26,12 +27,38 @@ from vllm.platforms import current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.utils
import
(
has_triton_kernels
,
is_torch_equal_or_newer
,
next_power_of_2
,
round_up
)
from
vllm.utils.flashinfer
import
has_flashinfer
if
(
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
or
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
):
# from flashinfer.fused_moe import cutlass_fused_moe
from
flashinfer
import
(
mxfp8_quantize
,
shuffle_matrix_a
,
shuffle_matrix_sf_a
,
trtllm_fp4_block_scale_moe
)
logger
=
init_logger
(
__name__
)
def
_should_use_flashinfer_mxfp4_bf16
():
"""Determine if FlashInfer MXFP4 BF16 should be used."""
# If explicitly set, respect the setting
if
envs
.
is_set
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"
):
return
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
# Enable by default on SM100 if MXFP8 is not explicitly enabled
if
(
current_platform
.
is_device_capability
(
100
)
and
has_flashinfer
()
and
not
envs
.
is_set
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"
)):
logger
.
info_once
(
"Enabling FlashInfer MXFP4 BF16 backend by default for Blackwell. "
"For faster performance, consider setting "
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, "
"though this may impact accuracy."
)
return
True
return
False
def
_should_use_flashinfer_mxfp4_mxfp8
():
"""Determine if FlashInfer MXFP4 MXFP8 should be used."""
return
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
def
should_use_flashinfer_mxfp4
():
return
(
_should_use_flashinfer_mxfp4_mxfp8
()
or
_should_use_flashinfer_mxfp4_bf16
())
class
Mxfp4Config
(
QuantizationConfig
):
...
...
@@ -87,12 +114,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
self
.
moe
=
moe
self
.
use_marlin
=
self
.
_should_use_marlin
()
if
current_platform
.
is_device_capability
(
100
)
and
not
has_flashinfer
():
logger
.
warning_once
(
"MXFP4 MoE is enabled on Blackwell but FlashInfer "
"is not available. This may result in degraded performance. "
"Please `pip install vllm[flashinfer]` for best results."
)
def
_should_use_marlin
(
self
):
if
envs
.
VLLM_MXFP4_USE_MARLIN
is
not
None
:
return
envs
.
VLLM_MXFP4_USE_MARLIN
if
current_platform
.
is_cuda
()
and
\
not
current_platform
.
ha
s_device_capability
(
100
):
if
not
current_platform
.
i
s_device_capability
(
90
):
not
current_platform
.
i
s_device_capability
(
100
):
if
not
current_platform
.
ha
s_device_capability
(
90
):
# marlin kernel has better performance on ampere
return
True
if
not
has_triton_kernels
():
...
...
@@ -138,8 +171,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
layer
.
hidden_size
=
hidden_size
layer
.
intermediate_size_per_partition
=
\
intermediate_size_per_partition_after_pad
elif
(
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
or
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
):
elif
should_use_flashinfer_mxfp4
():
# pad the intermediate size to be a multiple of 2 * mxfp4_block
# for to hold non-uniform sharded tensor as well as swizzling
# other padding to increase performance
...
...
@@ -230,8 +262,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
def
process_weights_after_loading
(
self
,
layer
):
if
self
.
use_marlin
:
prepare_moe_fp4_layer_for_marlin
(
layer
)
elif
(
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
or
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
):
elif
should_use_flashinfer_mxfp4
():
from
flashinfer
import
shuffle_matrix_a
,
shuffle_matrix_sf_a
layer
.
gemm1_alpha
=
Parameter
(
torch
.
tensor
(
[
1.702
]
*
self
.
num_experts
,
dtype
=
torch
.
float32
).
cuda
(),
requires_grad
=
False
)
...
...
@@ -478,11 +510,11 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
logical_replica_count
),
(
"MXFP4 are not supported with this configuration."
)
if
(
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
or
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
):
if
should_use_flashinfer_mxfp4
():
from
flashinfer
import
mxfp8_quantize
,
trtllm_fp4_block_scale_moe
assert
not
self
.
moe
.
use_ep
,
(
"EP is not supported for flashinfer mxfp4 moe backend yet."
)
if
envs
.
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
:
if
_should_use_flashinfer_mxfp4_bf16
()
:
assert
x
.
dtype
==
torch
.
bfloat16
x_quant
=
x
x_scale
=
None
...
...
vllm/v1/attention/backends/mla/cutlass_mla.py
View file @
9dbb4c8d
...
...
@@ -21,7 +21,7 @@ logger = init_logger(__name__)
class
CutlassMLAMetadataBuilder
(
MLACommonMetadataBuilder
[
MLACommonMetadata
]):
# enable full CUDA Graph support for decode-only capture
attn_
cudagraph_support
:
ClassVar
[
cudagraph_support
:
ClassVar
[
AttentionCGSupport
]
=
AttentionCGSupport
.
UNIFORM_SINGLE_TOKEN_DECODE
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment