Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
"vscode:/vscode.git/clone" did not exist on "b522c4476fcdaee254fe40fefb354a4908fccac5"
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
222 additions
and
120 deletions
+222
-120
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+1
-1
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen2_5_omni/only_thinker.py
+1
-1
examples/offline_inference/qwen3_omni/only_thinker.py
examples/offline_inference/qwen3_omni/only_thinker.py
+1
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+28
-1
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+3
-3
examples/online_serving/run_cluster.sh
examples/online_serving/run_cluster.sh
+34
-26
examples/online_serving/structured_outputs/structured_outputs.py
...s/online_serving/structured_outputs/structured_outputs.py
+1
-1
examples/pooling/plugin/prithvi_geospatial_mae_client.py
examples/pooling/plugin/prithvi_geospatial_mae_client.py
+1
-1
examples/pooling/pooling/vision_language_pooling.py
examples/pooling/pooling/vision_language_pooling.py
+3
-3
examples/pooling/score/offline_reranker.py
examples/pooling/score/offline_reranker.py
+0
-0
examples/pooling/score/openai_reranker.py
examples/pooling/score/openai_reranker.py
+0
-0
mkdocs.yaml
mkdocs.yaml
+1
-0
requirements/common.txt
requirements/common.txt
+2
-1
requirements/rocm-test.txt
requirements/rocm-test.txt
+1
-1
tests/benchmarks/test_param_sweep.py
tests/benchmarks/test_param_sweep.py
+0
-8
tests/compile/distributed/test_fusions_e2e.py
tests/compile/distributed/test_fusions_e2e.py
+106
-2
tests/compile/test_config.py
tests/compile/test_config.py
+1
-62
tests/compile/test_dynamic_shapes_compilation.py
tests/compile/test_dynamic_shapes_compilation.py
+8
-2
tests/compile/test_functionalization.py
tests/compile/test_functionalization.py
+1
-4
tests/conftest.py
tests/conftest.py
+29
-2
No files found.
examples/offline_inference/encoder_decoder_multimodal.py
View file @
a3f8d5dd
...
@@ -77,7 +77,7 @@ def parse_args():
...
@@ -77,7 +77,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--seed"
,
"--seed"
,
type
=
int
,
type
=
int
,
default
=
None
,
default
=
0
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
)
)
return
parser
.
parse_args
()
return
parser
.
parse_args
()
...
...
examples/offline_inference/qwen2_5_omni/only_thinker.py
View file @
a3f8d5dd
...
@@ -158,7 +158,7 @@ def parse_args():
...
@@ -158,7 +158,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--seed"
,
"--seed"
,
type
=
int
,
type
=
int
,
default
=
None
,
default
=
0
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
)
)
...
...
examples/offline_inference/qwen3_omni/only_thinker.py
View file @
a3f8d5dd
...
@@ -158,7 +158,7 @@ def parse_args():
...
@@ -158,7 +158,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--seed"
,
"--seed"
,
type
=
int
,
type
=
int
,
default
=
None
,
default
=
0
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
)
)
...
...
examples/offline_inference/vision_language.py
View file @
a3f8d5dd
...
@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
...
@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
)
)
def
run_bagel
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"ByteDance-Seed/BAGEL-7B-MoT"
engine_args
=
EngineArgs
(
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
(
f
"<|im_start|>user
\n
<|image_pad|>
\n
{
question
}
<|im_end|>
\n
"
f
"<|im_start|>assistant
\n
"
)
for
question
in
questions
]
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# BLIP-2
# BLIP-2
def
run_blip2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
def
run_blip2
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
...
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
...
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_example_map
=
{
model_example_map
=
{
"aria"
:
run_aria
,
"aria"
:
run_aria
,
"aya_vision"
:
run_aya_vision
,
"aya_vision"
:
run_aya_vision
,
"bagel"
:
run_bagel
,
"bee"
:
run_bee
,
"bee"
:
run_bee
,
"blip-2"
:
run_blip2
,
"blip-2"
:
run_blip2
,
"chameleon"
:
run_chameleon
,
"chameleon"
:
run_chameleon
,
...
@@ -2031,7 +2058,7 @@ def parse_args():
...
@@ -2031,7 +2058,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--seed"
,
"--seed"
,
type
=
int
,
type
=
int
,
default
=
None
,
default
=
0
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
)
)
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
a3f8d5dd
...
@@ -1382,7 +1382,7 @@ def run_generate(
...
@@ -1382,7 +1382,7 @@ def run_generate(
model
,
model
,
question
:
str
,
question
:
str
,
image_urls
:
list
[
str
],
image_urls
:
list
[
str
],
seed
:
int
|
None
,
seed
:
int
,
tensor_parallel_size
:
int
|
None
,
tensor_parallel_size
:
int
|
None
,
):
):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
...
@@ -1416,7 +1416,7 @@ def run_chat(
...
@@ -1416,7 +1416,7 @@ def run_chat(
model
:
str
,
model
:
str
,
question
:
str
,
question
:
str
,
image_urls
:
list
[
str
],
image_urls
:
list
[
str
],
seed
:
int
|
None
,
seed
:
int
,
tensor_parallel_size
:
int
|
None
,
tensor_parallel_size
:
int
|
None
,
):
):
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
req_data
=
model_example_map
[
model
](
question
,
image_urls
)
...
@@ -1494,7 +1494,7 @@ def parse_args():
...
@@ -1494,7 +1494,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--seed"
,
"--seed"
,
type
=
int
,
type
=
int
,
default
=
None
,
default
=
0
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
...
...
examples/online_serving/run_cluster.sh
View file @
a3f8d5dd
...
@@ -21,7 +21,7 @@
...
@@ -21,7 +21,7 @@
# --worker \
# --worker \
# /abs/path/to/huggingface/cache \
# /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<worker_node_ip>
# -e VLLM_HOST_IP=<worker_node_ip>
#
#
# Each worker requires a unique VLLM_HOST_IP value.
# Each worker requires a unique VLLM_HOST_IP value.
# Keep each terminal session open. Closing a session stops the associated Ray
# Keep each terminal session open. Closing a session stops the associated Ray
# node and thereby shuts down the entire cluster.
# node and thereby shuts down the entire cluster.
...
@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
...
@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
exit
1
exit
1
fi
fi
# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=...").
VLLM_HOST_IP
=
""
for
((
i
=
0
;
i <
${#
ADDITIONAL_ARGS
[@]
}
;
i++
))
;
do
arg
=
"
${
ADDITIONAL_ARGS
[
$i
]
}
"
case
"
${
arg
}
"
in
-e
)
next
=
"
${
ADDITIONAL_ARGS
[
$((
i
+
1
))
]
:-}
"
if
[[
"
${
next
}
"
==
VLLM_HOST_IP
=
*
]]
;
then
VLLM_HOST_IP
=
"
${
next
#VLLM_HOST_IP=
}
"
break
fi
;;
-eVLLM_HOST_IP
=
*
|
VLLM_HOST_IP
=
*
)
VLLM_HOST_IP
=
"
${
arg
#*=
}
"
break
;;
esac
done
# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
if
[[
"
${
NODE_TYPE
}
"
==
"--head"
&&
-n
"
${
VLLM_HOST_IP
}
"
]]
;
then
if
[[
"
${
VLLM_HOST_IP
}
"
!=
"
${
HEAD_NODE_ADDRESS
}
"
]]
;
then
echo
"Warning: VLLM_HOST_IP (
${
VLLM_HOST_IP
}
) differs from head_node_ip (
${
HEAD_NODE_ADDRESS
}
)."
echo
"Using VLLM_HOST_IP as the head node address."
HEAD_NODE_ADDRESS
=
"
${
VLLM_HOST_IP
}
"
fi
fi
# Generate a unique container name with random suffix.
# Generate a unique container name with random suffix.
# Docker container names must be unique on each host.
# Docker container names must be unique on each host.
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
...
@@ -74,36 +102,17 @@ cleanup() {
...
@@ -74,36 +102,17 @@ cleanup() {
trap
cleanup EXIT
trap
cleanup EXIT
# Build the Ray start command based on the node role.
# Build the Ray start command based on the node role.
# The head node manages the cluster and accepts connections on port 6379,
# The head node manages the cluster and accepts connections on port 6379,
# while workers connect to the head's address.
# while workers connect to the head's address.
RAY_START_CMD
=
"ray start --block"
RAY_START_CMD
=
"ray start --block"
if
[
"
${
NODE_TYPE
}
"
==
"--head"
]
;
then
if
[
"
${
NODE_TYPE
}
"
==
"--head"
]
;
then
RAY_START_CMD+
=
" --head --port=6379"
RAY_START_CMD+
=
" --head
--node-ip-address=
${
HEAD_NODE_ADDRESS
}
--port=6379"
else
else
RAY_START_CMD+
=
" --address=
${
HEAD_NODE_ADDRESS
}
:6379"
fi
# Parse VLLM_HOST_IP from additional args if present.
RAY_START_CMD+
=
" --address=
${
HEAD_NODE_ADDRESS
}
:6379"
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
if
[
-n
"
${
VLLM_HOST_IP
}
"
]
;
then
VLLM_HOST_IP
=
""
RAY_START_CMD+
=
" --node-ip-address=
${
VLLM_HOST_IP
}
"
for
arg
in
"
${
ADDITIONAL_ARGS
[@]
}
"
;
do
if
[[
$arg
==
"-e"
]]
;
then
continue
fi
if
[[
$arg
==
VLLM_HOST_IP
=
*
]]
;
then
VLLM_HOST_IP
=
"
${
arg
#VLLM_HOST_IP=
}
"
break
fi
fi
done
# Build Ray IP environment variables if VLLM_HOST_IP is set.
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
RAY_IP_VARS
=()
if
[
-n
"
${
VLLM_HOST_IP
}
"
]
;
then
RAY_IP_VARS
=(
-e
"RAY_NODE_IP_ADDRESS=
${
VLLM_HOST_IP
}
"
-e
"RAY_OVERRIDE_NODE_IP_ADDRESS=
${
VLLM_HOST_IP
}
"
)
fi
fi
# Launch the container with the assembled parameters.
# Launch the container with the assembled parameters.
...
@@ -118,6 +127,5 @@ docker run \
...
@@ -118,6 +127,5 @@ docker run \
--shm-size
10.24g
\
--shm-size
10.24g
\
--gpus
all
\
--gpus
all
\
-v
"
${
PATH_TO_HF_HOME
}
:/root/.cache/huggingface"
\
-v
"
${
PATH_TO_HF_HOME
}
:/root/.cache/huggingface"
\
"
${
RAY_IP_VARS
[@]
}
"
\
"
${
ADDITIONAL_ARGS
[@]
}
"
\
"
${
ADDITIONAL_ARGS
[@]
}
"
\
"
${
DOCKER_IMAGE
}
"
-c
"
${
RAY_START_CMD
}
"
"
${
DOCKER_IMAGE
}
"
-c
"
${
RAY_START_CMD
}
"
examples/online_serving/structured_outputs/structured_outputs.py
View file @
a3f8d5dd
...
@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
...
@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"messages"
:
[
"messages"
:
[
{
{
"role"
:
"user"
,
"role"
:
"user"
,
"content"
:
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
,
"content"
:
"Generate an SQL query to show the 'username' and 'email'
from the 'users' table."
,
}
}
],
],
"extra_body"
:
{
"extra_body"
:
{
...
...
examples/pooling/plugin/prithvi_geospatial_mae_client.py
View file @
a3f8d5dd
...
@@ -16,7 +16,7 @@ import requests
...
@@ -16,7 +16,7 @@ import requests
# - start vllm in serving mode with the below args
# - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch
# --model-impl terratorch
#
--task embed
--trust-remote-code
# --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin terratorch_segmentation
# --io-processor-plugin terratorch_segmentation
# --enable-mm-embeds
# --enable-mm-embeds
...
...
examples/pooling/pooling/vision_language_pooling.py
View file @
a3f8d5dd
...
@@ -305,7 +305,7 @@ def get_query(modality: QueryModality):
...
@@ -305,7 +305,7 @@ def get_query(modality: QueryModality):
raise
ValueError
(
msg
)
raise
ValueError
(
msg
)
def
run_encode
(
model
:
str
,
modality
:
QueryModality
,
seed
:
int
|
None
):
def
run_encode
(
model
:
str
,
modality
:
QueryModality
,
seed
:
int
):
query
=
get_query
(
modality
)
query
=
get_query
(
modality
)
req_data
=
model_example_map
[
model
](
query
)
req_data
=
model_example_map
[
model
](
query
)
...
@@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None):
...
@@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None):
print
(
"-"
*
50
)
print
(
"-"
*
50
)
def
run_score
(
model
:
str
,
modality
:
QueryModality
,
seed
:
int
|
None
):
def
run_score
(
model
:
str
,
modality
:
QueryModality
,
seed
:
int
):
query
=
get_query
(
modality
)
query
=
get_query
(
modality
)
req_data
=
model_example_map
[
model
](
query
)
req_data
=
model_example_map
[
model
](
query
)
...
@@ -390,7 +390,7 @@ def parse_args():
...
@@ -390,7 +390,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--seed"
,
"--seed"
,
type
=
int
,
type
=
int
,
default
=
None
,
default
=
0
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
help
=
"Set the seed when initializing `vllm.LLM`."
,
)
)
return
parser
.
parse_args
()
return
parser
.
parse_args
()
...
...
examples/pooling/score/
qwen3
_reranker.py
→
examples/pooling/score/
offline
_reranker.py
View file @
a3f8d5dd
File moved
examples/pooling/score/
jina
ai_rerank
_client
.py
→
examples/pooling/score/
open
ai_rerank
er
.py
View file @
a3f8d5dd
File moved
mkdocs.yaml
View file @
a3f8d5dd
...
@@ -51,6 +51,7 @@ hooks:
...
@@ -51,6 +51,7 @@ hooks:
-
docs/mkdocs/hooks/remove_announcement.py
-
docs/mkdocs/hooks/remove_announcement.py
-
docs/mkdocs/hooks/generate_examples.py
-
docs/mkdocs/hooks/generate_examples.py
-
docs/mkdocs/hooks/generate_argparse.py
-
docs/mkdocs/hooks/generate_argparse.py
-
docs/mkdocs/hooks/generate_metrics.py
-
docs/mkdocs/hooks/url_schemes.py
-
docs/mkdocs/hooks/url_schemes.py
plugins
:
plugins
:
...
...
requirements/common.txt
View file @
a3f8d5dd
...
@@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser
...
@@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
anthropic == 0.71.0
model-hosting-container-standards >= 0.1.9, < 1.0.0
model-hosting-container-standards >= 0.1.9, < 1.0.0
\ No newline at end of file
mcp
\ No newline at end of file
requirements/rocm-test.txt
View file @
a3f8d5dd
...
@@ -75,7 +75,7 @@ torchgeo==0.7.0
...
@@ -75,7 +75,7 @@ torchgeo==0.7.0
mteb==2.1.2
mteb==2.1.2
# Data processing
# Data processing
xgrammar
==0.1.27
xgrammar
@ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling
# Test async scheduling
# Utilities
# Utilities
...
...
tests/benchmarks/test_param_sweep.py
View file @
a3f8d5dd
...
@@ -23,14 +23,6 @@ class TestParameterSweepItem:
...
@@ -23,14 +23,6 @@ class TestParameterSweepItem:
{
"compilation_config.use_inductor_graph_partition"
:
True
},
{
"compilation_config.use_inductor_graph_partition"
:
True
},
"--compilation-config.use_inductor_graph_partition=true"
,
"--compilation-config.use_inductor_graph_partition=true"
,
),
),
(
{
"compilation_config.use_inductor"
:
False
},
"--compilation-config.use_inductor=false"
,
),
(
{
"compilation_config.use_inductor"
:
True
},
"--compilation-config.use_inductor=true"
,
),
],
],
)
)
def
test_nested_boolean_params
(
self
,
input_dict
,
expected
):
def
test_nested_boolean_params
(
self
,
input_dict
,
expected
):
...
...
tests/compile/distributed/test_fusions_e2e.py
View file @
a3f8d5dd
...
@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
...
@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
from
...utils
import
flat_product
,
multi_gpu_test
from
...utils
import
flat_product
,
multi_gpu_test
is_blackwell
=
lambda
:
current_platform
.
is_device_capability
(
100
)
is_blackwell
=
lambda
:
current_platform
.
is_device_capability
_family
(
100
)
"""Are we running on Blackwell, a lot of tests depend on it"""
"""Are we running on Blackwell, a lot of tests depend on it"""
class
Matches
(
NamedTuple
):
class
Matches
(
NamedTuple
):
attention_fusion
:
int
=
0
attention_fusion
:
int
=
0
allreduce_fusion
:
int
=
0
allreduce_fusion
:
int
=
0
rms_quant_norm_fusion
:
int
=
0
sequence_parallel
:
int
=
0
sequence_parallel
:
int
=
0
async_tp
:
int
=
0
async_tp
:
int
=
0
...
@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple):
...
@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple):
MODELS_FP8
:
list
[
ModelBackendTestCase
]
=
[]
MODELS_FP8
:
list
[
ModelBackendTestCase
]
=
[]
MODELS_FP4
:
list
[
ModelBackendTestCase
]
=
[]
MODELS_FP4
:
list
[
ModelBackendTestCase
]
=
[]
MODELS_GROUP_FP8
:
list
[
ModelBackendTestCase
]
=
[]
MODELS
:
list
[
ModelBackendTestCase
]
=
[]
# tp-only
MODELS
:
list
[
ModelBackendTestCase
]
=
[]
# tp-only
if
current_platform
.
is_cuda
():
if
current_platform
.
is_cuda
():
...
@@ -138,6 +140,17 @@ elif current_platform.is_rocm():
...
@@ -138,6 +140,17 @@ elif current_platform.is_rocm():
CUSTOM_OPS_FP8
=
[
"-quant_fp8"
,
"+quant_fp8"
]
CUSTOM_OPS_FP8
=
[
"-quant_fp8"
,
"+quant_fp8"
]
def
has_cuda_graph_wrapper_metadata
()
->
bool
:
from
importlib
import
import_module
try
:
module
=
import_module
(
"torch._inductor.utils"
)
module
.
CUDAGraphWrapperMetadata
# noqa B018
except
AttributeError
:
return
False
return
True
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name, model_kwargs, backend, matches, custom_ops"
,
"model_name, model_kwargs, backend, matches, custom_ops"
,
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
...
@@ -145,7 +158,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
...
@@ -145,7 +158,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
# quant_fp4 only has the custom impl
# quant_fp4 only has the custom impl
+
list
(
flat_product
(
MODELS_FP4
,
[
""
])),
+
list
(
flat_product
(
MODELS_FP4
,
[
""
])),
)
)
@
pytest
.
mark
.
parametrize
(
"inductor_graph_partition"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"inductor_graph_partition"
,
[
pytest
.
param
(
True
,
marks
=
pytest
.
mark
.
skipif
(
not
has_cuda_graph_wrapper_metadata
(),
reason
=
"This test requires"
"torch._inductor.utils.CUDAGraphWrapperMetadata to run"
,
),
),
False
,
],
)
def
test_attn_quant
(
def
test_attn_quant
(
model_name
:
str
,
model_name
:
str
,
model_kwargs
:
dict
[
str
,
Any
],
model_kwargs
:
dict
[
str
,
Any
],
...
@@ -474,3 +500,81 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
...
@@ -474,3 +500,81 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
compilation_config
.
compile_ranges_split_points
=
(
compilation_config
.
compile_ranges_split_points
=
(
llm
.
llm_engine
.
vllm_config
.
compilation_config
.
compile_ranges_split_points
llm
.
llm_engine
.
vllm_config
.
compilation_config
.
compile_ranges_split_points
)
)
if
current_platform
.
is_cuda
():
MODELS_GROUP_FP8
=
[
ModelBackendTestCase
(
model_name
=
"Qwen/Qwen3-30B-A3B-FP8"
,
model_kwargs
=
dict
(
max_model_len
=
1024
,
kv_cache_dtype
=
"fp8"
),
backend
=
AttentionBackendEnum
.
TRITON_ATTN
,
matches
=
Matches
(
rms_quant_norm_fusion
=
48
,
),
),
]
CUSTOM_OPS_QUANT_RMS_NORM
=
[
"+quant_fp8,+rms_norm"
]
@
pytest
.
mark
.
parametrize
(
"model_name, model_kwargs, backend, matches, custom_ops"
,
# Test rms norm+group quant_fp8 fusion
list
[
tuple
[
Any
,
...]](
flat_product
(
MODELS_GROUP_FP8
,
CUSTOM_OPS_QUANT_RMS_NORM
)),
)
@
pytest
.
mark
.
parametrize
(
"inductor_graph_partition"
,
[
True
,
False
])
# TODO: remove skip after we fix the fusion thoroughly
@
pytest
.
mark
.
skipif
(
is_blackwell
(),
reason
=
"Temporarily disabled on Blackwell"
)
def
test_rms_group_quant
(
model_name
:
str
,
model_kwargs
:
dict
[
str
,
Any
],
backend
:
AttentionBackendEnum
,
matches
:
Matches
,
custom_ops
:
str
,
inductor_graph_partition
:
bool
,
caplog_mp_spawn
,
monkeypatch
,
):
if
inductor_graph_partition
and
not
is_torch_equal_or_newer
(
"2.9.0.dev"
):
pytest
.
skip
(
"Inductor graph partition requires torch>=2.9"
)
custom_ops_list
=
custom_ops
.
split
(
","
)
if
custom_ops
else
[]
if
inductor_graph_partition
:
mode
=
CUDAGraphMode
.
FULL_AND_PIECEWISE
splitting_ops
:
list
[
str
]
|
None
=
None
else
:
mode
=
CUDAGraphMode
.
FULL_DECODE_ONLY
splitting_ops
=
[]
# Disable, compile cache to make sure custom passes run.
# Otherwise, we can't verify fusion happened through the logs.
monkeypatch
.
setenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"1"
)
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
.
name
)
compilation_config
=
CompilationConfig
(
# Testing properties
custom_ops
=
custom_ops_list
,
use_inductor_graph_partition
=
inductor_graph_partition
,
cudagraph_mode
=
mode
,
splitting_ops
=
splitting_ops
,
# Common
mode
=
CompilationMode
.
VLLM_COMPILE
,
pass_config
=
PassConfig
(
eliminate_noops
=
True
,
fuse_norm_quant
=
True
),
# Inductor caches custom passes by default as well via uuid
inductor_compile_config
=
{
"force_disable_caches"
:
True
},
)
with
caplog_mp_spawn
(
logging
.
DEBUG
)
as
log_holder
:
run_model
(
compilation_config
,
model_name
,
**
model_kwargs
)
log_matches
=
re
.
findall
(
r
"\[fusion.py:\d+] Replaced (\d+) patterns"
,
log_holder
.
text
,
)
assert
len
(
log_matches
)
==
1
,
log_holder
.
text
assert
int
(
log_matches
[
0
])
==
matches
.
rms_quant_norm_fusion
tests/compile/test_config.py
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
copy
import
copy
import
logging
from
contextlib
import
nullcontext
from
contextlib
import
nullcontext
from
unittest.mock
import
patch
from
unittest.mock
import
patch
...
@@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
...
@@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from
vllm.config
import
CompilationConfig
,
CUDAGraphMode
,
ParallelConfig
,
VllmConfig
from
vllm.config
import
CompilationConfig
,
CUDAGraphMode
,
ParallelConfig
,
VllmConfig
from
vllm.config.compilation
import
CompilationMode
,
PassConfig
from
vllm.config.compilation
import
CompilationMode
,
PassConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.logger
import
_print_warning_once
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
_is_torch_equal_or_newer
from
vllm.utils.torch_utils
import
_is_torch_equal_or_newer
...
@@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
...
@@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
),
),
compilation_config
=
CompilationConfig
(
compilation_config
=
CompilationConfig
(
mode
=
CompilationMode
.
VLLM_COMPILE
,
mode
=
CompilationMode
.
VLLM_COMPILE
,
pass_config
=
{
"
enabl
e_attn_
fusion
"
:
True
,
"e
nabl
e_noop"
:
True
},
pass_config
=
{
"
fus
e_attn_
quant
"
:
True
,
"e
liminat
e_noop
s
"
:
True
},
custom_ops
=
[
"+quant_fp8"
],
custom_ops
=
[
"+quant_fp8"
],
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
,
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
,
),
),
...
@@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init(
...
@@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init(
vllm_config
.
compilation_config
.
max_cudagraph_capture_size
vllm_config
.
compilation_config
.
max_cudagraph_capture_size
==
expected_max_size
==
expected_max_size
)
)
def
test_pass_config_deprecation
(
caplog_vllm
):
caplog_vllm
.
set_level
(
logging
.
WARNING
)
# Clear cache to ensure warnings are re-issued
_print_warning_once
.
cache_clear
()
# Test enable_fusion -> fuse_norm_quant, fuse_act_quant
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_fusion
=
True
)
assert
"enable_fusion is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_norm_quant
is
True
assert
config
.
fuse_act_quant
is
True
assert
config
.
enable_fusion
is
True
# Test enable_attn_fusion -> fuse_attn_quant
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_attn_fusion
=
True
)
assert
"enable_attn_fusion is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_attn_quant
is
True
assert
config
.
enable_attn_fusion
is
True
# Test enable_noop -> eliminate_noops
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_noop
=
True
)
assert
"enable_noop is deprecated"
in
caplog_vllm
.
text
assert
config
.
eliminate_noops
is
True
assert
config
.
enable_noop
is
True
# Test enable_sequence_parallelism -> enable_sp
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_sequence_parallelism
=
True
)
assert
"enable_sequence_parallelism is deprecated"
in
caplog_vllm
.
text
assert
config
.
enable_sp
is
True
assert
config
.
enable_sequence_parallelism
is
True
# Test enable_async_tp -> fuse_gemm_comms
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_async_tp
=
True
)
assert
"enable_async_tp is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_gemm_comms
is
True
assert
config
.
enable_async_tp
is
True
# Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
caplog_vllm
.
clear
()
config
=
PassConfig
(
enable_fi_allreduce_fusion
=
True
)
assert
"enable_fi_allreduce_fusion is deprecated"
in
caplog_vllm
.
text
assert
config
.
fuse_allreduce_rms
is
True
assert
config
.
enable_fi_allreduce_fusion
is
True
# Test hash consistency
config_old
=
PassConfig
(
enable_fusion
=
True
)
config_new
=
PassConfig
(
fuse_norm_quant
=
True
,
fuse_act_quant
=
True
)
assert
config_old
.
compute_hash
()
==
config_new
.
compute_hash
()
config_old
=
PassConfig
(
enable_async_tp
=
True
)
config_new
=
PassConfig
(
fuse_gemm_comms
=
True
)
assert
config_old
.
compute_hash
()
==
config_new
.
compute_hash
()
tests/compile/test_dynamic_shapes_compilation.py
View file @
a3f8d5dd
...
@@ -36,7 +36,7 @@ def get_test_models():
...
@@ -36,7 +36,7 @@ def get_test_models():
DynamicShapesType
.
BACKED_SIZE_OBLIVIOUS
,
DynamicShapesType
.
BACKED_SIZE_OBLIVIOUS
,
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"use_aot_compile"
,
[
"0"
])
@
pytest
.
mark
.
parametrize
(
"use_aot_compile"
,
[
"0"
,
"1"
])
@
pytest
.
mark
.
parametrize
(
"use_bytecode_hook"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bytecode_hook"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"evaluate_guards"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"evaluate_guards"
,
[
False
,
True
])
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
...
@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
...
@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
if
use_bytecode_hook
and
shapes_type
==
DynamicShapesType
.
UNBACKED
:
if
use_bytecode_hook
and
shapes_type
==
DynamicShapesType
.
UNBACKED
:
pytest
.
skip
(
"UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0"
)
pytest
.
skip
(
"UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0"
)
if
evaluate_guards
and
shapes_type
==
DynamicShapesType
.
UNBACKED
:
pytest
.
skip
(
"unbacked dynamic shapes do not add guards"
)
if
evaluate_guards
and
use_aot_compile
:
pytest
.
skip
(
"evaluate_guards requires use_aot_compile=0"
)
monkeypatch
.
setenv
(
"VLLM_USE_AOT_COMPILE"
,
use_aot_compile
)
monkeypatch
.
setenv
(
"VLLM_USE_AOT_COMPILE"
,
use_aot_compile
)
monkeypatch
.
setenv
(
"VLLM_USE_BYTECODE_HOOK"
,
"1"
if
use_bytecode_hook
else
"0"
)
monkeypatch
.
setenv
(
"VLLM_USE_BYTECODE_HOOK"
,
"1"
if
use_bytecode_hook
else
"0"
)
...
@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
...
@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
and
dynamic_shapes_type
==
DynamicShapesType
.
BACKED
and
dynamic_shapes_type
==
DynamicShapesType
.
BACKED
and
evaluate_guards
and
evaluate_guards
):
):
pytest
.
skip
(
"evaluate_guards for backed does not work with aot_compile
=1"
)
pytest
.
skip
(
"evaluate_guards for backed does not work with aot_compile=1"
)
@
support_torch_compile
@
support_torch_compile
class
ModelWithSizeCheck
(
torch
.
nn
.
Module
):
class
ModelWithSizeCheck
(
torch
.
nn
.
Module
):
...
...
tests/compile/test_functionalization.py
View file @
a3f8d5dd
...
@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module):
...
@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module):
class
TestRotaryEmbedding
(
torch
.
nn
.
Module
):
class
TestRotaryEmbedding
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
head_dim
=
64
,
rotary_dim
=
None
,
max_position
=
2048
,
base
=
10000
):
def
__init__
(
self
,
head_dim
=
64
,
max_position
=
2048
,
base
=
10000
):
super
().
__init__
()
super
().
__init__
()
self
.
head_dim
=
head_dim
self
.
head_dim
=
head_dim
self
.
rotary_dim
=
rotary_dim
or
head_dim
self
.
rotary_emb
=
get_rope
(
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
self
.
head_dim
,
rotary_dim
=
self
.
rotary_dim
,
max_position
=
max_position
,
max_position
=
max_position
,
rope_parameters
=
{
"rope_type"
:
"default"
,
"rope_theta"
:
base
},
rope_parameters
=
{
"rope_type"
:
"default"
,
"rope_theta"
:
base
},
)
)
...
@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
...
@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position
,
max_position
=
max_position
,
rope_parameters
=
{
"rope_type"
:
"default"
,
"rope_theta"
:
base
},
rope_parameters
=
{
"rope_type"
:
"default"
,
"rope_theta"
:
base
},
)
)
...
...
tests/conftest.py
View file @
a3f8d5dd
...
@@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
...
@@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
@
pytest
.
fixture
def
workspace_init
():
"""Initialize the workspace manager for tests that need it.
This fixture initializes the workspace manager with a CUDA device
if available, and resets it after the test completes. Tests that
create a full vLLM engine should NOT use this fixture as the engine
will initialize the workspace manager itself.
"""
from
vllm.v1.worker.workspace
import
(
init_workspace_manager
,
reset_workspace_manager
,
)
if
torch
.
cuda
.
is_available
():
device
=
torch
.
device
(
"cuda:0"
)
init_workspace_manager
(
device
)
yield
reset_workspace_manager
()
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
dynamo_reset
():
def
dynamo_reset
():
yield
yield
...
@@ -681,10 +702,16 @@ class HfRunner:
...
@@ -681,10 +702,16 @@ class HfRunner:
**
kwargs
,
**
kwargs
,
)
)
# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states
=
(
getattr
(
output
,
"hidden_states"
,
None
)
or
output
.
decoder_hidden_states
)
(
(
seq_logprobs_lst
,
seq_logprobs_lst
,
output_len
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
output
.
hidden_states
,
num_logprobs
)
)
=
self
.
_hidden_states_to_logprobs
(
hidden_states
,
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
seq_ids
=
output
.
sequences
[
0
]
...
@@ -741,7 +768,7 @@ class VllmRunner:
...
@@ -741,7 +768,7 @@ class VllmRunner:
tokenizer_name
:
str
|
None
=
None
,
tokenizer_name
:
str
|
None
=
None
,
tokenizer_mode
:
str
=
"auto"
,
tokenizer_mode
:
str
=
"auto"
,
trust_remote_code
:
bool
=
True
,
trust_remote_code
:
bool
=
True
,
seed
:
int
|
None
=
0
,
seed
:
int
=
0
,
max_model_len
:
int
|
None
=
1024
,
max_model_len
:
int
|
None
=
1024
,
dtype
:
str
=
"auto"
,
dtype
:
str
=
"auto"
,
disable_log_stats
:
bool
=
True
,
disable_log_stats
:
bool
=
True
,
...
...
Prev
1
2
3
4
5
6
7
8
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment