"vscode:/vscode.git/clone" did not exist on "b522c4476fcdaee254fe40fefb354a4908fccac5"
Commit a3f8d5dd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
...@@ -77,7 +77,7 @@ def parse_args(): ...@@ -77,7 +77,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--seed", "--seed",
type=int, type=int,
default=None, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
return parser.parse_args() return parser.parse_args()
......
...@@ -158,7 +158,7 @@ def parse_args(): ...@@ -158,7 +158,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--seed", "--seed",
type=int, type=int,
default=None, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
......
...@@ -158,7 +158,7 @@ def parse_args(): ...@@ -158,7 +158,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--seed", "--seed",
type=int, type=int,
default=None, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
......
...@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData: ...@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
) )
def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the BAGEL example.

    Only the ``"image"`` modality is accepted. Every question is wrapped in a
    user/assistant chat template that carries a single ``<|image_pad|>``
    placeholder ahead of the question text.
    """
    assert modality == "image"

    bagel_engine_args = EngineArgs(
        model="ByteDance-Seed/BAGEL-7B-MoT",
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        # One multimodal item of the requested modality per prompt.
        limit_mm_per_prompt={modality: 1},
    )

    chat_prompts = []
    for question in questions:
        chat_prompts.append(
            f"<|im_start|>user\n<|image_pad|>\n{question}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )

    return ModelRequestData(engine_args=bagel_engine_args, prompts=chat_prompts)
# BLIP-2 # BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData: def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_example_map = { model_example_map = {
"aria": run_aria, "aria": run_aria,
"aya_vision": run_aya_vision, "aya_vision": run_aya_vision,
"bagel": run_bagel,
"bee": run_bee, "bee": run_bee,
"blip-2": run_blip2, "blip-2": run_blip2,
"chameleon": run_chameleon, "chameleon": run_chameleon,
...@@ -2031,7 +2058,7 @@ def parse_args(): ...@@ -2031,7 +2058,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--seed", "--seed",
type=int, type=int,
default=None, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
......
...@@ -1382,7 +1382,7 @@ def run_generate( ...@@ -1382,7 +1382,7 @@ def run_generate(
model, model,
question: str, question: str,
image_urls: list[str], image_urls: list[str],
seed: int | None, seed: int,
tensor_parallel_size: int | None, tensor_parallel_size: int | None,
): ):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
...@@ -1416,7 +1416,7 @@ def run_chat( ...@@ -1416,7 +1416,7 @@ def run_chat(
model: str, model: str,
question: str, question: str,
image_urls: list[str], image_urls: list[str],
seed: int | None, seed: int,
tensor_parallel_size: int | None, tensor_parallel_size: int | None,
): ):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
...@@ -1494,7 +1494,7 @@ def parse_args(): ...@@ -1494,7 +1494,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--seed", "--seed",
type=int, type=int,
default=None, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
parser.add_argument( parser.add_argument(
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
# --worker \ # --worker \
# /abs/path/to/huggingface/cache \ # /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<worker_node_ip> # -e VLLM_HOST_IP=<worker_node_ip>
# #
# Each worker requires a unique VLLM_HOST_IP value. # Each worker requires a unique VLLM_HOST_IP value.
# Keep each terminal session open. Closing a session stops the associated Ray # Keep each terminal session open. Closing a session stops the associated Ray
# node and thereby shuts down the entire cluster. # node and thereby shuts down the entire cluster.
...@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then ...@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
exit 1 exit 1
fi fi
# Scan ADDITIONAL_ARGS for a VLLM_HOST_IP assignment and remember its value.
# Three spellings are recognised: "-e VLLM_HOST_IP=...", "-eVLLM_HOST_IP=..."
# and a bare "VLLM_HOST_IP=..." argument. The first hit wins.
VLLM_HOST_IP=""
for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do
    arg="${ADDITIONAL_ARGS[i]}"
    if [[ "${arg}" == "-e" ]]; then
        # "-e" takes its value from the following argument (empty if none).
        next="${ADDITIONAL_ARGS[i + 1]:-}"
        if [[ "${next}" == VLLM_HOST_IP=* ]]; then
            VLLM_HOST_IP="${next#VLLM_HOST_IP=}"
            break
        fi
    elif [[ "${arg}" == -eVLLM_HOST_IP=* || "${arg}" == VLLM_HOST_IP=* ]]; then
        # Everything after the first "=" is the address.
        VLLM_HOST_IP="${arg#*=}"
        break
    fi
done
# On the head node an explicitly supplied VLLM_HOST_IP overrides the head
# address whenever the two disagree; a warning is printed before overriding.
if [[ "${NODE_TYPE}" == "--head" ]] \
    && [[ -n "${VLLM_HOST_IP}" ]] \
    && [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
    echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})."
    echo "Using VLLM_HOST_IP as the head node address."
    HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
fi
# Generate a unique container name with random suffix. # Generate a unique container name with random suffix.
# Docker container names must be unique on each host. # Docker container names must be unique on each host.
# The random suffix allows multiple Ray containers to run simultaneously on the same machine, # The random suffix allows multiple Ray containers to run simultaneously on the same machine,
...@@ -74,36 +102,17 @@ cleanup() { ...@@ -74,36 +102,17 @@ cleanup() {
trap cleanup EXIT trap cleanup EXIT
# Build the Ray start command based on the node role. # Build the Ray start command based on the node role.
# The head node manages the cluster and accepts connections on port 6379, # The head node manages the cluster and accepts connections on port 6379,
# while workers connect to the head's address. # while workers connect to the head's address.
RAY_START_CMD="ray start --block" RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then if [ "${NODE_TYPE}" == "--head" ]; then
RAY_START_CMD+=" --head --port=6379" RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
else else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
fi
# Parse VLLM_HOST_IP from additional args if present. RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings. if [ -n "${VLLM_HOST_IP}" ]; then
VLLM_HOST_IP="" RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
for arg in "${ADDITIONAL_ARGS[@]}"; do
if [[ $arg == "-e" ]]; then
continue
fi
if [[ $arg == VLLM_HOST_IP=* ]]; then
VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
break
fi fi
done
# Build Ray IP environment variables if VLLM_HOST_IP is set.
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
RAY_IP_VARS=()
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_IP_VARS=(
-e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
-e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
)
fi fi
# Launch the container with the assembled parameters. # Launch the container with the assembled parameters.
...@@ -118,6 +127,5 @@ docker run \ ...@@ -118,6 +127,5 @@ docker run \
--shm-size 10.24g \ --shm-size 10.24g \
--gpus all \ --gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${RAY_IP_VARS[@]}" \
"${ADDITIONAL_ARGS[@]}" \ "${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}" "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
...@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { ...@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
"content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.", "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
} }
], ],
"extra_body": { "extra_body": {
......
...@@ -16,7 +16,7 @@ import requests ...@@ -16,7 +16,7 @@ import requests
# - start vllm in serving mode with the below args # - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch # --model-impl terratorch
# --task embed --trust-remote-code # --trust-remote-code
# --skip-tokenizer-init --enforce-eager # --skip-tokenizer-init --enforce-eager
# --io-processor-plugin terratorch_segmentation # --io-processor-plugin terratorch_segmentation
# --enable-mm-embeds # --enable-mm-embeds
......
...@@ -305,7 +305,7 @@ def get_query(modality: QueryModality): ...@@ -305,7 +305,7 @@ def get_query(modality: QueryModality):
raise ValueError(msg) raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: int | None): def run_encode(model: str, modality: QueryModality, seed: int):
query = get_query(modality) query = get_query(modality)
req_data = model_example_map[model](query) req_data = model_example_map[model](query)
...@@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None): ...@@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None):
print("-" * 50) print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: int | None): def run_score(model: str, modality: QueryModality, seed: int):
query = get_query(modality) query = get_query(modality)
req_data = model_example_map[model](query) req_data = model_example_map[model](query)
...@@ -390,7 +390,7 @@ def parse_args(): ...@@ -390,7 +390,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--seed", "--seed",
type=int, type=int,
default=None, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
return parser.parse_args() return parser.parse_args()
......
...@@ -51,6 +51,7 @@ hooks: ...@@ -51,6 +51,7 @@ hooks:
- docs/mkdocs/hooks/remove_announcement.py - docs/mkdocs/hooks/remove_announcement.py
- docs/mkdocs/hooks/generate_examples.py - docs/mkdocs/hooks/generate_examples.py
- docs/mkdocs/hooks/generate_argparse.py - docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/generate_metrics.py
- docs/mkdocs/hooks/url_schemes.py - docs/mkdocs/hooks/url_schemes.py
plugins: plugins:
......
...@@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser ...@@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0 anthropic == 0.71.0
model-hosting-container-standards >= 0.1.9, < 1.0.0 model-hosting-container-standards >= 0.1.9, < 1.0.0
\ No newline at end of file mcp
\ No newline at end of file
...@@ -75,7 +75,7 @@ torchgeo==0.7.0 ...@@ -75,7 +75,7 @@ torchgeo==0.7.0
mteb==2.1.2 mteb==2.1.2
# Data processing # Data processing
xgrammar==0.1.27 xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling # Test async scheduling
# Utilities # Utilities
......
...@@ -23,14 +23,6 @@ class TestParameterSweepItem: ...@@ -23,14 +23,6 @@ class TestParameterSweepItem:
{"compilation_config.use_inductor_graph_partition": True}, {"compilation_config.use_inductor_graph_partition": True},
"--compilation-config.use_inductor_graph_partition=true", "--compilation-config.use_inductor_graph_partition=true",
), ),
(
{"compilation_config.use_inductor": False},
"--compilation-config.use_inductor=false",
),
(
{"compilation_config.use_inductor": True},
"--compilation-config.use_inductor=true",
),
], ],
) )
def test_nested_boolean_params(self, input_dict, expected): def test_nested_boolean_params(self, input_dict, expected):
......
...@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer ...@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import flat_product, multi_gpu_test from ...utils import flat_product, multi_gpu_test
is_blackwell = lambda: current_platform.is_device_capability(100) is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Are we running on Blackwell, a lot of tests depend on it""" """Are we running on Blackwell, a lot of tests depend on it"""
class Matches(NamedTuple): class Matches(NamedTuple):
attention_fusion: int = 0 attention_fusion: int = 0
allreduce_fusion: int = 0 allreduce_fusion: int = 0
rms_quant_norm_fusion: int = 0
sequence_parallel: int = 0 sequence_parallel: int = 0
async_tp: int = 0 async_tp: int = 0
...@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple): ...@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple):
MODELS_FP8: list[ModelBackendTestCase] = [] MODELS_FP8: list[ModelBackendTestCase] = []
MODELS_FP4: list[ModelBackendTestCase] = [] MODELS_FP4: list[ModelBackendTestCase] = []
MODELS_GROUP_FP8: list[ModelBackendTestCase] = []
MODELS: list[ModelBackendTestCase] = [] # tp-only MODELS: list[ModelBackendTestCase] = [] # tp-only
if current_platform.is_cuda(): if current_platform.is_cuda():
...@@ -138,6 +140,17 @@ elif current_platform.is_rocm(): ...@@ -138,6 +140,17 @@ elif current_platform.is_rocm():
CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
def has_cuda_graph_wrapper_metadata() -> bool:
    """Report whether ``torch._inductor.utils`` has ``CUDAGraphWrapperMetadata``.

    Returns:
        True when the module can be imported and defines the attribute;
        False when the attribute is missing or the module itself cannot be
        imported.
    """
    from importlib import import_module

    try:
        module = import_module("torch._inductor.utils")
    except ImportError:
        # This helper feeds a ``skipif`` mark, so it runs while pytest is
        # collecting tests. A missing or relocated private module must read
        # as "feature absent" rather than abort collection with an error.
        return False
    return hasattr(module, "CUDAGraphWrapperMetadata")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name, model_kwargs, backend, matches, custom_ops", "model_name, model_kwargs, backend, matches, custom_ops",
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
...@@ -145,7 +158,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] ...@@ -145,7 +158,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
# quant_fp4 only has the custom impl # quant_fp4 only has the custom impl
+ list(flat_product(MODELS_FP4, [""])), + list(flat_product(MODELS_FP4, [""])),
) )
@pytest.mark.parametrize("inductor_graph_partition", [True, False]) @pytest.mark.parametrize(
"inductor_graph_partition",
[
pytest.param(
True,
marks=pytest.mark.skipif(
not has_cuda_graph_wrapper_metadata(),
reason="This test requires"
"torch._inductor.utils.CUDAGraphWrapperMetadata to run",
),
),
False,
],
)
def test_attn_quant( def test_attn_quant(
model_name: str, model_name: str,
model_kwargs: dict[str, Any], model_kwargs: dict[str, Any],
...@@ -474,3 +500,81 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg ...@@ -474,3 +500,81 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
compilation_config.compile_ranges_split_points = ( compilation_config.compile_ranges_split_points = (
llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
) )
if current_platform.is_cuda():
    # Single CUDA-only case: the FP8 Qwen3 MoE checkpoint on the Triton
    # attention backend, expected to report 48 rms_quant_norm_fusion matches.
    MODELS_GROUP_FP8 = [
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B-FP8",
            model_kwargs={"max_model_len": 1024, "kv_cache_dtype": "fp8"},
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(rms_quant_norm_fusion=48),
        ),
    ]

# One custom_ops spec enabling both custom ops; the test splits it on ",".
CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test rms norm+group quant_fp8 fusion
    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
# TODO: remove skip after we fix the fusion thoroughly
@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
def test_rms_group_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Run a model with norm+quant fusion enabled and check the logged count.

    The test passes when the captured logs contain exactly one
    "Replaced N patterns" line attributed to fusion.py and N equals
    ``matches.rms_quant_norm_fusion``.
    """
    if inductor_graph_partition:
        if not is_torch_equal_or_newer("2.9.0.dev"):
            pytest.skip("Inductor graph partition requires torch>=2.9")
        cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    if custom_ops:
        requested_ops = custom_ops.split(",")
    else:
        requested_ops = []

    env_overrides = (
        # Turn the compile cache off so the custom passes really run;
        # otherwise fusion cannot be verified through the logs.
        ("VLLM_DISABLE_COMPILE_CACHE", "1"),
        # Capturing subprocess logs depends on the start method in use;
        # spawn is forced because it is the more general one.
        ("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
        ("VLLM_ATTENTION_BACKEND", backend.name),
    )
    for env_name, env_value in env_overrides:
        monkeypatch.setenv(env_name, env_value)

    compilation_config = CompilationConfig(
        # Testing properties
        custom_ops=requested_ops,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=cudagraph_mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(compilation_config, model_name, **model_kwargs)

    replaced_counts = re.findall(
        r"\[fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(replaced_counts) == 1, log_holder.text
    assert int(replaced_counts[0]) == matches.rms_quant_norm_fusion
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy import copy
import logging
from contextlib import nullcontext from contextlib import nullcontext
from unittest.mock import patch from unittest.mock import patch
...@@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass ...@@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig
from vllm.config.compilation import CompilationMode, PassConfig from vllm.config.compilation import CompilationMode, PassConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.logger import _print_warning_once
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import _is_torch_equal_or_newer from vllm.utils.torch_utils import _is_torch_equal_or_newer
...@@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): ...@@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
pass_config={"enable_attn_fusion": True, "enable_noop": True}, pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
cudagraph_mode=CUDAGraphMode.PIECEWISE, cudagraph_mode=CUDAGraphMode.PIECEWISE,
), ),
...@@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init( ...@@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init(
vllm_config.compilation_config.max_cudagraph_capture_size vllm_config.compilation_config.max_cudagraph_capture_size
== expected_max_size == expected_max_size
) )
def test_pass_config_deprecation(caplog_vllm):
    """Check deprecated ``PassConfig`` flags against their replacements.

    For each deprecated flag the test expects a "<flag> is deprecated"
    message in the captured log, the replacement flag(s) to be True, and the
    deprecated flag itself to still read True. It then checks that old-style
    and new-style configs produce the same hash.
    """
    caplog_vllm.set_level(logging.WARNING)

    # Clear cache to ensure warnings are re-issued
    _print_warning_once.cache_clear()

    # Deprecated flag -> replacement flag(s) it is expected to switch on.
    renamed_flags = {
        "enable_fusion": ("fuse_norm_quant", "fuse_act_quant"),
        "enable_attn_fusion": ("fuse_attn_quant",),
        "enable_noop": ("eliminate_noops",),
        "enable_sequence_parallelism": ("enable_sp",),
        "enable_async_tp": ("fuse_gemm_comms",),
        "enable_fi_allreduce_fusion": ("fuse_allreduce_rms",),
    }
    for old_flag, new_flags in renamed_flags.items():
        caplog_vllm.clear()
        config = PassConfig(**{old_flag: True})
        assert f"{old_flag} is deprecated" in caplog_vllm.text
        for new_flag in new_flags:
            assert getattr(config, new_flag) is True
        assert getattr(config, old_flag) is True

    # Hash consistency between the old and the new spelling.
    equivalent_kwargs = (
        (
            {"enable_fusion": True},
            {"fuse_norm_quant": True, "fuse_act_quant": True},
        ),
        ({"enable_async_tp": True}, {"fuse_gemm_comms": True}),
    )
    for old_kwargs, new_kwargs in equivalent_kwargs:
        config_old = PassConfig(**old_kwargs)
        config_new = PassConfig(**new_kwargs)
        assert config_old.compute_hash() == config_new.compute_hash()
...@@ -36,7 +36,7 @@ def get_test_models(): ...@@ -36,7 +36,7 @@ def get_test_models():
DynamicShapesType.BACKED_SIZE_OBLIVIOUS, DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
], ],
) )
@pytest.mark.parametrize("use_aot_compile", ["0"]) @pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False]) @pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True]) @pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif( @pytest.mark.skipif(
...@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation( ...@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED: if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0") pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("unbacked dynamic shapes do not add guards")
if evaluate_guards and use_aot_compile:
pytest.skip("evaluate_guards requires use_aot_compile=0")
monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile) monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
...@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards( ...@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
and dynamic_shapes_type == DynamicShapesType.BACKED and dynamic_shapes_type == DynamicShapesType.BACKED
and evaluate_guards and evaluate_guards
): ):
pytest.skip("evaluate_guards for backed does not work with aot_compile =1") pytest.skip("evaluate_guards for backed does not work with aot_compile=1")
@support_torch_compile @support_torch_compile
class ModelWithSizeCheck(torch.nn.Module): class ModelWithSizeCheck(torch.nn.Module):
......
...@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module): ...@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module):
class TestRotaryEmbedding(torch.nn.Module): class TestRotaryEmbedding(torch.nn.Module):
def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): def __init__(self, head_dim=64, max_position=2048, base=10000):
super().__init__() super().__init__()
self.head_dim = head_dim self.head_dim = head_dim
self.rotary_dim = rotary_dim or head_dim
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_dim, self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position, max_position=max_position,
rope_parameters={"rope_type": "default", "rope_theta": base}, rope_parameters={"rope_type": "default", "rope_theta": base},
) )
...@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module): ...@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_dim, self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
rope_parameters={"rope_type": "default", "rope_theta": base}, rope_parameters={"rope_type": "default", "rope_theta": base},
) )
......
...@@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): ...@@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
@pytest.fixture
def workspace_init():
    """Set up the workspace manager for a test and reset it afterwards.

    When CUDA is available the manager is initialized on ``cuda:0`` before
    the test body runs; without CUDA nothing is initialized. The manager is
    reset after the test in either case.

    Tests that create a full vLLM engine should NOT use this fixture, as the
    engine will initialize the workspace manager itself.
    """
    from vllm.v1.worker.workspace import init_workspace_manager
    from vllm.v1.worker.workspace import reset_workspace_manager

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        init_workspace_manager(torch.device("cuda:0"))

    yield

    reset_workspace_manager()
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def dynamo_reset(): def dynamo_reset():
yield yield
...@@ -681,10 +702,16 @@ class HfRunner: ...@@ -681,10 +702,16 @@ class HfRunner:
**kwargs, **kwargs,
) )
# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states = (
getattr(output, "hidden_states", None) or output.decoder_hidden_states
)
( (
seq_logprobs_lst, seq_logprobs_lst,
output_len, output_len,
) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs) ) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)
all_logprobs.append(seq_logprobs_lst) all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0] seq_ids = output.sequences[0]
...@@ -741,7 +768,7 @@ class VllmRunner: ...@@ -741,7 +768,7 @@ class VllmRunner:
tokenizer_name: str | None = None, tokenizer_name: str | None = None,
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = True, trust_remote_code: bool = True,
seed: int | None = 0, seed: int = 0,
max_model_len: int | None = 1024, max_model_len: int | None = 1024,
dtype: str = "auto", dtype: str = "auto",
disable_log_stats: bool = True, disable_log_stats: bool = True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment