Commit a3f8d5dd authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
......@@ -77,7 +77,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
......
......@@ -158,7 +158,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
......
......@@ -158,7 +158,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
......
......@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
)
def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for ByteDance-Seed/BAGEL-7B-MoT.

    Only the ``"image"`` modality is accepted. One prompt is produced per
    question, each consisting of a user turn holding a single image
    placeholder plus the question, followed by an open assistant turn.
    """
    assert modality == "image"

    bagel_engine_args = EngineArgs(
        model="ByteDance-Seed/BAGEL-7B-MoT",
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    chat_prompts = []
    for question in questions:
        user_turn = f"<|im_start|>user\n<|image_pad|>\n{question}<|im_end|>\n"
        assistant_turn = "<|im_start|>assistant\n"
        chat_prompts.append(user_turn + assistant_turn)

    return ModelRequestData(engine_args=bagel_engine_args, prompts=chat_prompts)
# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
"bagel": run_bagel,
"bee": run_bee,
"blip-2": run_blip2,
"chameleon": run_chameleon,
......@@ -2031,7 +2058,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
......
......@@ -1382,7 +1382,7 @@ def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
seed: int,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
......@@ -1416,7 +1416,7 @@ def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
seed: int,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
......@@ -1494,7 +1494,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(
......
......@@ -21,7 +21,7 @@
# --worker \
# /abs/path/to/huggingface/cache \
# -e VLLM_HOST_IP=<worker_node_ip>
#
#
# Each worker requires a unique VLLM_HOST_IP value.
# Keep each terminal session open. Closing a session stops the associated Ray
# node and thereby shuts down the entire cluster.
......@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
exit 1
fi
# Scan ADDITIONAL_ARGS for a VLLM_HOST_IP assignment; the first match wins.
# Recognized spellings: "-e VLLM_HOST_IP=<ip>", "-eVLLM_HOST_IP=<ip>" and a
# bare "VLLM_HOST_IP=<ip>" argument.
VLLM_HOST_IP=""
idx=0
while ((idx < ${#ADDITIONAL_ARGS[@]})); do
    current="${ADDITIONAL_ARGS[$idx]}"
    if [[ "${current}" == "-e" ]]; then
        # The value of a separate "-e" flag lives in the following argument
        # (empty when "-e" is the last argument).
        following="${ADDITIONAL_ARGS[$((idx + 1))]:-}"
        if [[ "${following}" == VLLM_HOST_IP=* ]]; then
            VLLM_HOST_IP="${following#VLLM_HOST_IP=}"
            break
        fi
    elif [[ "${current}" == -eVLLM_HOST_IP=* || "${current}" == VLLM_HOST_IP=* ]]; then
        # Keep everything after the first "=".
        VLLM_HOST_IP="${current#*=}"
        break
    fi
    idx=$((idx + 1))
done
# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then
if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})."
echo "Using VLLM_HOST_IP as the head node address."
HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
fi
fi
# Generate a unique container name with random suffix.
# Docker container names must be unique on each host.
# The random suffix allows multiple Ray containers to run simultaneously on the same machine,
......@@ -74,36 +102,17 @@ cleanup() {
trap cleanup EXIT
# Build the Ray start command based on the node role.
# The head node manages the cluster and accepts connections on port 6379,
# The head node manages the cluster and accepts connections on port 6379,
# while workers connect to the head's address.
RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then
RAY_START_CMD+=" --head --port=6379"
RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
fi
# Parse VLLM_HOST_IP from additional args if present.
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
VLLM_HOST_IP=""
for arg in "${ADDITIONAL_ARGS[@]}"; do
if [[ $arg == "-e" ]]; then
continue
fi
if [[ $arg == VLLM_HOST_IP=* ]]; then
VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
break
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
fi
done
# Build Ray IP environment variables if VLLM_HOST_IP is set.
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
RAY_IP_VARS=()
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_IP_VARS=(
-e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
-e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
)
fi
# Launch the container with the assembled parameters.
......@@ -118,6 +127,5 @@ docker run \
--shm-size 10.24g \
--gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${RAY_IP_VARS[@]}" \
"${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
......@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"messages": [
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
"extra_body": {
......
......@@ -16,7 +16,7 @@ import requests
# - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch
# --task embed --trust-remote-code
# --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin terratorch_segmentation
# --enable-mm-embeds
......
......@@ -305,7 +305,7 @@ def get_query(modality: QueryModality):
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: int | None):
def run_encode(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
......@@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None):
print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: int | None):
def run_score(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
......@@ -390,7 +390,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
......
......@@ -51,6 +51,7 @@ hooks:
- docs/mkdocs/hooks/remove_announcement.py
- docs/mkdocs/hooks/generate_examples.py
- docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/generate_metrics.py
- docs/mkdocs/hooks/url_schemes.py
plugins:
......
......@@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
model-hosting-container-standards >= 0.1.9, < 1.0.0
\ No newline at end of file
model-hosting-container-standards >= 0.1.9, < 1.0.0
mcp
\ No newline at end of file
......@@ -75,7 +75,7 @@ torchgeo==0.7.0
mteb==2.1.2
# Data processing
xgrammar==0.1.27
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling
# Utilities
......
......@@ -23,14 +23,6 @@ class TestParameterSweepItem:
{"compilation_config.use_inductor_graph_partition": True},
"--compilation-config.use_inductor_graph_partition=true",
),
(
{"compilation_config.use_inductor": False},
"--compilation-config.use_inductor=false",
),
(
{"compilation_config.use_inductor": True},
"--compilation-config.use_inductor=true",
),
],
)
def test_nested_boolean_params(self, input_dict, expected):
......
......@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import flat_product, multi_gpu_test
is_blackwell = lambda: current_platform.is_device_capability(100)
is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Are we running on Blackwell, a lot of tests depend on it"""
class Matches(NamedTuple):
attention_fusion: int = 0
allreduce_fusion: int = 0
rms_quant_norm_fusion: int = 0
sequence_parallel: int = 0
async_tp: int = 0
......@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple):
MODELS_FP8: list[ModelBackendTestCase] = []
MODELS_FP4: list[ModelBackendTestCase] = []
MODELS_GROUP_FP8: list[ModelBackendTestCase] = []
MODELS: list[ModelBackendTestCase] = [] # tp-only
if current_platform.is_cuda():
......@@ -138,6 +140,17 @@ elif current_platform.is_rocm():
CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
def has_cuda_graph_wrapper_metadata() -> bool:
    """Check for ``torch._inductor.utils.CUDAGraphWrapperMetadata``.

    Returns:
        ``True`` when the attribute lookup succeeds, ``False`` when it (or
        the module import) raises ``AttributeError``.
    """
    import importlib

    try:
        inductor_utils = importlib.import_module("torch._inductor.utils")
        getattr(inductor_utils, "CUDAGraphWrapperMetadata")  # noqa: B009
    except AttributeError:
        return False
    else:
        return True
@pytest.mark.parametrize(
"model_name, model_kwargs, backend, matches, custom_ops",
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
......@@ -145,7 +158,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
# quant_fp4 only has the custom impl
+ list(flat_product(MODELS_FP4, [""])),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.parametrize(
"inductor_graph_partition",
[
pytest.param(
True,
marks=pytest.mark.skipif(
not has_cuda_graph_wrapper_metadata(),
reason="This test requires"
"torch._inductor.utils.CUDAGraphWrapperMetadata to run",
),
),
False,
],
)
def test_attn_quant(
model_name: str,
model_kwargs: dict[str, Any],
......@@ -474,3 +500,81 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
compilation_config.compile_ranges_split_points = (
llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
)
if current_platform.is_cuda():
MODELS_GROUP_FP8 = [
ModelBackendTestCase(
model_name="Qwen/Qwen3-30B-A3B-FP8",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
rms_quant_norm_fusion=48,
),
),
]
CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # RMSNorm + group quant_fp8 fusion cases
    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
# TODO: remove skip after we fix the fusion thoroughly
@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
def test_rms_group_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Check that the norm+quant fusion pass reports the expected match count.

    The model is run with ``fuse_norm_quant`` enabled while DEBUG logs are
    captured. Exactly one ``Replaced N patterns`` line from ``fusion.py`` must
    appear, and ``N`` must equal ``matches.rms_quant_norm_fusion``.
    """
    if inductor_graph_partition:
        if not is_torch_equal_or_newer("2.9.0.dev"):
            pytest.skip("Inductor graph partition requires torch>=2.9")
        cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    enabled_ops: list[str] = []
    if custom_ops:
        enabled_ops = custom_ops.split(",")

    env_overrides = {
        # Disable the compile cache to make sure custom passes run.
        # Otherwise, we can't verify fusion happened through the logs.
        "VLLM_DISABLE_COMPILE_CACHE": "1",
        # To capture subprocess logs, we need to know whether spawn or fork
        # is used. Force spawn as it is more general.
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
        "VLLM_ATTENTION_BACKEND": backend.name,
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    config_under_test = CompilationConfig(
        # Testing properties
        custom_ops=enabled_ops,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=cudagraph_mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(config_under_test, model_name, **model_kwargs)

    captured_log = log_holder.text
    replaced_counts = [
        found.group(1)
        for found in re.finditer(
            r"\[fusion.py:\d+] Replaced (\d+) patterns", captured_log
        )
    ]
    assert len(replaced_counts) == 1, log_holder.text
    assert int(replaced_counts[0]) == matches.rms_quant_norm_fusion
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import logging
from contextlib import nullcontext
from unittest.mock import patch
......@@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig
from vllm.config.compilation import CompilationMode, PassConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.logger import _print_warning_once
from vllm.platforms import current_platform
from vllm.utils.torch_utils import _is_torch_equal_or_newer
......@@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
pass_config={"enable_attn_fusion": True, "enable_noop": True},
pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
custom_ops=["+quant_fp8"],
cudagraph_mode=CUDAGraphMode.PIECEWISE,
),
......@@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init(
vllm_config.compilation_config.max_cudagraph_capture_size
== expected_max_size
)
def test_pass_config_deprecation(caplog_vllm):
    """Verify the deprecated ``PassConfig`` flag names.

    For each deprecated keyword the test constructs a ``PassConfig`` with that
    keyword set to ``True`` and asserts that:

    * a "<name> is deprecated" message shows up in the captured log,
    * the replacement flag(s) are ``True``, and
    * the deprecated attribute itself still reads back as ``True``.

    It finishes by checking that ``compute_hash()`` is the same whether a
    config is built from the deprecated keyword or from its replacement(s).
    """
    caplog_vllm.set_level(logging.WARNING)
    # Clear cache to ensure warnings are re-issued
    _print_warning_once.cache_clear()
    # Test enable_fusion -> fuse_norm_quant, fuse_act_quant
    # The captured log is cleared before every case so each substring check
    # only sees messages emitted by the construction that follows.
    caplog_vllm.clear()
    config = PassConfig(enable_fusion=True)
    assert "enable_fusion is deprecated" in caplog_vllm.text
    assert config.fuse_norm_quant is True
    assert config.fuse_act_quant is True
    assert config.enable_fusion is True
    # Test enable_attn_fusion -> fuse_attn_quant
    caplog_vllm.clear()
    config = PassConfig(enable_attn_fusion=True)
    assert "enable_attn_fusion is deprecated" in caplog_vllm.text
    assert config.fuse_attn_quant is True
    assert config.enable_attn_fusion is True
    # Test enable_noop -> eliminate_noops
    caplog_vllm.clear()
    config = PassConfig(enable_noop=True)
    assert "enable_noop is deprecated" in caplog_vllm.text
    assert config.eliminate_noops is True
    assert config.enable_noop is True
    # Test enable_sequence_parallelism -> enable_sp
    caplog_vllm.clear()
    config = PassConfig(enable_sequence_parallelism=True)
    assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
    assert config.enable_sp is True
    assert config.enable_sequence_parallelism is True
    # Test enable_async_tp -> fuse_gemm_comms
    caplog_vllm.clear()
    config = PassConfig(enable_async_tp=True)
    assert "enable_async_tp is deprecated" in caplog_vllm.text
    assert config.fuse_gemm_comms is True
    assert config.enable_async_tp is True
    # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
    caplog_vllm.clear()
    config = PassConfig(enable_fi_allreduce_fusion=True)
    assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
    assert config.fuse_allreduce_rms is True
    assert config.enable_fi_allreduce_fusion is True
    # Test hash consistency
    config_old = PassConfig(enable_fusion=True)
    config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
    assert config_old.compute_hash() == config_new.compute_hash()
    config_old = PassConfig(enable_async_tp=True)
    config_new = PassConfig(fuse_gemm_comms=True)
    assert config_old.compute_hash() == config_new.compute_hash()
......@@ -36,7 +36,7 @@ def get_test_models():
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
],
)
@pytest.mark.parametrize("use_aot_compile", ["0"])
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif(
......@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("unbacked dynamic shapes do not add guards")
if evaluate_guards and use_aot_compile:
pytest.skip("evaluate_guards requires use_aot_compile=0")
monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
......@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
and dynamic_shapes_type == DynamicShapesType.BACKED
and evaluate_guards
):
pytest.skip("evaluate_guards for backed does not work with aot_compile =1")
pytest.skip("evaluate_guards for backed does not work with aot_compile=1")
@support_torch_compile
class ModelWithSizeCheck(torch.nn.Module):
......
......@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module):
class TestRotaryEmbedding(torch.nn.Module):
def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000):
def __init__(self, head_dim=64, max_position=2048, base=10000):
super().__init__()
self.head_dim = head_dim
self.rotary_dim = rotary_dim or head_dim
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position,
rope_parameters={"rope_type": "default", "rope_theta": base},
)
......@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
rope_parameters={"rope_type": "default", "rope_theta": base},
)
......
......@@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup_dist_env_and_memory()
@pytest.fixture
def workspace_init():
    """Set up the workspace manager around a test and reset it afterwards.

    When CUDA is available, the workspace manager is initialized on
    ``cuda:0`` before the test body runs. The reset call is issued after the
    test whether or not initialization happened. Tests that create a full
    vLLM engine should NOT use this fixture, as the engine will initialize
    the workspace manager itself.
    """
    from vllm.v1.worker.workspace import (
        init_workspace_manager,
        reset_workspace_manager,
    )

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        init_workspace_manager(torch.device("cuda:0"))

    yield

    reset_workspace_manager()
@pytest.fixture(autouse=True)
def dynamo_reset():
yield
......@@ -681,10 +702,16 @@ class HfRunner:
**kwargs,
)
# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states = (
getattr(output, "hidden_states", None) or output.decoder_hidden_states
)
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
......@@ -741,7 +768,7 @@ class VllmRunner:
tokenizer_name: str | None = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
seed: int | None = 0,
seed: int = 0,
max_model_len: int | None = 1024,
dtype: str = "auto",
disable_log_stats: bool = True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment