"components/vscode:/vscode.git/clone" did not exist on "da0f2fb84a3b42eda95cc320ce9e35d4e35c85c7"
Commit a99300bd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

parents cc3e01c7 5438967f
...@@ -21,7 +21,6 @@ runai-model-streamer-s3==0.11.0 ...@@ -21,7 +21,6 @@ runai-model-streamer-s3==0.11.0
# conch-triton-kernels==1.2.1 # numpy>=1.26.4 # conch-triton-kernels==1.2.1 # numpy>=1.26.4
numa numa
python-multipart
pytrie pytrie
setuptools_scm>=8 setuptools_scm>=8
cmake==3.29 cmake==3.29
...@@ -30,4 +29,6 @@ torch == 2.5.1 ...@@ -30,4 +29,6 @@ torch == 2.5.1
triton == 3.0.0 triton == 3.0.0
flash_attn == 2.6.1 flash_attn == 2.6.1
flash_mla == 1.0.0 flash_mla == 1.0.0
lmslim == 0.3.1 lightop == 0.5.0
\ No newline at end of file lmslim == 0.3.1
...@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests ...@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
timm >=1.0.17 # required for internvl and gemma3n-mm test timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.7.1 torch==2.8.0
torchaudio==2.7.1 torchaudio==2.8.0
torchvision==0.22.1 torchvision==0.23.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test mistral_common[image,audio] >= 1.8.2 # required for voxtral test
...@@ -32,9 +32,10 @@ num2words # required for smolvlm test ...@@ -32,9 +32,10 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test # TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.0 transformers==4.55.2
tokenizers==0.21.1 tokenizers==0.21.1
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
...@@ -53,3 +54,4 @@ runai-model-streamer-s3==0.11.0 ...@@ -53,3 +54,4 @@ runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10 fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10 pydantic>=2.10 # 2.9 leads to error on python 3.10
terratorch==1.1rc2 # required for PrithviMAE test terratorch==1.1rc2 # required for PrithviMAE test
decord==0.6.0
...@@ -156,6 +156,8 @@ datasets==3.0.2 ...@@ -156,6 +156,8 @@ datasets==3.0.2
# mteb # mteb
decorator==5.1.1 decorator==5.1.1
# via librosa # via librosa
decord==0.6.0
# via -r requirements/test.in
dill==0.3.8 dill==0.3.8
# via # via
# datasets # datasets
...@@ -408,7 +410,7 @@ lightning-utilities==0.14.3 ...@@ -408,7 +410,7 @@ lightning-utilities==0.14.3
# torchmetrics # torchmetrics
llvmlite==0.44.0 llvmlite==0.44.0
# via numba # via numba
lm-eval==0.4.8 lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via # via
...@@ -493,6 +495,7 @@ numpy==1.26.4 ...@@ -493,6 +495,7 @@ numpy==1.26.4
# contourpy # contourpy
# cupy-cuda12x # cupy-cuda12x
# datasets # datasets
# decord
# einx # einx
# encodec # encodec
# evaluate # evaluate
...@@ -538,42 +541,42 @@ numpy==1.26.4 ...@@ -538,42 +541,42 @@ numpy==1.26.4
# tritonclient # tritonclient
# vocos # vocos
# xarray # xarray
nvidia-cublas-cu12==12.8.3.14 nvidia-cublas-cu12==12.8.4.1
# via # via
# nvidia-cudnn-cu12 # nvidia-cudnn-cu12
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# torch # torch
nvidia-cuda-cupti-cu12==12.8.57 nvidia-cuda-cupti-cu12==12.8.90
# via torch # via torch
nvidia-cuda-nvrtc-cu12==12.8.61 nvidia-cuda-nvrtc-cu12==12.8.93
# via torch # via torch
nvidia-cuda-runtime-cu12==12.8.57 nvidia-cuda-runtime-cu12==12.8.90
# via torch # via torch
nvidia-cudnn-cu12==9.7.1.26 nvidia-cudnn-cu12==9.10.2.21
# via torch # via torch
nvidia-cufft-cu12==11.3.3.41 nvidia-cufft-cu12==11.3.3.83
# via torch # via torch
nvidia-cufile-cu12==1.13.0.11 nvidia-cufile-cu12==1.13.1.3
# via torch # via torch
nvidia-curand-cu12==10.3.9.55 nvidia-curand-cu12==10.3.9.90
# via torch # via torch
nvidia-cusolver-cu12==11.7.2.55 nvidia-cusolver-cu12==11.7.3.90
# via torch # via torch
nvidia-cusparse-cu12==12.5.7.53 nvidia-cusparse-cu12==12.5.8.93
# via # via
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# torch # torch
nvidia-cusparselt-cu12==0.6.3 nvidia-cusparselt-cu12==0.7.1
# via torch # via torch
nvidia-nccl-cu12==2.26.2 nvidia-nccl-cu12==2.27.3
# via torch # via torch
nvidia-nvjitlink-cu12==12.8.61 nvidia-nvjitlink-cu12==12.8.93
# via # via
# nvidia-cufft-cu12 # nvidia-cufft-cu12
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# nvidia-cusparse-cu12 # nvidia-cusparse-cu12
# torch # torch
nvidia-nvtx-cu12==12.8.55 nvidia-nvtx-cu12==12.8.90
# via torch # via torch
omegaconf==2.3.0 omegaconf==2.3.0
# via # via
...@@ -742,7 +745,7 @@ pycparser==2.22 ...@@ -742,7 +745,7 @@ pycparser==2.22
# via cffi # via cffi
pycryptodomex==3.22.0 pycryptodomex==3.22.0
# via blobfile # via blobfile
pydantic==2.11.5 pydantic==2.11.7
# via # via
# -r requirements/test.in # -r requirements/test.in
# albumentations # albumentations
...@@ -1066,7 +1069,7 @@ tomli==2.2.1 ...@@ -1066,7 +1069,7 @@ tomli==2.2.1
# via schemathesis # via schemathesis
tomli-w==1.2.0 tomli-w==1.2.0
# via schemathesis # via schemathesis
torch==2.7.1+cu128 torch==2.8.0+cu128
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
...@@ -1095,7 +1098,7 @@ torch==2.7.1+cu128 ...@@ -1095,7 +1098,7 @@ torch==2.7.1+cu128
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.7.1+cu128 torchaudio==2.8.0+cu128
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
...@@ -1108,7 +1111,7 @@ torchmetrics==1.7.4 ...@@ -1108,7 +1111,7 @@ torchmetrics==1.7.4
# pytorch-lightning # pytorch-lightning
# terratorch # terratorch
# torchgeo # torchgeo
torchvision==0.22.1+cu128 torchvision==0.23.0+cu128
# via # via
# -r requirements/test.in # -r requirements/test.in
# lightly # lightly
...@@ -1139,7 +1142,7 @@ tqdm==4.66.6 ...@@ -1139,7 +1142,7 @@ tqdm==4.66.6
# transformers # transformers
tqdm-multiprocess==0.0.11 tqdm-multiprocess==0.0.11
# via lm-eval # via lm-eval
transformers==4.55.0 transformers==4.55.2
# via # via
# -r requirements/test.in # -r requirements/test.in
# genai-perf # genai-perf
...@@ -1149,7 +1152,7 @@ transformers==4.55.0 ...@@ -1149,7 +1152,7 @@ transformers==4.55.0
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.3.1 triton==3.4.0
# via torch # via torch
tritonclient==2.51.0 tritonclient==2.51.0
# via # via
......
...@@ -11,6 +11,7 @@ ray[default] ...@@ -11,6 +11,7 @@ ray[default]
ray[data] ray[data]
setuptools==78.1.0 setuptools==78.1.0
nixl==0.3.0 nixl==0.3.0
tpu_info==0.4.0
# Install torch_xla # Install torch_xla
--pre --pre
......
...@@ -550,8 +550,8 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -550,8 +550,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.10.1.1" __version__ = "0.10.2.rc1"
__version_tuple__ = (0, 10, 1, 1) __version_tuple__ = (0, 10, 2, rc1)
__hcu_version__ = f'0.10.1.1+{version}' __hcu_version__ = f'0.10.1.1+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
...@@ -765,16 +765,25 @@ if envs.VLLM_USE_PRECOMPILED: ...@@ -765,16 +765,25 @@ if envs.VLLM_USE_PRECOMPILED:
if wheel_location is not None: if wheel_location is not None:
wheel_url = wheel_location wheel_url = wheel_location
else: else:
import platform
arch = platform.machine()
if arch == "x86_64":
wheel_tag = "manylinux1_x86_64"
elif arch == "aarch64":
wheel_tag = "manylinux2014_aarch64"
else:
raise ValueError(f"Unsupported architecture: {arch}")
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
nightly_wheel_url = f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
from urllib.request import urlopen from urllib.request import urlopen
try: try:
with urlopen(wheel_url) as resp: with urlopen(wheel_url) as resp:
if resp.status != 200: if resp.status != 200:
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" wheel_url = nightly_wheel_url
except Exception as e: except Exception as e:
print(f"[warn] Falling back to nightly wheel: {e}") print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" wheel_url = nightly_wheel_url
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url) wheel_url)
...@@ -807,7 +816,9 @@ setup( ...@@ -807,7 +816,9 @@ setup(
"mistral_common[audio]"], # Required for audio processing "mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility "video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile # FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.2.11"], "flashinfer": ["flashinfer-python==0.2.14.post1"],
# Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"],
}, },
cmdclass=cmdclass, cmdclass=cmdclass,
package_data=package_data, package_data=package_data,
......
...@@ -12,7 +12,6 @@ import pytest ...@@ -12,7 +12,6 @@ import pytest
import torch import torch
from vllm import LLM, envs from vllm import LLM, envs
from vllm.platforms import current_platform
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1 from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from ..conftest import HfRunner, VllmRunner from ..conftest import HfRunner, VllmRunner
...@@ -83,11 +82,7 @@ def test_models( ...@@ -83,11 +82,7 @@ def test_models(
"VLLM_USE_V1") and envs.VLLM_USE_V1: "VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.") pytest.skip("enable_prompt_embeds is not supported in v1.")
if backend == "FLASHINFER" and current_platform.is_rocm(): if backend == "XFORMERS" and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend in ("XFORMERS",
"FLASHINFER") and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
...@@ -146,6 +141,7 @@ def test_models( ...@@ -146,6 +141,7 @@ def test_models(
) )
# @multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
# @pytest.mark.parametrize( # @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, " # "model, distributed_executor_backend, attention_backend, "
...@@ -162,8 +158,6 @@ def test_models( ...@@ -162,8 +158,6 @@ def test_models(
# ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}), # ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
# ("distilbert/distilgpt2", "ray", "", "A100", {}), # ("distilbert/distilgpt2", "ray", "", "A100", {}),
# ("distilbert/distilgpt2", "mp", "", "A100", {}), # ("distilbert/distilgpt2", "mp", "", "A100", {}),
# ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100", {}),
# ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", {}),
# ]) # ])
# @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) # @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
# def test_models_distributed( # def test_models_distributed(
......
...@@ -176,4 +176,35 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): ...@@ -176,4 +176,35 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
output3 = llm.generate(prompt, sampling_params) output3 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text assert output[0].outputs[0].text == output3[0].outputs[0].text
\ No newline at end of file
@create_new_process_for_each_test()
def test_deep_sleep():
    """Check GPU memory use and output stability across a level-2 sleep.

    The test generates once, puts the engine to sleep with ``level=2``,
    wakes the weights and reloads them, wakes the KV cache, generates again,
    and requires both generations to produce the same text. After the sleep
    and after the weight wake-up it also bounds the GPU memory used beyond
    the baseline measured before the engine was created.
    """
    model_name = "Qwen/Qwen3-0.6B"
    free_at_start, total_at_start = torch.cuda.mem_get_info()
    # Memory already in use before the engine exists (e.g. by another
    # process); it is subtracted from every later measurement.
    baseline_used = total_at_start - free_at_start

    def used_since_baseline():
        # Re-query the device each time so the reading is current.
        free_now, total_now = torch.cuda.mem_get_info()
        return total_now - free_now - baseline_used

    engine = LLM(model_name, enable_sleep_mode=True)
    question = "How are you?"
    params = SamplingParams(temperature=0, max_tokens=10)
    before_sleep = engine.generate(question, params)

    # Put the engine to deep sleep
    engine.sleep(level=2)
    assert used_since_baseline() < 3 * GiB_bytes

    # Bring back only the weights and reload them before measuring again.
    engine.wake_up(tags=["weights"])
    engine.collective_rpc("reload_weights")
    assert used_since_baseline() < 4 * GiB_bytes

    # now allocate kv cache and cuda graph memory
    engine.wake_up(tags=["kv_cache"])
    after_wake = engine.generate(question, params)

    # The generated text must be the same before and after the sleep cycle.
    assert before_sleep[0].outputs[0].text == after_wake[0].outputs[0].text
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Any, NamedTuple, Optional, cast
import numpy as np
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (RandomDataset, RandomMultiModalDataset,
SampleRequest)
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Session-scoped fixture returning the pretrained ``gpt2`` tokenizer."""
    # gpt2 is chosen because it is small and commonly available.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
class Params(NamedTuple):
    """Sampling arguments shared by the ``RandomDataset`` tests.

    Each field is forwarded by the tests as the keyword argument of the same
    name to ``_collect_samples`` and from there to ``dataset.sample``.
    """

    num_requests: int
    prefix_len: int
    range_ratio: float
    input_len: int
    output_len: int
@pytest.fixture(scope="session")
def random_dataset_params() -> Params:
    """Session-scoped fixture with the sampling arguments for the tests."""
    settings = {
        "num_requests": 16,
        "prefix_len": 7,
        "range_ratio": 0.3,
        "input_len": 50,
        "output_len": 20,
    }
    return Params(**settings)
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
    """Project a SampleRequest into a comparable tuple.

    Returns:
        ``(req.prompt, req.prompt_len, req.expected_output_len)``, so two
        requests can be compared with ``==``.
    """
    return (req.prompt, req.prompt_len, req.expected_output_len)
def _collect_samples(dataset: RandomDataset,
                     tokenizer: PreTrainedTokenizerBase,
                     num_requests: int = 16,
                     prefix_len: int = 7,
                     range_ratio: float = 0.3,
                     input_len: int = 50,
                     output_len: int = 20) -> list[tuple[str, int, int]]:
    """Sample from ``dataset`` and return one fingerprint per request.

    All arguments after ``dataset`` are passed to ``dataset.sample`` as
    keyword arguments of the same name; each returned request is reduced with
    ``_fingerprint_sample``.
    """
    sample_kwargs = {
        "tokenizer": tokenizer,
        "num_requests": num_requests,
        "prefix_len": prefix_len,
        "range_ratio": range_ratio,
        "input_len": input_len,
        "output_len": output_len,
    }
    requests = dataset.sample(**sample_kwargs)
    return list(map(_fingerprint_sample, requests))
@pytest.mark.benchmark
def test_random_dataset_same_seed(
        hf_tokenizer: PreTrainedTokenizerBase,
        random_dataset_params: Params) -> None:
    """Two datasets with one seed must sample identically.

    The global ``random`` and ``np.random`` generators are re-seeded and
    advanced between the two sampling calls, so the test would fail if
    RandomDataset drew from either of them instead of its own generator.
    """
    # The Params field names match _collect_samples' keyword names.
    sampling_kwargs = random_dataset_params._asdict()
    common_seed = 123
    first_dataset = RandomDataset(random_seed=common_seed)
    second_dataset = RandomDataset(random_seed=common_seed)

    first = _collect_samples(first_dataset, hf_tokenizer, **sampling_kwargs)

    # Perturb global RNG state to ensure isolation
    random.seed(999)
    for _ in range(100):
        random.random()
    np.random.seed(888)
    for _ in range(100):
        np.random.random()

    second = _collect_samples(second_dataset, hf_tokenizer,
                              **sampling_kwargs)
    assert first == second
@pytest.mark.benchmark
def test_random_dataset_different_seeds(
        hf_tokenizer: PreTrainedTokenizerBase,
        random_dataset_params: Params) -> None:
    """Datasets built with seeds 0 and 999 must not sample identically."""
    # The Params field names match _collect_samples' keyword names.
    sampling_kwargs = random_dataset_params._asdict()

    first_seed = 0
    first_dataset = RandomDataset(random_seed=first_seed)
    first = _collect_samples(first_dataset, hf_tokenizer, **sampling_kwargs)

    second_seed = 999
    second_dataset = RandomDataset(random_seed=second_seed)
    # Seed the global generators with the *first* seed: if the dataset drew
    # from them, the second run would wrongly reproduce the first.
    random.seed(first_seed)
    np.random.seed(first_seed)
    second = _collect_samples(second_dataset, hf_tokenizer,
                              **sampling_kwargs)

    assert first != second
# -----------------------------
# RandomMultiModalDataset tests
# -----------------------------
def _mm_fingerprint_sample(
    req: SampleRequest,
) -> tuple[str, int, int, int, list[str]]:
    """Create a compact fingerprint for multimodal samples.

    The returned tuple holds, in order:
    - prompt string
    - prompt_len
    - expected_output_len
    - count of multimodal items
    - one string per item: ``"image:"`` or ``"video:"`` followed by the first
      22 characters of the item's URL, or ``"unknown:"`` for any item that is
      not an ``image_url`` / ``video_url`` dict

    An item whose URL payload is missing, ``None`` or not a dict, or whose
    ``url`` is missing or ``None``, contributes an empty URL prefix instead
    of raising.
    """
    # Maps an item's "type" to the label used in its fingerprint entry.
    labels = {"image_url": "image", "video_url": "video"}

    items = req.multi_modal_data or []
    item_prefixes: list[str] = []
    for it in items:
        kind = it.get("type") if isinstance(it, dict) else None
        label = labels.get(kind) if isinstance(kind, str) else None
        if label is None:
            item_prefixes.append("unknown:")
            continue
        # The payload lives under a key equal to the item type.
        payload = it.get(kind)
        url = (payload.get("url") or "") if isinstance(payload, dict) else ""
        # Only keep a short identifying prefix to avoid huge strings
        item_prefixes.append(f"{label}:{url[:22]}")
    return (req.prompt, req.prompt_len, req.expected_output_len, len(items),
            item_prefixes)
def _collect_mm_samples(
    dataset: RandomMultiModalDataset,
    tokenizer: PreTrainedTokenizerBase,
    *,
    num_requests: int = 8,
    prefix_len: int = 3,
    range_ratio: float = 0.0,
    input_len: int = 20,
    output_len: int = 5,
    base_items_per_request: int = 2,
    num_mm_items_range_ratio: float = 0.0,
    limit_mm_per_prompt: Optional[dict[str, int]] = None,
    bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
    enable_multimodal_chat: bool = False,
) -> list[SampleRequest]:
    """Call ``dataset.sample`` with test-friendly defaults.

    Every argument is forwarded to ``dataset.sample`` as a keyword argument
    of the same name, and its return value is returned unchanged.

    Args:
        limit_mm_per_prompt: When ``None``, ``{"image": 5, "video": 0}`` is
            used.
        bucket_config: When ``None``, two buckets ``(32, 32, 1)`` and
            ``(52, 64, 1)`` with weight 0.5 each are used.
    """
    # Fresh dicts are built on every call, so no default is shared between
    # calls.
    if limit_mm_per_prompt is None:
        limit_mm_per_prompt = {"image": 5, "video": 0}
    if bucket_config is None:
        bucket_config = {(32, 32, 1): 0.5, (52, 64, 1): 0.5}
    return dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
        base_items_per_request=base_items_per_request,
        num_mm_items_range_ratio=num_mm_items_range_ratio,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        enable_multimodal_chat=enable_multimodal_chat,
    )
@pytest.mark.benchmark
def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Two multimodal datasets with seed 42 must yield equal fingerprints."""
    seed = 42
    first_dataset = RandomMultiModalDataset(random_seed=seed)
    second_dataset = RandomMultiModalDataset(random_seed=seed)
    first_samples = _collect_mm_samples(first_dataset, hf_tokenizer)
    second_samples = _collect_mm_samples(second_dataset, hf_tokenizer)
    first_prints = list(map(_mm_fingerprint_sample, first_samples))
    second_prints = list(map(_mm_fingerprint_sample, second_samples))
    assert first_prints == second_prints
@pytest.mark.benchmark
def test_random_mm_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Multimodal datasets seeded 0 and 999 must yield different prints."""
    first_dataset = RandomMultiModalDataset(random_seed=0)
    second_dataset = RandomMultiModalDataset(random_seed=999)
    first_samples = _collect_mm_samples(first_dataset, hf_tokenizer)
    second_samples = _collect_mm_samples(second_dataset, hf_tokenizer)
    first_prints = list(map(_mm_fingerprint_sample, first_samples))
    second_prints = list(map(_mm_fingerprint_sample, second_samples))
    assert first_prints != second_prints
@pytest.mark.benchmark
def test_random_mm_respects_limits(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Asking for more items than the per-prompt limit raises ValueError."""
    dataset = RandomMultiModalDataset(random_seed=0)
    # Three items are requested per prompt while only one image is allowed.
    # Per the current design the dataset errors out rather than silently
    # clamping below the requested baseline.
    over_limit_request = {
        "num_requests": 12,
        "base_items_per_request": 3,
        "num_mm_items_range_ratio": 0.0,
        "limit_mm_per_prompt": {"image": 1, "video": 0},
        "bucket_config": {(32, 32, 1): 1.0},
    }
    with pytest.raises(ValueError):
        _collect_mm_samples(dataset, hf_tokenizer, **over_limit_request)
@pytest.mark.benchmark
def test_random_mm_zero_prob_entries_are_removed(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sampling works with a zero-weight bucket and yields only images."""
    dataset = RandomMultiModalDataset(random_seed=0)
    # The second bucket carries zero probability and should be ignored after
    # normalization.
    buckets = {(32, 32, 1): 1.0, (52, 64, 1): 0.0}
    requests = _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=6,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 10, "video": 0},
        bucket_config=buckets,
    )
    for request in requests:
        assert isinstance(request.multi_modal_data, list)
        mm_items = cast(list[dict[str, Any]], request.multi_modal_data)
        for item in mm_items:
            assert item.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """With zero base items per request, every sample has an empty list."""
    dataset = RandomMultiModalDataset(random_seed=0)
    requests = _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=0,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 5, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    for request in requests:
        assert request.multi_modal_data == []
@pytest.mark.benchmark
def test_random_mm_num_items_per_prompt(
        hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """A fixed item count of 3 gives exactly 3 images in each of 5 samples."""
    dataset = RandomMultiModalDataset(random_seed=0)
    # num_mm_items_range_ratio=0.0 fixes the number of images per prompt.
    # TODO: modify video values when video sampling is implemented
    fixed_requests = _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=3,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 3, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Five requests are expected, each carrying three image items.
    assert len(fixed_requests) == 5
    for request in fixed_requests:
        mm_items = cast(list[dict[str, Any]], request.multi_modal_data)
        assert len(mm_items) == 3
        for item in mm_items:
            assert item.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_bucket_config_not_mutated(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sampling leaves the caller's bucket_config dict unchanged.

    The same dataset is then sampled again with a varying number of items
    per prompt, and each sample must hold between 1 and 4 image items.
    """
    dataset = RandomMultiModalDataset(random_seed=0)
    # The weights deliberately do not sum to 1, and there are more buckets
    # than requested images.
    bucket_weights = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
    # Copy taken before sampling, to compare against afterwards.
    weights_before = dict(bucket_weights)

    _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=4,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 1, "video": 0},
        bucket_config=bucket_weights,
    )

    # The dict passed in must still have its original content.
    assert bucket_weights == weights_before

    # num_mm_items_range_ratio=0.5 lets the item count vary per prompt.
    varying_requests = _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.5,
        limit_mm_per_prompt={"image": 4, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Five requests are expected, each with at most 4 and at least 1 item.
    assert len(varying_requests) == 5
    for request in varying_requests:
        mm_items = cast(list[dict[str, Any]], request.multi_modal_data)
        assert len(mm_items) <= 4
        assert len(mm_items) >= 1
        for item in mm_items:
            assert item.get("type") == "image_url"
...@@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag ...@@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import (ignore_torch_compile, from vllm.compilation.decorators import (ignore_torch_compile,
support_torch_compile) support_torch_compile)
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
set_current_vllm_config) VllmConfig, set_current_vllm_config)
from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.forward_context import set_forward_context
from vllm.utils import direct_register_custom_op from vllm.utils import direct_register_custom_op
# create a library to hold the custom op # create a library to hold the custom op
...@@ -164,104 +163,34 @@ class SimpleModelWithTwoGraphs(ParentModel): ...@@ -164,104 +163,34 @@ class SimpleModelWithTwoGraphs(ParentModel):
return x return x
def test_ignore_torch_compile_decorator():
assert VLLM_USE_V1
# piecewise
vllm_config = VllmConfig(compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
splitting_ops=["silly.attention"],
cudagraph_capture_sizes=[1, 2],
))
@support_torch_compile
class A(nn.Module):
def __init__(self,
*,
vllm_config: VllmConfig,
prefix: str = '',
**kwargs) -> None:
super().__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x + x
attn_output = torch.empty_like(x)
torch.ops.silly.attention(x, x, x, attn_output)
x = attn_output
x = x * 3
return x
@ignore_torch_compile
class B(A):
...
@support_torch_compile
class C(B):
...
with set_current_vllm_config(vllm_config):
mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
# A has support_torch_compile
with compilation_counter.expect(
num_graphs_seen=1,
num_piecewise_graphs_seen=3,
num_piecewise_capturable_graphs_seen=2,
num_backend_compilations=2,
num_cudagraph_captured=4,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
), set_forward_context({}, vllm_config=vllm_config):
# first run is for compile
mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
# run cudagraph captured sizes
mod_A(torch.randn(2, MLP_SIZE).cuda())
mod_A(torch.randn(1, MLP_SIZE).cuda())
with set_current_vllm_config(vllm_config):
mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
# B's ignore_torch_compile should override A's support_torch_compile
with compilation_counter.expect(
num_graphs_seen=0,
num_piecewise_graphs_seen=0,
num_piecewise_capturable_graphs_seen=0,
num_backend_compilations=0,
num_cudagraph_captured=0,
), set_forward_context({}, vllm_config=vllm_config):
mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
mod_B(torch.randn(2, MLP_SIZE).cuda())
mod_B(torch.randn(1, MLP_SIZE).cuda())
with set_current_vllm_config(vllm_config):
mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
# C's support_torch_compile should override B's ignore_torch_compile
with compilation_counter.expect(
num_graphs_seen=1,
num_piecewise_graphs_seen=3,
num_piecewise_capturable_graphs_seen=2,
num_backend_compilations=2,
num_cudagraph_captured=4,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
), set_forward_context({}, vllm_config=vllm_config):
mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
mod_C(torch.randn(2, MLP_SIZE).cuda())
mod_C(torch.randn(1, MLP_SIZE).cuda())
@torch.inference_mode @torch.inference_mode
def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor): def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor,
cudagraph_runtime_mode: CUDAGraphMode):
with set_forward_context({}, vllm_config=vllm_config): with set_forward_context({}, vllm_config=vllm_config):
# First run is for compile # warmup for the model with cudagraph_mode NONE
model(inputs) model(inputs)
# Run CUDAGraph captured sizes # simulate cudagraphs capturing
model(inputs[:2]) with set_forward_context({},
model(inputs[:1]) vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
output = model(inputs[:2]) batch_descriptor=BatchDescriptor(
num_tokens=2, )):
model(inputs[:2])
with set_forward_context({},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=1, )):
model(inputs[:1])
# simulate cudagraphs replay
with set_forward_context({},
vllm_config=vllm_config,
cudagraph_runtime_mode=cudagraph_runtime_mode,
batch_descriptor=BatchDescriptor(
num_tokens=2, )):
output = model(inputs[:2])
output = output.cpu() output = output.cpu()
return output.cpu() return output.cpu()
...@@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): ...@@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
splitting_ops=["silly.attention"], splitting_ops=["silly.attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
)) ))
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
...@@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal(): ...@@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal():
num_cudagraph_captured=8, num_cudagraph_captured=8,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
): ):
outputs.append(run_model(vllm_config, model, inputs)) outputs.append(
run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# no compile or cudagraph # no compile or cudagraph
vllm_config = VllmConfig(compilation_config=CompilationConfig( vllm_config = VllmConfig(compilation_config=CompilationConfig(
level=CompilationLevel.NO_COMPILATION, )) level=CompilationLevel.NO_COMPILATION, ))
cudagraph_runtime_mode = CUDAGraphMode.NONE
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
...@@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): ...@@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal():
num_backend_compilations=0, num_backend_compilations=0,
num_cudagraph_captured=0, num_cudagraph_captured=0,
): ):
outputs.append(run_model(vllm_config, model, inputs)) outputs.append(
run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# piecewise compile without CUDA graph # piecewise compile without CUDA graph
vllm_config = VllmConfig(compilation_config=CompilationConfig( vllm_config = VllmConfig(compilation_config=CompilationConfig(
...@@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): ...@@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
use_cudagraph=False, use_cudagraph=False,
splitting_ops=["silly.attention"], splitting_ops=["silly.attention"],
)) ))
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
...@@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): ...@@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal():
num_backend_compilations=4, num_backend_compilations=4,
num_cudagraph_captured=0, # no cudagraph captured num_cudagraph_captured=0, # no cudagraph captured
): ):
outputs.append(run_model(vllm_config, model, inputs)) outputs.append(
run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# Generally don't expect outputs with and without inductor # Generally don't expect outputs with and without inductor
# to be bitwise equivalent # to be bitwise equivalent
......
...@@ -30,15 +30,15 @@ class TestSetting: ...@@ -30,15 +30,15 @@ class TestSetting:
"test_setting", "test_setting",
[ [
# basic llama model # basic llama model
# TestSetting( TestSetting(
# model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
# model_args=["--max-model-len", "2048"], model_args=["--max-model-len", "2048"],
# pp_size=2, pp_size=2,
# tp_size=2, tp_size=2,
# attn_backend="FLASHINFER", attn_backend="FLASH_ATTN",
# method="generate", method="generate",
# fullgraph=True, fullgraph=True,
# ), ),
# llama model with quantization # llama model with quantization
TestSetting( TestSetting(
model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"), model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import (ignore_torch_compile,
support_torch_compile)
from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
CUDAGraphMode, VllmConfig, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import direct_register_custom_op
# create a library to hold the custom op
# (the op named "attention" is registered into this library further down)
silly_lib = Library("silly", "FRAGMENT") # noqa
# Number of rows in the warmup input that run_model feeds to the model.
BATCH_SIZE = 32
# Size of the second dimension of every input tensor run_model creates.
MLP_SIZE = 128
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """Store the element-wise sum ``q + k + v`` in ``out``, in place.

    ``out`` is first overwritten with ``q`` and then accumulates ``k`` and
    ``v``; nothing is returned and the inputs are left untouched.
    """
    out.copy_(q)
    for addend in (k, v):
        # In-place accumulation, same as ``out += addend``.
        out.add_(addend)
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """No-op stand-in for ``silly_attention``.

    Accepts the same arguments but neither reads nor writes any of them.
    """
    return None
# Register silly_attention as the op named "attention" in silly_lib,
# declaring that it mutates its ``out`` argument and supplying
# silly_attention_fake as the fake implementation.
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
@torch.inference_mode
def run_model(vllm_config: VllmConfig, model: nn.Module,
              cudagraph_runtime_mode: CUDAGraphMode):
    """Warm up ``model``, simulate cudagraph capture and replay, return output.

    The model is called on fresh random CUDA inputs of width ``MLP_SIZE``:
    once with ``BATCH_SIZE`` rows under a plain forward context (warmup),
    then with 2 rows and 1 row under a context carrying
    ``cudagraph_runtime_mode`` and a matching ``BatchDescriptor`` (capture),
    and finally with 2 rows again (replay).

    Args:
        vllm_config: Config handed to every ``set_forward_context`` call.
        model: Module under test; called with a single 2-D tensor.
        cudagraph_runtime_mode: Runtime mode used for the capture and
            replay passes.

    Returns:
        The output of the final (replay) pass, moved to the CPU.
    """

    def forward_with_descriptor(num_tokens: int) -> torch.Tensor:
        # One pass over ``num_tokens`` random rows, with the forward context
        # advertising that same token count through the batch descriptor.
        with set_forward_context({},
                                 vllm_config=vllm_config,
                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
                                 batch_descriptor=BatchDescriptor(
                                     num_tokens=num_tokens, )):
            return model(torch.randn(num_tokens, MLP_SIZE).cuda())

    with set_forward_context({}, vllm_config=vllm_config):
        # warmup for the model with cudagraph_mode NONE
        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())

        # simulate cudagraphs capturing
        for num_tokens in (2, 1):
            forward_with_descriptor(num_tokens)

        # simulate cudagraphs replay
        output = forward_with_descriptor(2)

    # A single host transfer is enough; the original called .cpu() twice.
    return output.cpu()
def test_ignore_torch_compile_decorator():
    """Check how ``ignore_torch_compile`` and ``support_torch_compile`` stack.

    ``A`` is decorated with ``support_torch_compile``, ``B(A)`` with
    ``ignore_torch_compile`` and ``C(B)`` with ``support_torch_compile``
    again. Each is run through ``run_model`` while the compilation counters
    are checked: A and C must be compiled, B must not.
    """
    # piecewise
    compilation_config = CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    )
    vllm_config = VllmConfig(compilation_config=compilation_config)
    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

    @support_torch_compile
    class A(nn.Module):

        def __init__(self,
                     *,
                     vllm_config: VllmConfig,
                     prefix: str = '',
                     **kwargs) -> None:
            super().__init__()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            doubled = x + x
            attn_output = torch.empty_like(doubled)
            torch.ops.silly.attention(doubled, doubled, doubled, attn_output)
            return attn_output * 3

    @ignore_torch_compile
    class B(A):
        pass

    @support_torch_compile
    class C(B):
        pass

    # Counter deltas expected when a module is compiled ...
    compiled_counts = dict(
        num_graphs_seen=1,
        num_piecewise_graphs_seen=3,
        num_piecewise_capturable_graphs_seen=2,
        num_backend_compilations=2,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
        num_cudagraph_captured=4,
    )
    # ... and when compilation is skipped entirely (every counter stays put).
    skipped_counts = dict.fromkeys(compiled_counts, 0)

    # A has support_torch_compile
    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
    with compilation_counter.expect(**compiled_counts):
        run_model(vllm_config, mod_A, cudagraph_runtime_mode)

    # B's ignore_torch_compile should override A's support_torch_compile
    with set_current_vllm_config(vllm_config):
        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
    with compilation_counter.expect(**skipped_counts):
        run_model(vllm_config, mod_B, cudagraph_runtime_mode)

    # C's support_torch_compile should override B's ignore_torch_compile
    with set_current_vllm_config(vllm_config):
        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
    with compilation_counter.expect(**compiled_counts):
        run_model(vllm_config, mod_C, cudagraph_runtime_mode)
# torch.compile is enabled for this module only when
# vllm_config.cache_config.kv_sharing_fast_prefill is True
@support_torch_compile(enable_if=lambda vllm_config: (
    vllm_config.cache_config.kv_sharing_fast_prefill))
class B(nn.Module):
    """Parameter-free module: double, ``silly.attention``, double again."""

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        doubled = x + x
        attn_output = torch.empty_like(doubled)
        torch.ops.silly.attention(doubled, doubled, doubled, attn_output)
        return attn_output + attn_output
# torch.compile is enabled for this module only when
# vllm_config.cache_config.kv_sharing_fast_prefill is False
@support_torch_compile(enable_if=lambda vllm_config: not (
    vllm_config.cache_config.kv_sharing_fast_prefill))
class A(nn.Module):
    """Two ``B`` sub-modules with a ``silly.attention`` call in between."""

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        super().__init__()
        # Both children receive exactly the arguments this module was given.
        self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs)
        self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.mod1(x)
        attn_output = torch.empty_like(hidden)
        torch.ops.silly.attention(hidden, hidden, hidden, attn_output)
        return self.mod2(attn_output)
def test_conditional_compile_enable_if():
    """Check that ``enable_if`` decides which of ``A`` / ``B`` gets compiled.

    ``A`` is built and run twice, once with ``kv_sharing_fast_prefill=True``
    and once with ``False``, and the compilation counters are compared with
    the values expected for each case.
    """

    def build_config(kv_sharing_fast_prefill: bool) -> VllmConfig:
        # Identical piecewise/cudagraph settings; only the cache flag varies.
        return VllmConfig(
            cache_config=CacheConfig(
                kv_sharing_fast_prefill=kv_sharing_fast_prefill, ),
            compilation_config=CompilationConfig(
                level=CompilationLevel.PIECEWISE,
                use_cudagraph=True,
                splitting_ops=["silly.attention"],
                cudagraph_capture_sizes=[1, 2],
            ))

    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

    vllm_config = build_config(True)
    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()

    # A has support_torch_compile but its enable_if fn returns False.
    # enable_if will be True for B, so we expect mod1 and mod2
    # to be compiled
    expected_b_compiled = dict(
        num_graphs_seen=2,
        # 3 piecewise graphs per instance of B()
        num_piecewise_graphs_seen=6,
        num_piecewise_capturable_graphs_seen=4,
        num_backend_compilations=4,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
        num_cudagraph_captured=8,
    )
    with compilation_counter.expect(**expected_b_compiled):
        run_model(vllm_config, mod_A, cudagraph_runtime_mode)

    # Set kv_sharing_fast_prefill=False
    # which will cause A to be compiled and B to not be compiled
    vllm_config = build_config(False)
    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()

    expected_a_compiled = dict(
        num_graphs_seen=1,
        # 3 attn ops and 4 non-attn ops
        num_piecewise_graphs_seen=7,
        num_piecewise_capturable_graphs_seen=4,
        num_backend_compilations=4,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
        num_cudagraph_captured=8,
    )
    with compilation_counter.expect(**expected_a_compiled):
        run_model(vllm_config, mod_A, cudagraph_runtime_mode)
...@@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): ...@@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
"quantization": "gptq_marlin_24" "quantization": "gptq_marlin_24"
})) }))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"): if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ" "quantization": "AWQ"
......
...@@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): ...@@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("seq_len", [8])
@pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
reason="Only test on CUDA") reason="Only test on CUDA")
@pytest.mark.skipif( @pytest.mark.skipif(
......
...@@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module): ...@@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module):
# Initialize weights # Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02) torch.nn.init.normal_(self.gate_proj, std=0.02)
self.fp8_linear = Fp8LinearOp(cutlass_fp8_supported=True, self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False)
use_per_token_if_dynamic=False)
self.scale = torch.rand(1, dtype=torch.float32) self.scale = torch.rand(1, dtype=torch.float32)
# Create a weight that is compatible with torch._scaled_mm, # Create a weight that is compatible with torch._scaled_mm,
......
...@@ -8,11 +8,12 @@ import vllm.envs as envs ...@@ -8,11 +8,12 @@ import vllm.envs as envs
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey, from vllm.compilation.fusion import FUSED_OPS, FusionPass
kFp8DynamicTokenSym, kFp8StaticTensorSym)
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.config import CompilationConfig, PassConfig, VllmConfig
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)
from .backend import TestBackend from .backend import TestBackend
......
...@@ -7,13 +7,15 @@ import torch ...@@ -7,13 +7,15 @@ import torch
import vllm.envs as envs import vllm.envs as envs
import vllm.plugins import vllm.plugins
from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
FusionPass, GroupShape, QuantKey) FusionPass)
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import (CompilationConfig, CompilationLevel, PassConfig, from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
VllmConfig) VllmConfig)
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, QuantKey, ScaleDesc)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity) Fp8LinearOp, maybe_create_device_identity)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .backend import TestBackend from .backend import TestBackend
...@@ -24,16 +26,14 @@ FP8_DTYPE = current_platform.fp8_dtype() ...@@ -24,16 +26,14 @@ FP8_DTYPE = current_platform.fp8_dtype()
class TestModel(torch.nn.Module): class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, static: bool, def __init__(self, hidden_size: int, eps: float, static: bool,
cutlass_fp8_enabled: bool, *args, **kwargs): force_fp8_e4m3fnuz: bool, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.cutlass_fp8_enabled = cutlass_fp8_enabled self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
self.key = QuantKey(dtype=FP8_DTYPE, quant_scale = ScaleDesc(torch.float32, static, group_shape)
static=static, self.key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
group_shape=group_shape,
symmetric=True)
if static: if static:
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
else: else:
...@@ -43,7 +43,7 @@ class TestModel(torch.nn.Module): ...@@ -43,7 +43,7 @@ class TestModel(torch.nn.Module):
for _ in range(2) for _ in range(2)
] ]
self.fp8_linear = Fp8LinearOp( self.fp8_linear = Fp8LinearOp(
cutlass_fp8_supported=cutlass_fp8_enabled, force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
act_quant_static=static, act_quant_static=static,
act_quant_group_shape=group_shape, act_quant_group_shape=group_shape,
) )
...@@ -81,12 +81,11 @@ class TestModel(torch.nn.Module): ...@@ -81,12 +81,11 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False]) @pytest.mark.parametrize("static", [True, False])
@pytest.mark.parametrize("cutlass_fp8_enabled", @pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
[True, False] if CUTLASS_FP8_SUPPORTED else [False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm") reason="Only test on CUDA and ROCm")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
cutlass_fp8_enabled): force_fp8_e4m3fnuz):
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
torch.manual_seed(1) torch.manual_seed(1)
...@@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, ...@@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
fusion_pass = FusionPass.instance(vllm_config) fusion_pass = FusionPass.instance(vllm_config)
backend = TestBackend(noop_pass, fusion_pass) backend = TestBackend(noop_pass, fusion_pass)
model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled) model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz)
# First dimension dynamic # First dimension dynamic
x = torch.rand(num_tokens, hidden_size) x = torch.rand(num_tokens, hidden_size)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -7,13 +8,29 @@ import torch._dynamo ...@@ -7,13 +8,29 @@ import torch._dynamo
from tests.compile.backend import TestBackend from tests.compile.backend import TestBackend
from tests.models.utils import check_outputs_equal from tests.models.utils import check_outputs_equal
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.compilation.fusion import QUANT_OPS, QuantKey, kFp8StaticTensorSym from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention import Attention
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
ModelConfig, PassConfig, SchedulerConfig, VllmConfig,
set_current_vllm_config)
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, kFp8StaticTensorSym, kNvfp4Quant)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.kv_cache_interface import AttentionSpec
FP8_DTYPE = current_platform.fp8_dtype()
FP4_DTYPE = torch.uint8
# globals needed for string-import custom Dynamo backend field # globals needed for string-import custom Dynamo backend field
backend: Optional[TestBackend] = None backend: Optional[TestBackend] = None
...@@ -90,9 +107,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, ...@@ -90,9 +107,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
# check support # check support
attn_fusion_supported = [ attn_fusion_supported = [
layer.impl.fused_output_quant_supported(quant_key.dtype, layer.impl.fused_output_quant_supported(quant_key)
quant_key.static,
quant_key.group_shape)
for key, layer in compile_config.static_forward_context.items() for key, layer in compile_config.static_forward_context.items()
] ]
...@@ -132,3 +147,309 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, ...@@ -132,3 +147,309 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
# Reset backend to make sure llm2 gets released # Reset backend to make sure llm2 gets released
backend = None backend = None
class AttentionQuantPatternModel(torch.nn.Module):
    """Shared base for the attention + quantization fusion test models.

    Holds one ``Attention`` layer plus the metadata builder of its backend,
    and knows how to create a dummy KV cache and attention metadata for a
    batch in which every sequence has length 1.
    """

    def __init__(self, num_qo_heads: int, num_kv_heads: int, head_size: int,
                 kv_cache_dtype: torch.dtype, device: torch.device,
                 vllm_config: VllmConfig, **kwargs):
        super().__init__()
        self.num_qo_heads = num_qo_heads
        self.num_kv_heads = num_kv_heads
        self.head_size = head_size
        self.kv_cache_dtype = kv_cache_dtype
        self.device = device
        self.vllm_config = vllm_config
        self.block_size = 16

        self.attn = Attention(
            num_heads=num_qo_heads,
            head_size=head_size,
            scale=1.0 / (head_size**0.5),
            num_kv_heads=num_kv_heads,
            cache_config=vllm_config.cache_config,
            prefix="model.layers.0.self_attn.attn",
        )

        # Initialize attn MetadataBuilder
        kv_cache_spec = AttentionSpec(
            block_size=self.block_size,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            dtype=kv_cache_dtype,
            use_mla=False,
        )
        builder_cls = self.attn.attn_backend.get_builder_cls()
        self.builder = builder_cls(
            kv_cache_spec=kv_cache_spec,
            layer_names=[self.attn.layer_name],
            vllm_config=vllm_config,
            device=device,
        )

    def build_attn_metadata(self, batch_size: int):
        """Create the dummy KV cache and build attention metadata.

        The result is stored on ``self.attn_metadata`` and also returned;
        the KV cache is attached to ``self.attn.kv_cache``.
        """
        # Create common attn metadata
        ones = [1] * batch_size
        batch_spec = BatchSpec(seq_lens=ones, query_lens=[1] * batch_size)
        common_attn_metadata = create_common_attn_metadata(
            batch_spec,
            self.block_size,
            self.device,
            arange_block_indices=True)

        # Blocks needed by the longest sequence (ceiling division).
        longest = max(batch_spec.seq_lens)
        max_blocks = (longest + self.block_size - 1) // self.block_size
        num_blocks = batch_size * max_blocks

        # Create dummy KV cache for FlashInfer TRTLLM
        #   - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
        #   - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
        # Allocate in HND layout and permute to NHD layout
        # (later will be permuted back to HND layout in forward pass)
        hnd_shape = (num_blocks, 2, self.num_kv_heads, self.block_size,
                     self.head_size)
        kv_cache = torch.zeros(*hnd_shape,
                               dtype=self.kv_cache_dtype,
                               device=self.device).permute(0, 1, 3, 2, 4)
        self.attn.kv_cache = [kv_cache]

        # Build attn metadata
        self.attn_metadata = self.builder.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata)
        return self.attn_metadata
class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
    """Attention followed by an FP8 linear op: the pattern the pass fuses."""

    quant_key = kFp8StaticTensorSym

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        scale_desc = self.quant_key.scale
        self.fp8_linear = Fp8LinearOp(
            act_quant_static=scale_desc.static,
            act_quant_group_shape=scale_desc.group_shape)

        hidden_size = self.num_qo_heads * self.head_size
        # The default weights are always materialised (as in the original
        # ``kwargs.get`` call) and used unless the caller passes ``w``.
        default_w = {
            "weight":
            torch.randn(hidden_size, hidden_size).to(dtype=FP8_DTYPE,
                                                     device=self.device).t(),
            "wscale":
            torch.tensor([1.0], dtype=torch.float32, device=self.device),
            "scale":
            torch.tensor([1.0], dtype=torch.float32, device=self.device),
        }
        self.w = kwargs.get("w", default_w)

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """Run attention, then feed its output through the FP8 linear op."""
        weights = self.w
        attn_output = self.attn(q, k, v)
        return self.fp8_linear.apply(input=attn_output,
                                     weight=weights["weight"],
                                     weight_scale=weights["wscale"],
                                     input_scale=weights["scale"])
class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
    """Attention followed by FP4 quant + FP4 matmul: the NVFP4 pattern."""

    quant_key = kNvfp4Quant

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        hidden_size = self.num_qo_heads * self.head_size
        # The default weights are always materialised (as in the original
        # ``kwargs.get`` call) and used unless the caller passes ``w``.
        default_w = {
            "weight":
            torch.randint(256, (hidden_size, hidden_size // 2),
                          dtype=FP4_DTYPE,
                          device=self.device),
            "wscale_swizzled":
            torch.randn(hidden_size, hidden_size // 16).to(
                dtype=FP8_DTYPE, device=self.device),
            "wscale":
            torch.tensor([500], dtype=torch.float32, device=self.device),
            "scale":
            torch.tensor([0.002], dtype=torch.float32, device=self.device),
        }
        self.w = kwargs.get("w", default_w)

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """Run attention, quantize its output to FP4, then do the matmul."""
        weights = self.w
        attn_output = self.attn(q, k, v)
        # The quant op is given the reciprocal of the stored scale.
        quant_output, output_block_scale = scaled_fp4_quant(
            attn_output, 1 / weights["scale"])
        return cutlass_scaled_fp4_mm(
            a=quant_output,
            b=weights["weight"],
            block_scale_a=output_block_scale,
            block_scale_b=weights["wscale_swizzled"],
            alpha=weights["scale"] * weights["wscale"],
            out_dtype=attn_output.dtype)
@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
@pytest.mark.parametrize("head_size", [128])
@pytest.mark.parametrize("batch_size", [7, 256, 533])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("model_name, model_class",
                         [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
                           TestAttentionFp8StaticQuantPatternModel),
                          ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
                           TestAttentionNvfp4QuantPatternModel)])
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)),
                    reason="Only test on SM100(Blackwell)")
def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                 head_size: int, batch_size: int,
                                 dtype: torch.dtype, model_name: str,
                                 model_class: type[AttentionQuantPatternModel],
                                 backend: _Backend, monkeypatch, dist_init):
    """Test the attention + output-quant fusion pass.

    The same model is run once eagerly (no fusion) and once compiled through
    a ``TestBackend`` with the no-op elimination and attention fusion passes.
    The graphs before and after the passes are inspected and the fused
    results are compared against the unfused one.
    """
    monkeypatch.setenv("VLLM_USE_V1", "1")

    device = torch.device("cuda:0")
    torch.manual_seed(42)

    vllm_config = VllmConfig(
        model_config=ModelConfig(
            model=model_name,
            max_model_len=2048,
        ),
        scheduler_config=SchedulerConfig(max_num_seqs=1024),
        compilation_config=CompilationConfig(
            level=CompilationLevel.PIECEWISE,
            custom_ops=["+quant_fp8"],
        ),
        cache_config=CacheConfig(cache_dtype="fp8"))

    # Create test inputs
    def random_input(num_heads: int) -> torch.Tensor:
        return torch.randn(batch_size,
                           num_heads * head_size,
                           dtype=dtype,
                           device=device)

    q = random_input(num_qo_heads)
    k = random_input(num_kv_heads)
    v = random_input(num_kv_heads)

    # Mark first dimension as dynamic for realistic testing
    for tensor in (q, k, v):
        torch._dynamo.mark_dynamic(tensor, 0)

    # Constructor arguments shared by the unfused and the fused model.
    model_kwargs = dict(num_qo_heads=num_qo_heads,
                        num_kv_heads=num_kv_heads,
                        head_size=head_size,
                        kv_cache_dtype=FP8_DTYPE,
                        device=device)

    # Run model directly without compilation and fusion
    vllm_config_unfused = copy.deepcopy(vllm_config)
    with set_current_vllm_config(vllm_config_unfused), set_forward_context(
            attn_metadata=None, vllm_config=vllm_config_unfused
    ), global_force_attn_backend_context_manager(backend):
        model_unfused = model_class(vllm_config=vllm_config_unfused,
                                    **model_kwargs)
        model_unfused = model_unfused.to(device)

        forward_ctx = get_forward_context()
        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
            batch_size)

        result_unfused = model_unfused(q, k, v)

    # Run model with attn fusion enabled
    vllm_config.compilation_config.pass_config = PassConfig(
        enable_attn_fusion=True, enable_noop=True)
    with set_current_vllm_config(vllm_config), set_forward_context(
            attn_metadata=None, vllm_config=vllm_config
    ), global_force_attn_backend_context_manager(backend):
        # Reuse the unfused model's weights so the outputs are comparable.
        model_fused = model_class(vllm_config=vllm_config,
                                  w=model_unfused.w,
                                  **model_kwargs)
        model_fused = model_fused.to(device)

        forward_ctx = get_forward_context()
        forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)

        # Create test backend with fusion passes enabled
        noop_pass = NoOpEliminationPass(vllm_config)
        attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args,
                                                                    **kw)
        test_backend = TestBackend(noop_pass, attn_pass)

        # Compile model with fusion enabled
        model_compiled = torch.compile(model_fused,
                                       backend=test_backend,
                                       fullgraph=True)
        assert model_compiled.attn._o_scale_float is None
        result_fused_1 = model_compiled(q, k, v)

        # After the 1st round of the forward pass, output quant scale should
        # be loaded into the attn layer's _o_scale_float, the 2nd round
        # should reuse the loaded _o_scale_float
        assert model_compiled.attn._o_scale_float is not None
        result_fused_2 = model_compiled(q, k, v)
        assert model_compiled.attn._o_scale_float is not None

    # Check attn fusion support
    quant_key = model_class.quant_key
    static_ctx = vllm_config.compilation_config.static_forward_context
    attn_fusion_supported = [
        layer.impl.fused_output_quant_supported(quant_key)
        for key, layer in static_ctx.items()
    ]
    if any(attn_fusion_supported):
        # Check quantization ops in the graph before and after fusion
        test_backend.check_before_ops([QUANT_OPS[quant_key]],
                                      fully_replaced=True)

    # Check attention ops in the graph before and after fusion
    attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))
    attn_nodes_post = list(
        find_op_nodes(ATTN_OP, test_backend.graph_post_pass))

    assert len(attn_nodes_pre) > 0, "Should have attention nodes before fusion"
    assert len(attn_nodes_pre) == len(attn_nodes_post), \
        "Should have same number of attention nodes before and after fusion"

    pre_kwargs = attn_nodes_pre[0].kwargs
    post_kwargs = attn_nodes_post[0].kwargs
    assert pre_kwargs.get("output_scale") is None, \
        "Attention should not have output_scale before fusion"
    assert post_kwargs.get("output_scale") is not None, \
        "Attention should have output_scale after fusion"

    assert pre_kwargs.get("output_block_scale") is None, \
        "Attention should not have output_block_scale before fusion"
    if quant_key.dtype == FP8_DTYPE:
        assert post_kwargs.get("output_block_scale") is None, \
            "Attention should not have output_block_scale after FP8 fusion"
    elif quant_key.dtype == FP4_DTYPE:
        assert post_kwargs.get("output_block_scale") is not None, \
            "Attention should have output_block_scale after FP4 fusion"  # noqa: E501

    # Check that results are close
    for result_fused in (result_fused_1, result_fused_2):
        torch.testing.assert_close(result_unfused,
                                   result_fused,
                                   atol=1e-2,
                                   rtol=1e-2)
...@@ -4,35 +4,44 @@ import pytest ...@@ -4,35 +4,44 @@ import pytest
import torch import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe # yapf conflicts with isort for this block
# yapf: disable
from vllm.compilation.activation_quant_fusion import (
FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass)
# yapf: enable
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.config import CompilationConfig, PassConfig, VllmConfig
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape) GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_FP8_SUPPORTED, Fp8LinearOp) Fp8LinearOp)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .backend import TestBackend from .backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
FP4_DTYPE = torch.uint8
class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args, def is_nvfp4_supported():
**kwargs): return current_platform.has_device_capability(100)
super().__init__(*args, **kwargs)
class TestSiluMulFp8QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs):
super().__init__()
self.silu_and_mul = SiluAndMul() self.silu_and_mul = SiluAndMul()
self.wscale = torch.rand(1, dtype=torch.float32) self.wscale = torch.rand(1, dtype=torch.float32)
self.scale = torch.rand(1, dtype=torch.float32) self.scale = torch.rand(1, dtype=torch.float32)
self.w = (torch.rand( self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
hidden_size,
hidden_size).to(dtype=current_platform.fp8_dtype()).t())
self.fp8_linear = Fp8LinearOp( self.fp8_linear = Fp8LinearOp(
cutlass_fp8_supported=cutlass_fp8_enabled, force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
act_quant_static=True, act_quant_static=True,
act_quant_group_shape=GroupShape.PER_TENSOR, act_quant_group_shape=GroupShape.PER_TENSOR,
) )
...@@ -45,15 +54,56 @@ class TestModel(torch.nn.Module): ...@@ -45,15 +54,56 @@ class TestModel(torch.nn.Module):
input_scale=self.wscale) input_scale=self.wscale)
return x2 return x2
def ops_in_model_before(self):
return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]]
def ops_in_model_after(self):
return [FUSED_OPS[kFp8StaticTensorSym]]
class TestSiluMulNvfp4QuantModel(torch.nn.Module):
    """Toy module: SiLU-and-mul, then FP4 quantization, then a scaled FP4 matmul.

    Used as one of the parametrized ``model_class`` values of the
    activation/quant fusion test.
    """

    def __init__(self, hidden_size: int, **kwargs):
        """Create the activation layer and random weights/scales.

        ``**kwargs`` is accepted and ignored so the test can construct
        every model class with the same keyword arguments.
        """
        super().__init__()
        self.silu_and_mul = SiluAndMul()

        # NOTE: the tensors below are created in a fixed order because each
        # call draws from the random number generator.
        weight_shape = (hidden_size, hidden_size // 2)
        self.w = torch.randint(256, weight_shape, dtype=FP4_DTYPE)

        raw_block_scales = torch.randn(hidden_size, hidden_size // 16)
        self.wscale = raw_block_scales.to(dtype=FP8_DTYPE)

        self.wscale2 = torch.rand(1, dtype=torch.float32)
        self.scale = torch.rand(1, dtype=torch.float32)

    def forward(self, x):
        """Apply SiLU-and-mul, quantize the result, and run the FP4 matmul."""
        activated = self.silu_and_mul(x)
        # The quant op receives the reciprocal of the stored scale.
        quantized, act_block_scale = scaled_fp4_quant(activated,
                                                      1 / self.scale)
        return cutlass_scaled_fp4_mm(a=quantized,
                                     b=self.w,
                                     block_scale_a=act_block_scale,
                                     block_scale_b=self.wscale,
                                     alpha=self.scale * self.wscale2,
                                     out_dtype=activated.dtype)

    def ops_in_model_before(self):
        """Ops passed to ``backend.check_before_ops``: SiLU-mul and NVFP4 quant."""
        quant_op = QUANT_OPS[kNvfp4Quant]
        return [SILU_MUL_OP, quant_op]

    def ops_in_model_after(self):
        """Ops passed to ``backend.check_after_ops``: the fused NVFP4 op."""
        fused_op = FUSED_OPS[kNvfp4Quant]
        return [fused_op]
@pytest.mark.parametrize("num_tokens", [64])
@pytest.mark.parametrize("hidden_size", [128])
@pytest.mark.parametrize(
"model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm") reason="Only test on CUDA and ROCm")
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
cutlass_fp8_enabled): force_fp8_e4m3fnuz):
if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz:
pytest.skip("Duplicate tests for NVFP4")
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.set_default_dtype(torch.float16) torch.set_default_dtype(torch.float16)
...@@ -64,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, ...@@ -64,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
fusion_pass = ActivationQuantFusionPass(config) fusion_pass = ActivationQuantFusionPass(config)
backend = TestBackend(NoOpEliminationPass(config), fusion_pass) backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
model = TestModel(hidden_size, cutlass_fp8_enabled) model = model_class(hidden_size=hidden_size,
force_fp8_e4m3fnuz=force_fp8_e4m3fnuz)
# First dimension dynamic # First dimension dynamic
x = torch.rand(num_tokens, hidden_size * 2) x = torch.rand(num_tokens, hidden_size * 2)
...@@ -81,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, ...@@ -81,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
atol=1e-3, atol=1e-3,
rtol=1e-3) rtol=1e-3)
# Check substitution worked # In pre-nodes, quant op should be present and fused kernels should not
pre_nodes = backend.graph_pre_pass.nodes backend.check_before_ops(model.ops_in_model_before())
post_nodes = backend.graph_post_pass.nodes
silu_and_mul_quant = torch.ops._C.silu_and_mul_quant.default
fp8_quant = torch.ops._C.static_scaled_fp8_quant.default
# In pre-nodes, fp8 quant should be present and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, silu_and_mul_quant) is None
find_auto_fn(pre_nodes, fp8_quant)
# In post-nodes, fused kernels should be present and fp8 quant should not # In post-nodes, fused kernels should be present and quant op should not
find_auto_fn(post_nodes, silu_and_mul_quant) backend.check_after_ops(model.ops_in_model_after())
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
import math
import os import os
import tempfile import tempfile
from enum import Enum from enum import Enum
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
import pytest import pytest
import pytest_html import pytest_html
...@@ -37,7 +37,7 @@ from vllm.logger import init_logger ...@@ -37,7 +37,7 @@ from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.sequence import Logprob
from vllm.transformers_utils.utils import maybe_model_redirect from vllm.transformers_utils.utils import maybe_model_redirect
from .utils import models_path_prefix from .utils import models_path_prefix
...@@ -460,9 +460,16 @@ class HfRunner: ...@@ -460,9 +460,16 @@ class HfRunner:
# output is final logits # output is final logits
all_inputs = self.get_inputs(prompts) all_inputs = self.get_inputs(prompts)
outputs = [] outputs = []
problem_type = getattr(self.config, "problem_type", "")
for inputs in all_inputs: for inputs in all_inputs:
output = self.model(**self.wrap_device(inputs)) output = self.model(**self.wrap_device(inputs))
logits = output.logits.softmax(dim=-1)[0].tolist() if problem_type == "regression":
logits = output.logits[0].tolist()
elif problem_type == "multi_label_classification":
logits = output.logits.sigmoid()[0].tolist()
else:
logits = output.logits.softmax(dim=-1)[0].tolist()
outputs.append(logits) outputs.append(logits)
return outputs return outputs
...@@ -600,7 +607,7 @@ class HfRunner: ...@@ -600,7 +607,7 @@ class HfRunner:
def _hidden_states_to_logprobs( def _hidden_states_to_logprobs(
self, self,
hidden_states: tuple[tuple[torch.Tensor, ...], ...], hidden_states: tuple[tuple[torch.Tensor, ...], ...],
num_logprobs: int, num_logprobs: Optional[int],
) -> tuple[list[dict[int, float]], int]: ) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states) output_len = len(hidden_states)
...@@ -628,7 +635,7 @@ class HfRunner: ...@@ -628,7 +635,7 @@ class HfRunner:
self, self,
prompts: list[str], prompts: list[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
...@@ -675,7 +682,7 @@ class HfRunner: ...@@ -675,7 +682,7 @@ class HfRunner:
self, self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
**kwargs: Any, **kwargs: Any,
) -> list[TokensTextLogprobs]: ) -> list[TokensTextLogprobs]:
...@@ -964,7 +971,7 @@ class VllmRunner: ...@@ -964,7 +971,7 @@ class VllmRunner:
self, self,
prompts: list[str], prompts: list[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None, num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
...@@ -989,11 +996,40 @@ class VllmRunner: ...@@ -989,11 +996,40 @@ class VllmRunner:
videos=videos, videos=videos,
**kwargs) **kwargs)
def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]:
    """
    Return the perplexity score associated with generating the prompts.

    Calls ``generate_greedy_logprobs`` with ``max_tokens=1``,
    ``num_logprobs=None`` and ``num_prompt_logprobs=0``, then converts the
    logprobs of every prompt position after the first into
    ``exp(-mean(logprob))``.

    :param prompts: list of prompts to score
    :return: perplexity score of each prompt, in the order the outputs
        are returned by ``generate_greedy_logprobs``
    """
    results = self.generate_greedy_logprobs(prompts,
                                            max_tokens=1,
                                            num_logprobs=None,
                                            num_prompt_logprobs=0)

    scores = []
    for result in results:
        typed_result = cast(TokensTextLogprobsPromptLogprobs, result)
        # NOTE(review): element 3 is treated as the per-position prompt
        # logprobs (inferred from the cast target) - confirm against
        # generate_greedy_logprobs.
        per_position = cast(list[Optional[dict[int, Logprob]]],
                            typed_result[3])

        # The leading position is required to carry no logprob entry.
        assert per_position[0] is None

        logprobs = []
        for entry in per_position[1:]:
            assert entry is not None
            # Exactly one candidate per position is expected.
            assert len(entry) == 1
            (only_candidate, ) = entry.values()
            logprobs.append(only_candidate.logprob)

        mean_negative_logprob = -sum(logprobs) / len(logprobs)
        scores.append(math.exp(mean_negative_logprob))

    return scores
def generate_encoder_decoder_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs(
self, self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None, num_prompt_logprobs: Optional[int] = None,
skip_special_tokens: bool = True, skip_special_tokens: bool = True,
) -> Union[list[TokensTextLogprobs], ) -> Union[list[TokensTextLogprobs],
...@@ -1020,15 +1056,17 @@ class VllmRunner: ...@@ -1020,15 +1056,17 @@ class VllmRunner:
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
concurrency_limit: Optional[int] = None,
) -> list[tuple[list[list[int]], list[str]]]: ) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
audios=audios) audios=audios)
outputs = self.llm.beam_search( outputs = self.llm.beam_search(inputs,
inputs, BeamSearchParams(beam_width=beam_width,
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) max_tokens=max_tokens),
concurrency_limit=concurrency_limit)
returned_outputs = [] returned_outputs = []
for output in outputs: for output in outputs:
token_ids = [x.tokens for x in output.sequences] token_ids = [x.tokens for x in output.sequences]
...@@ -1086,6 +1124,9 @@ class VllmRunner: ...@@ -1086,6 +1124,9 @@ class VllmRunner:
return self.llm.llm_engine.collective_rpc(_apply_model) return self.llm.llm_engine.collective_rpc(_apply_model)
def get_llm(self) -> LLM:
    """Return the object stored in this runner's ``llm`` attribute."""
    llm = self.llm
    return llm
def __enter__(self): def __enter__(self):
return self return self
......
...@@ -34,7 +34,7 @@ BLOCK_SIZE = 16 ...@@ -34,7 +34,7 @@ BLOCK_SIZE = 16
@pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
batch_size, seed, backend, monkeypatch): batch_size, seed, backend, monkeypatch):
""" """
...@@ -45,8 +45,6 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, ...@@ -45,8 +45,6 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
Additionally, we compare the results of the v1 and v2 managers. Additionally, we compare the results of the v1 and v2 managers.
""" """
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
...@@ -98,7 +96,7 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, ...@@ -98,7 +96,7 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
@pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
backend, monkeypatch): backend, monkeypatch):
""" """
...@@ -109,8 +107,6 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, ...@@ -109,8 +107,6 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
The results with and without chunked prefill are not the same due to The results with and without chunked prefill are not the same due to
numerical instabilities. numerical instabilities.
""" """
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) override_backend_env_variable(monkeypatch, backend)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment