Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
...@@ -26,6 +26,11 @@ TEST_IMAGE_URLS = [ ...@@ -26,6 +26,11 @@ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
] ]
TEST_VIDEO_URLS = [
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
"https://filesamples.com/samples/video/avi/sample_640x360.avi",
]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def url_images() -> dict[str, Image.Image]: def url_images() -> dict[str, Image.Image]:
...@@ -134,6 +139,18 @@ async def test_fetch_image_local_files(image_url: str): ...@@ -134,6 +139,18 @@ async def test_fetch_image_local_files(image_url: str):
f"file://{temp_dir}/../{os.path.basename(image_url)}") f"file://{temp_dir}/../{os.path.basename(image_url)}")
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
connector = MediaConnector()
video_sync = connector.fetch_video(video_url, num_frames=num_frames)
video_async = await connector.fetch_video_async(video_url,
num_frames=num_frames)
assert np.array_equal(video_sync, video_async)
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class TestCase(NamedTuple): class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict" mm_positions: "MultiModalPlaceholderDict"
......
# SPDX-License-Identifier: Apache-2.0
import numpy as np
import numpy.typing as npt
import pytest
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
NUM_FRAMES = 10
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@VIDEO_LOADER_REGISTRY.register("test_video_loader_1")
class TestVideoLoader1(VideoLoader):
@classmethod
def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
return FAKE_OUTPUT_1
@VIDEO_LOADER_REGISTRY.register("test_video_loader_2")
class TestVideoLoader2(VideoLoader):
@classmethod
def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
return FAKE_OUTPUT_2
def test_video_loader_registry():
custom_loader_1 = VIDEO_LOADER_REGISTRY.load("test_video_loader_1")
output_1 = custom_loader_1.load_bytes(b"test")
np.testing.assert_array_equal(output_1, FAKE_OUTPUT_1)
custom_loader_2 = VIDEO_LOADER_REGISTRY.load("test_video_loader_2")
output_2 = custom_loader_2.load_bytes(b"test")
np.testing.assert_array_equal(output_2, FAKE_OUTPUT_2)
def test_video_loader_type_doesnt_exist():
with pytest.raises(AssertionError):
VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
# SPDX-License-Identifier: Apache-2.0
import os
from unittest.mock import MagicMock
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.platforms.neuron import NeuronFramework
from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceData, SequenceGroupMetadata
from vllm.worker.neuron_model_runner import NeuronModelRunner
os.environ[
'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
def _create_neuron_model_runner(model: str, *args,
**kwargs) -> NeuronModelRunner:
engine_args = EngineArgs(model, *args, **kwargs)
engine_config = engine_args.create_engine_config()
vllm_config = VllmConfig(
model_config=engine_config.model_config,
parallel_config=engine_config.parallel_config,
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
)
neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config)
return neuron_model_runner
def test_update_neuron_sampling_params_not_full_batch():
os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
model_runner = _create_neuron_model_runner(
"facebook/opt-125m",
seed=0,
dtype="float16",
max_num_seqs=2,
)
assert not model_runner._on_device_sampling_disabled
# Test sampling param updating only when TNx is framework
# NxDI handles sampling parameter updating inside model
if current_platform.use_transformers_neuronx():
model_mock = MagicMock()
model_runner.model = model_mock
seq_group_metadata_list = [
SequenceGroupMetadata(
request_id="test_0",
is_prompt=True,
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
sampling_params=SamplingParams(temperature=0.5,
top_k=1,
top_p=0.5),
block_tables={0: [1]},
)
]
model_runner.prepare_model_input(seq_group_metadata_list)
# Index neuron sampling parameters based on block_tables indices.
# The first block_id of the sequence 0 is 1, so its parameters are
# placed at index 1. So the sampling parameters will be:
# Index 0: default sampling parameters
# Index 1: sequecne 0's sampling parameters.
neuron_sampling_params = (
model_runner.model_config.neuron_sampling_params)
assert neuron_sampling_params.temperature == [1.0, 0.5]
assert neuron_sampling_params.top_k == [
model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
]
assert neuron_sampling_params.top_p == [1.0, 0.5]
model_mock.model.update_generation_config.assert_called_once_with(
neuron_sampling_params)
def test_update_neuron_sampling_params_full_batch():
os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
model_runner = _create_neuron_model_runner(
"facebook/opt-125m",
seed=0,
dtype="float16",
max_num_seqs=2,
)
assert not model_runner._on_device_sampling_disabled
# Test sampling param updating only when TNx is framework
# NxDI handles sampling parameter updating inside model
if current_platform.use_transformers_neuronx():
model_mock = MagicMock()
model_runner.model = model_mock
seq_group_metadata_list = [
SequenceGroupMetadata(
request_id="test_0",
is_prompt=True,
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
sampling_params=SamplingParams(temperature=0.5,
top_k=1,
top_p=0.5),
block_tables={0: [1]},
),
SequenceGroupMetadata(
request_id="test_0",
is_prompt=True,
seq_data={1: SequenceData.from_seqs([4, 5, 6])},
sampling_params=SamplingParams(temperature=0.2,
top_k=2,
top_p=0.2),
block_tables={1: [0]},
)
]
model_runner.prepare_model_input(seq_group_metadata_list)
# Index neuron sampling parameters based on block_tables indices.
# The first block_id of the sequence 0 is 1, so its parameters are
# placed at index 1. So the sampling parameters will be:
# Index 0: sequence 1's sampling parameters
# Index 1: sequecne 0's sampling parameters.
neuron_sampling_params = (
model_runner.model_config.neuron_sampling_params)
assert neuron_sampling_params.temperature == [0.2, 0.5]
assert neuron_sampling_params.top_k == [2, 1]
assert neuron_sampling_params.top_p == [0.2, 0.5]
model_mock.model.update_generation_config.assert_called_once_with(
neuron_sampling_params)
...@@ -11,14 +11,16 @@ from vllm.platforms import current_platform ...@@ -11,14 +11,16 @@ from vllm.platforms import current_platform
@pytest.mark.parametrize( @pytest.mark.parametrize(
"max_position,is_neox_style,rotary_dim,head_size,seq_len", [ "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
(16, False, 32, 32, 1024), (16, False, 32, 32, 1024, True),
(16, False, 32, 128, 1024), (16, False, 32, 128, 1024, True),
(16, True, 32, 32, 1024), (16, True, 32, 32, 1024, True),
(16, True, 32, 128, 1024), (16, True, 32, 128, 1024, True),
(16, False, 32, 128, 1024, False),
(16, True, 32, 128, 1024, False),
]) ])
def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim, def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
head_size, seq_len): head_size, seq_len, use_key):
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
device = xm.xla_device() device = xm.xla_device()
...@@ -40,19 +42,26 @@ def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim, ...@@ -40,19 +42,26 @@ def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
num_heads * head_size, num_heads * head_size,
dtype=torch.float32, dtype=torch.float32,
device="cpu") device="cpu")
key = torch.randn_like(query) key = torch.randn_like(query) if use_key else None
assert positions.is_cpu, \ assert positions.is_cpu, \
"reference input tensor is expected to be CPU tensor." "reference input tensor is expected to be CPU tensor."
ref_query, ref_key = rot.to(device="cpu").forward_native( ref_query, ref_key = rot.to(device="cpu").forward_native(
positions, query, key) positions, query, key)
out_query, out_key = rot.to(device=device).forward_neuron( out_query, out_key = rot.to(device=device).forward_neuron(
positions.to(device=device), query.to(device=device), positions.to(device=device), query.to(device=device),
key.to(device=device)) key.to(device=device) if key is not None else None)
assert out_query.is_xla and out_key.is_xla, \ if use_key:
"output tensor is expected to be XLA tensor" assert out_query.is_xla and out_key.is_xla, \
"output tensor is expected to be XLA tensor"
torch.testing.assert_close(out_key.cpu(),
ref_key,
atol=1e-2,
rtol=1e-2)
else:
assert out_key is None, "expected returned key to be None"
assert out_query.is_xla, \
"output tensor is expected to be XLA tensor"
torch.testing.assert_close(out_query.cpu(), torch.testing.assert_close(out_query.cpu(),
ref_query, ref_query,
atol=1e-2, atol=1e-2,
rtol=1e-2) rtol=1e-2)
torch.testing.assert_close(out_key.cpu(), ref_key, atol=1e-2, rtol=1e-2)
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
def test_mistral():
llm = LLM(model="mistralai/Mistral-7B-v0.1",
tensor_parallel_size=2,
max_num_seqs=4,
max_model_len=512,
use_v2_block_manager=True,
override_neuron_config={
"sequence_parallel_enabled": False,
"skip_warmup": True
},
device="neuron")
prompts = [
"The president of the United States is",
"The capital of France is",
]
outputs = llm.generate(prompts, SamplingParams(top_k=1))
expected_outputs = [
" the most powerful person in the world. He is the head of state "
"and head",
" a city of many faces. It is a city of history, culture, art"
]
for expected_output, output in zip(expected_outputs, outputs):
generated_text = output.outputs[0].text
assert (expected_output == generated_text)
# SPDX-License-Identifier: Apache-2.0
import os
import shutil
import pytest
from huggingface_hub import snapshot_download
from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
LORA_NAME = "typeof/zephyr-7b-beta-lora"
PA_NAME = "swapnilbp/llama_tweet_ptune"
@pytest.fixture(scope='module')
def adapter_cache(request, tmpdir_factory):
# Create dir that mimics the structure of the adapter cache
adapter_cache = tmpdir_factory.mktemp(
request.module.__name__) / "adapter_cache"
return adapter_cache
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module")
def pa_files():
return snapshot_download(repo_id=PA_NAME)
@pytest.mark.asyncio
async def test_filesystem_resolver(adapter_cache, zephyr_lora_files):
model_files = adapter_cache / LORA_NAME
shutil.copytree(zephyr_lora_files, model_files)
fs_resolver = FilesystemResolver(adapter_cache)
assert fs_resolver is not None
lora_request = await fs_resolver.resolve_lora(MODEL_NAME, LORA_NAME)
assert lora_request is not None
assert lora_request.lora_name == LORA_NAME
assert lora_request.lora_path == os.path.join(adapter_cache, LORA_NAME)
@pytest.mark.asyncio
async def test_missing_adapter(adapter_cache):
fs_resolver = FilesystemResolver(adapter_cache)
assert fs_resolver is not None
missing_lora_request = await fs_resolver.resolve_lora(MODEL_NAME, "foobar")
assert missing_lora_request is None
@pytest.mark.asyncio
async def test_nonlora_adapter(adapter_cache, pa_files):
model_files = adapter_cache / PA_NAME
shutil.copytree(pa_files, model_files)
fs_resolver = FilesystemResolver(adapter_cache)
assert fs_resolver is not None
pa_request = await fs_resolver.resolve_lora(MODEL_NAME, PA_NAME)
assert pa_request is None
# SPDX-License-Identifier: Apache-2.0
"""Test model set-up and inference for quantized HF models supported
on the AutoRound.
Validating the configuration and printing results for manual checking.
Run `pytest tests/quantization/test_auto_round.py`.
"""
import pytest
from vllm.platforms import current_platform
MODELS = [
"OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc", ##auto_round:auto_gptq
"Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound" ##auto_round:auto_awq
]
@pytest.mark.skipif(not current_platform.is_cpu()
and not current_platform.is_xpu()
and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.")
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
with vllm_runner(model) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=8)
assert output
print(f"{output[0][1]}")
...@@ -8,9 +8,11 @@ import gc ...@@ -8,9 +8,11 @@ import gc
import pytest import pytest
import torch import torch
from transformers import BitsAndBytesConfig
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..models.utils import check_embeddings_close
from ..utils import compare_two_settings, create_new_process_for_each_test from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test = [ models_4bit_to_test = [
...@@ -19,6 +21,10 @@ models_4bit_to_test = [ ...@@ -19,6 +21,10 @@ models_4bit_to_test = [
"quantize inflight model with both HF and Mistral format weights") "quantize inflight model with both HF and Mistral format weights")
] ]
models_4bit_to_embedding_test = [
("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
]
models_pre_qaunt_4bit_to_test = [ models_pre_qaunt_4bit_to_test = [
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed', ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
'read pre-quantized 4-bit FP4 model'), 'read pre-quantized 4-bit FP4 model'),
...@@ -31,6 +37,12 @@ models_pre_quant_8bit_to_test = [ ...@@ -31,6 +37,12 @@ models_pre_quant_8bit_to_test = [
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"), ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
] ]
models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8',
'read pre-quantized llama 8-bit model'),
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
...@@ -39,7 +51,8 @@ models_pre_quant_8bit_to_test = [ ...@@ -39,7 +51,8 @@ models_pre_quant_8bit_to_test = [
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None: model_name, description) -> None:
hf_model_kwargs = {"load_in_4bit": True} hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1], validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
model_name, False, hf_model_kwargs) model_name, False, hf_model_kwargs)
...@@ -77,7 +90,8 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts, ...@@ -77,7 +90,8 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None: model_name, description) -> None:
hf_model_kwargs = {"load_in_4bit": True} hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
validate_generated_texts(hf_runner, validate_generated_texts(hf_runner,
vllm_runner, vllm_runner,
example_prompts[:1], example_prompts[:1],
...@@ -113,6 +127,54 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None: ...@@ -113,6 +127,54 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings(model_name, common_args, pp_args) compare_two_settings(model_name, common_args, pp_args)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_4bit_bnb_embedding_model(
model_name,
description,
hf_runner,
vllm_runner,
example_prompts,
dtype: str,
) -> None:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
# This makes the input_ids different between hf_model and vllm_model.
# So we need to strip the input texts to avoid test failing.
example_prompts = [str(s).strip() for s in example_prompts]
# Inflight 4bit quantization
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
with hf_runner(
model_name,
dtype=dtype,
model_kwargs=hf_model_kwargs,
is_sentence_transformer=True,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model_name,
task="embed",
dtype=dtype,
quantization="bitsandbytes") as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=5e-2,
)
def log_generated_texts(prompts, outputs, runner_name): def log_generated_texts(prompts, outputs, runner_name):
logged_texts = [] logged_texts = []
for i, (_, generated_text) in enumerate(outputs): for i, (_, generated_text) in enumerate(outputs):
......
...@@ -13,9 +13,9 @@ from compressed_tensors.quantization import QuantizationType ...@@ -13,9 +13,9 @@ from compressed_tensors.quantization import QuantizationType
from tests.models.utils import check_logprobs_close from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensors24, CompressedTensorsLinearMethod, CompressedTensors24, CompressedTensorsLinearMethod,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsWNA16) CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
sparse_cutlass_supported) sparse_cutlass_supported)
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -648,3 +648,23 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): ...@@ -648,3 +648,23 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
output = llm.generate_greedy("Hello my name is", max_tokens=20) output = llm.generate_greedy("Hello my name is", max_tokens=20)
print(output) print(output)
assert output assert output
def test_compressed_tensors_nvfp4a16(vllm_runner):
# run weight only example
model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
with vllm_runner(model, enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
assert qkv_proj.scheme.group_size == 16
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20)
print(output)
assert output
...@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_quark.py`. ...@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_quark.py`.
""" """
import pytest import pytest
import torch
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
QuarkLinearMethod, QuarkW8A8Fp8, QuarkW8A8Int8) QuarkLinearMethod, QuarkW8A8Fp8, QuarkW8A8Int8)
...@@ -63,3 +64,28 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp): ...@@ -63,3 +64,28 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
output = llm.generate_greedy("Hello my name is", max_tokens=20) output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output assert output
def test_quark_fp8_parity(vllm_runner):
quark_model_id = "amd-quark/llama-tiny-fp8-quark-quant-method"
fp8_model_id = "amd-quark/llama-tiny-fp8-quant-method"
llm_kwargs = {
"tensor_parallel_size": 1,
"enforce_eager": True,
"gpu_memory_utilization": 0.1
}
with (vllm_runner(quark_model_id, **llm_kwargs) as
quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
quark_model = (quark_handle.model.llm_engine.model_executor.
driver_worker.model_runner.model)
quark_state_dict = quark_model.state_dict()
fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker.
model_runner.model)
fp8_state_dict = fp8_model.state_dict()
assert fp8_state_dict.keys() == quark_state_dict.keys()
for key in fp8_state_dict:
assert torch.equal(fp8_state_dict[key], quark_state_dict[key])
...@@ -14,7 +14,7 @@ import torch.nn.functional as F ...@@ -14,7 +14,7 @@ import torch.nn.functional as F
from vllm.model_executor.layers.linear import LinearBase # noqa: E501 from vllm.model_executor.layers.linear import LinearBase # noqa: E501
from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization import ( from vllm.model_executor.layers.quantization import (
get_quantization_config, register_quantization_config) QuantizationMethods, get_quantization_config, register_quantization_config)
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig) QuantizationConfig)
...@@ -54,7 +54,7 @@ class CustomQuantConfig(QuantizationConfig): ...@@ -54,7 +54,7 @@ class CustomQuantConfig(QuantizationConfig):
"""Initialize the quantization config.""" """Initialize the quantization config."""
self.num_bits = num_bits self.num_bits = num_bits
def get_name(self) -> str: def get_name(self) -> QuantizationMethods:
"""Name of the quantization method.""" """Name of the quantization method."""
return "custom_quant" return "custom_quant"
......
...@@ -3,6 +3,7 @@ import importlib.metadata ...@@ -3,6 +3,7 @@ import importlib.metadata
import importlib.util import importlib.util
import pytest import pytest
import torch
DTYPE = ["bfloat16"] DTYPE = ["bfloat16"]
...@@ -21,5 +22,42 @@ def test_pre_quantized_model(vllm_runner): ...@@ -21,5 +22,42 @@ def test_pre_quantized_model(vllm_runner):
print(output) print(output)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.parametrize(
"pt_load_map_location",
[
"cuda:0",
# {"": "cuda"},
])
def test_opt_125m_int4wo_model_loading_with_params(vllm_runner,
pt_load_map_location):
torch._dynamo.reset()
model_name = "jerryzh168/opt-125m-int4wo"
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location=pt_load_map_location) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
assert output
print(output)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
torch._dynamo.reset()
model_name = "jerryzh168/opt-125m-int4wo-per-module"
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0") as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
assert output
print(output)
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__]) pytest.main([__file__])
# SPDX-License-Identifier: Apache-2.0
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "qwen3"
start_token = "<think>"
end_token = "</think>"
REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def qwen3_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# 带 <think></think>,非stream
WITH_THINK = {
"output": "<think>This is a reasoning section</think>This is the rest",
"reasoning_content": "This is a reasoning section",
"content": "This is the rest",
}
# 带 <think></think>,stream
WITH_THINK_STREAM = {
"output": "<think>This is a reasoning section</think>This is the rest",
"reasoning_content": "This is a reasoning section",
"content": "This is the rest",
}
# 不带 <think></think>,非stream
WITHOUT_THINK = {
"output": "This is the rest",
"reasoning_content": None,
"content": "This is the rest",
}
# 不带 <think></think>,stream
WITHOUT_THINK_STREAM = {
"output": "This is the rest",
"reasoning_content": None,
"content": "This is the rest",
}
COMPLETE_REASONING = {
"output": "<think>This is a reasoning section</think>",
"reasoning_content": "This is a reasoning section",
"content": None,
}
MULTILINE_REASONING = {
"output":
"<think>This is a reasoning\nsection</think>This is the rest\nThat",
"reasoning_content": "This is a reasoning\nsection",
"content": "This is the rest\nThat",
}
ONLY_OPEN_TAG = {
"output": "<think>This is a reasoning section",
"reasoning_content": None,
"content": "<think>This is a reasoning section",
}
ONLY_OPEN_TAG_STREAM = {
"output": "<think>This is a reasoning section",
"reasoning_content": "This is a reasoning section",
"content": None,
}
TEST_CASES = [
pytest.param(
False,
WITH_THINK,
id="with_think",
),
pytest.param(
True,
WITH_THINK_STREAM,
id="with_think_stream",
),
pytest.param(
False,
WITHOUT_THINK,
id="without_think",
),
pytest.param(
True,
WITHOUT_THINK_STREAM,
id="without_think_stream",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_stream",
),
pytest.param(
False,
MULTILINE_REASONING,
id="multiline_reasoning",
),
pytest.param(
True,
MULTILINE_REASONING,
id="multiline_reasoning_stream",
),
pytest.param(
False,
ONLY_OPEN_TAG,
id="only_open_tag",
),
pytest.param(
True,
ONLY_OPEN_TAG_STREAM,
id="only_open_tag_stream",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
streaming: bool,
param_dict: dict,
qwen3_tokenizer,
):
output = qwen3_tokenizer.tokenize(param_dict["output"])
output_tokens: list[str] = [
qwen3_tokenizer.convert_tokens_to_string([token]) for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
parser_name)(qwen3_tokenizer)
reasoning, content = run_reasoning_extraction(parser,
output_tokens,
streaming=streaming)
assert reasoning == param_dict["reasoning_content"]
assert content == param_dict["content"]
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import LoadConfig, LoadFormat from vllm.config import LoadConfig, LoadFormat
from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader, from vllm.model_executor.model_loader import get_model_loader
get_model_loader)
test_model = "openai-community/gpt2" test_model = "openai-community/gpt2"
...@@ -24,7 +23,7 @@ def get_runai_model_loader(): ...@@ -24,7 +23,7 @@ def get_runai_model_loader():
def test_get_model_loader_with_runai_flag(): def test_get_model_loader_with_runai_flag():
model_loader = get_runai_model_loader() model_loader = get_runai_model_loader()
assert isinstance(model_loader, RunaiModelStreamerLoader) assert model_loader.__class__.__name__ == "RunaiModelStreamerLoader"
def test_runai_model_loader_download_files(vllm_runner): def test_runai_model_loader_download_files(vllm_runner):
......
...@@ -169,7 +169,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, ...@@ -169,7 +169,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100]) @pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
# Not testing FlashInfer now, since 0.2.3 API removed the ability
# to pass in uniform samples.
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
frac_seeded: float, n_rep: int, device: str, frac_seeded: float, n_rep: int, device: str,
...@@ -214,7 +217,10 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, ...@@ -214,7 +217,10 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", [3, 8, 32, 128]) @pytest.mark.parametrize("batch_size", [3, 8, 32, 128])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
# Not testing FlashInfer now, since 0.2.3 API removed the ability
# to pass in uniform samples.
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int, def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int,
device: str, use_flashinfer: bool): device: str, use_flashinfer: bool):
...@@ -284,6 +290,10 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int, ...@@ -284,6 +290,10 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
Test the flashinfer and nonflashinfer backend generate Test the flashinfer and nonflashinfer backend generate
the same output metrics. the same output metrics.
""" """
pytest.skip("Not testing FlashInfer now, since 0.2.3 API removed "
"the ability to pass in uniform samples.")
torch.set_default_device(device) torch.set_default_device(device)
torch.manual_seed(0) torch.manual_seed(0)
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
......
...@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str): ...@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=random.random() + 0.1, temperature=random.random() + 0.1,
top_p=min(random.random() + 0.1, 1), top_p=min(random.random() + 0.1, 1),
top_k=random.randint(0, 10) or -1, top_k=random.randint(0, 10),
n=n, n=n,
presence_penalty=random.randint(0, 1), presence_penalty=random.randint(0, 1),
) )
...@@ -647,6 +647,8 @@ def test_flashinfer_fallback(seed: int, device: str): ...@@ -647,6 +647,8 @@ def test_flashinfer_fallback(seed: int, device: str):
if not envs.VLLM_USE_FLASHINFER_SAMPLER: if not envs.VLLM_USE_FLASHINFER_SAMPLER:
pytest.skip("Flashinfer sampler is disabled") pytest.skip("Flashinfer sampler is disabled")
pytest.skip("After FlashInfer 0.2.3, sampling will never fail")
set_random_seed(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
batch_size = random.randint(1, 256) batch_size = random.randint(1, 256)
......
...@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ...@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,
......
...@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, ...@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,
...@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ...@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,
......
...@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( ...@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,
...@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( ...@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
# As of this writing, vLLM only compiles with these 3 block sizes by # https://github.com/triton-lang/triton/issues/2266 tl.dot
# default. # doesn't support embedding < 16
{
"block_size": 8,
},
{ {
"block_size": 16, "block_size": 16,
}, },
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment