Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
......@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode(
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids
......@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode(
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids
......@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
llm_output = vllm_model.model.encode(
llm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
assert llm_output == f"""truncate_prompt_tokens value
......
......@@ -37,6 +37,8 @@ if current_platform.is_rocm():
REQUIRES_V0_MODELS = [
# V1 Test: not enough KV cache space in C1.
"fuyu",
# V1 Test: Deadlock issue when processing mm_inputs
"llava-onevision-transformers",
]
# yapf: disable
......@@ -155,6 +157,7 @@ VLM_TEST_SETTINGS = {
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
num_logprobs= 6 if current_platform.is_cpu() else 5,
auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
......@@ -172,6 +175,71 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Transformers fallback to test
## To reduce test burden, we only test batching arbitrary image size
# Dynamic image length and number of patches
"llava-onevision-transformers": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
},
marks=[pytest.mark.core_model],
),
# FIXME(Isotr0py): Enable this test after
# https://github.com/huggingface/transformers/pull/39470 released
# "idefics3-transformers": VLMTestInfo(
# models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
# img_idx_to_prompt=lambda idx: "<image>",
# max_model_len=8192,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
# image_size_factors=[(0.25, 0.5, 1.0)],
# vllm_runner_kwargs={
# "model_impl": "transformers",
# },
# marks=[pytest.mark.core_model],
# ),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
},
marks=[large_gpu_mark(min_gb=32)],
),
# Check "auto" with fallback to transformers
"internvl-transformers": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
max_model_len=4096,
use_tokenizer_eos=True,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "auto",
},
auto_cls=AutoModelForImageTextToText,
marks=[pytest.mark.core_model],
),
#### Extended model tests
"aria": VLMTestInfo(
models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
......@@ -320,6 +388,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=32)],
),
"glm4_1v-video": VLMTestInfo(
models=["THUDM/GLM-4.1V-9B-Thinking"],
......@@ -333,8 +402,7 @@ VLM_TEST_SETTINGS = {
inputs=custom_inputs.video_with_metadata_glm4_1v(),
limit_mm_per_prompt={"video": 1},
)],
# This is needed to run on machine with 24GB VRAM
vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo(
models = [
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Create a reduced-layer version of the Maverick model for testing purposes.
This script creates a new model with fewer layers by:
1. Loading the original Maverick model configuration
2. Creating a reduced configuration
3. Generating compatible safetensors files with appropriate weights
4. Creating the necessary index files for vLLM compatibility
"""
import json
import shutil
from pathlib import Path
from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig)
from vllm import LLM, SamplingParams
from ....utils import multi_gpu_test
# Sample prompts for testing
PROMPTS: list[str] = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
def run_maverick_serving(model: str):
"""Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent
options with reduced layers.
"""
try:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model=model,
max_model_len=2048,
enforce_eager=True,
tensor_parallel_size=8,
enable_expert_parallel=True,
trust_remote_code=True,
gpu_memory_utilization=0.4,
kv_cache_dtype="fp8",
)
outputs = llm.generate(PROMPTS, sampling_params)
# Print the outputs
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
except Exception as e:
print(f"Error initializing or running model: {e}")
raise
def create_reduced_maverick_model(
original_model_name:
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
output_dir: str = "/tmp/reduced_maverick",
text_layers: int = 4,
num_experts: int = 4,
vision_layers: int = 2,
force_recreate: bool = False,
) -> str:
"""
Create a reduced-layer version of the Maverick model.
Args:
original_model_name: Name of the original Maverick model
output_dir: Directory to save the reduced model
text_layers: Number of text transformer layers
num_experts: Number of experts per layer
vision_layers: Number of vision transformer layers
force_recreate: Whether to recreate if output_dir already exists
Returns:
Path to the created reduced model directory
"""
print(
f"Creating reduced Maverick model with {text_layers} text layers and "
f"{vision_layers} vision layers...")
# Create output directory
output_path = Path(output_dir)
if output_path.exists():
if force_recreate:
shutil.rmtree(output_path)
else:
print(f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite.")
return str(output_path)
output_path.mkdir(parents=True, exist_ok=True)
try:
print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(original_model_name,
trust_remote_code=True)
print("Creating reduced configuration...")
reduced_config = create_reduced_config(original_config, text_layers,
num_experts, vision_layers)
config_path = output_path / "config.json"
with open(config_path, "w") as f:
json.dump(reduced_config, f, indent=2)
print(f"Saved reduced config to {config_path}")
print("Copying tokenizer files...")
copy_tokenizer_files(original_model_name, output_path)
print("Creating reduced safetensors files...")
create_reduced_safetensors(original_config, reduced_config,
output_path)
print("Creating preprocessor config...")
create_preprocessor_config(original_config, output_path)
try:
gen_config = GenerationConfig.from_pretrained(original_model_name)
gen_config.save_pretrained(output_path)
print("Copied generation config")
except Exception as e:
print(f"Could not copy generation config: {e}")
print(f"Successfully created reduced Maverick model at {output_path}")
return str(output_path)
except Exception as e:
print(f"Error creating reduced model: {e}")
# Clean up on failure
if output_path.exists():
shutil.rmtree(output_path)
raise
def create_reduced_config(original_config: Any, text_layers: int,
num_experts: int,
vision_layers: int) -> dict[str, Any]:
"""Create a reduced configuration based on the original."""
# Convert config to dictionary
config_dict = original_config.to_dict()
# Reduce text layers
if "text_config" in config_dict:
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
config_dict["text_config"]["num_hidden_layers"] = text_layers
print(
f"Reduced text layers from {original_text_layers} to {text_layers}"
)
original_num_experts = config_dict["text_config"]["num_local_experts"]
config_dict["text_config"]["num_local_experts"] = num_experts
print(
f"Reduced num experts from {original_num_experts} to {num_experts}"
)
hidden_dim_divisor = 4
original_hidden_size = config_dict["text_config"]["hidden_size"]
new_hidden_size = original_hidden_size // hidden_dim_divisor
config_dict["text_config"]["hidden_size"] = new_hidden_size
print(f"Reduced hidden size from {original_hidden_size} to "
f"{new_hidden_size}")
original_head_dim = config_dict["text_config"]["head_dim"]
new_head_dim = original_head_dim // hidden_dim_divisor
config_dict["text_config"]["head_dim"] = new_head_dim
print(f"Reduced head dim from {original_head_dim} to {new_head_dim}")
# Reduce vision layers
if "vision_config" in config_dict:
original_vision_layers = config_dict["vision_config"][
"num_hidden_layers"]
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
print(f"Reduced vision layers from {original_vision_layers} "
f"to {vision_layers}")
# Update model name to indicate it's a reduced version
config_dict["_name_or_path"] = (
f"reduced_maverick_{text_layers}t_{vision_layers}v")
return config_dict
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
"""Copy tokenizer files from the original model."""
try:
tokenizer = AutoTokenizer.from_pretrained(original_model_name,
trust_remote_code=True)
tokenizer.save_pretrained(output_path)
print("Tokenizer files copied successfully")
except Exception as e:
print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any,
output_path: Path) -> None:
"""Create preprocessor_config.json for multimodal model."""
# Try to load the original preprocessor config
try:
processor = AutoProcessor.from_pretrained(
original_config._name_or_path
or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
trust_remote_code=True,
)
processor.save_pretrained(output_path)
print("Copied original preprocessor config")
return
except Exception as e:
print(f"Could not copy original preprocessor config: {e}")
raise
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
Any],
output_path: Path) -> None:
"""Create safetensors files with weights for the reduced model."""
print("Generating synthetic weights for reduced model...")
text_config = reduced_config["text_config"]
vision_config = reduced_config["vision_config"]
weights = {}
print("Creating text model weights...")
weights.update(create_text_model_weights(text_config))
print("Creating vision model weights...")
weights.update(create_vision_model_weights(vision_config))
print("Creating shared model weights...")
weights.update(create_shared_weights(text_config, vision_config))
print("Saving weights to safetensors files...")
save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(
text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the text model with MoE structure."""
weights = {}
vocab_size = text_config["vocab_size"]
hidden_size = text_config["hidden_size"]
intermediate_size = text_config["intermediate_size"]
intermediate_size_mlp = text_config["intermediate_size_mlp"]
num_layers = text_config["num_hidden_layers"]
num_attention_heads = text_config["num_attention_heads"]
num_key_value_heads = text_config.get("num_key_value_heads",
num_attention_heads)
# MoE specific parameters
num_experts = text_config.get("num_local_experts")
assert (num_experts
is not None), "num_local_experts must be specified for MoE"
head_dim = hidden_size // num_attention_heads
# Embedding layers
weights["language_model.model.embed_tokens.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.float16)
# Transformer layers
for layer_idx in range(num_layers):
layer_prefix = f"language_model.model.layers.{layer_idx}"
print(f"Creating weights for layer {layer_prefix}...")
# Self-attention weights (separate q, k, v projections)
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
print("Self-attention weights created.")
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
# 0,2,4,... are dense
interleave_step = text_config.get("interleave_moe_layer_step", 1)
is_moe_layer = (interleave_step > 0
and (layer_idx + 1) % interleave_step == 0)
if is_moe_layer:
# MoE layer structure
# 1. Router weights
weights[
f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16)
# 2. Individual expert weights (not fused)
for expert_idx in range(num_experts):
expert_prefix = (
f"{layer_prefix}.feed_forward.experts.{expert_idx}")
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
# Expert weight scales (FP8 quantization)
weights[
f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[
f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16)
# 3. Shared expert weights
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
print(f"MoE feed-forward weights created for layer {layer_idx}.")
else:
# Dense layer structure
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
torch.randn(hidden_size,
intermediate_size_mlp,
dtype=torch.bfloat16))
print(f"Dense feed-forward weights created for layer {layer_idx}.")
# Layer norms
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
print("Layer norms created.")
# Final layer norm and output projection
weights["language_model.model.norm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights["language_model.lm_head.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.bfloat16)
return weights
def create_vision_model_weights(
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the vision model."""
weights = {}
hidden_size = vision_config["hidden_size"]
intermediate_size = vision_config["intermediate_size"]
num_layers = vision_config["num_hidden_layers"]
# Vision transformer layers
for layer_idx in range(num_layers):
layer_prefix = f"vision_model.model.layers.{layer_idx}"
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
intermediate_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
return weights
def create_shared_weights(
text_config: dict[str, Any],
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create weights for shared components (vision-language connector)"""
weights = {}
text_hidden_size = text_config["hidden_size"]
projector_input_dim = vision_config["projector_input_dim"]
# Vision-language connector (projects vision features to text space)
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
return weights
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
output_path: Path) -> None:
"""Save weights to safetensors files and create index."""
# Determine how to shard the weights
max_shard_size = 5 * 1024 * 1024 * 1024 # 5GB per shard
# Calculate sizes and create shards
shards = []
current_shard: dict[str, torch.Tensor] = {}
current_size = 0
for name, tensor in weights.items():
tensor_size = tensor.numel() * tensor.element_size()
if current_size + tensor_size > max_shard_size and current_shard:
shards.append(current_shard)
current_shard = {}
current_size = 0
current_shard[name] = tensor
current_size += tensor_size
if current_shard:
shards.append(current_shard)
# Save shards and create index
weight_map = {}
if len(shards) == 1:
# Single file
filename = "model.safetensors"
save_file(shards[0], output_path / filename)
weight_map = {name: filename for name in shards[0]}
print(f"Saved weights to single file: {filename}")
else:
# Multiple shards
for i, shard in enumerate(shards):
filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
save_file(shard, output_path / filename)
for name in shard:
weight_map[name] = filename
print(f"Saved shard {i+1}/{len(shards)}: {filename}")
# Create index file
index_data = {
"metadata": {
"total_size":
sum(tensor.numel() * tensor.element_size()
for tensor in weights.values())
},
"weight_map": weight_map,
}
index_path = output_path / "model.safetensors.index.json"
with open(index_path, "w") as f:
json.dump(index_data, f, indent=2)
print(f"Created index file: {index_path}")
print(f"Total model size: "
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
def run_reduced_model(model_path: str,
should_profile: bool = False,
**kwargs) -> None:
"""Test the created reduced model with vLLM."""
print(f"\nTesting reduced model at {model_path}...")
llm = LLM(
model=model_path,
trust_remote_code=True,
max_model_len=512, # Small context for testing
gpu_memory_utilization=0.3, # Conservative memory usage
**kwargs,
)
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=50)
if should_profile:
llm.start_profile()
outputs = llm.generate(PROMPTS, sampling_params)
if should_profile:
llm.stop_profile()
print("Test generation successful!")
for output in outputs:
print(f"Prompt: {output.prompt}")
print(f"Output: "
f"{output.outputs[0].text}")
print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"original_model_name,text_layers,num_experts,vision_layers,",
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_dummy_maverick(
original_model_name: str,
text_layers: int,
num_experts: int,
vision_layers: int,
enforce_eager: bool,
tp: int,
ep: bool,
output_dir: str = "/tmp/reduced_maverick",
force_recreate: bool = True,
profile: bool = False,
) -> None:
model_path = create_reduced_maverick_model(
original_model_name=original_model_name,
output_dir=output_dir,
text_layers=text_layers,
num_experts=num_experts,
vision_layers=vision_layers,
force_recreate=force_recreate,
)
print(f"\nReduced model created successfully at: {model_path}")
run_reduced_model(model_path=model_path,
should_profile=profile,
enforce_eager=enforce_eager,
tensor_parallel_size=tp,
enable_expert_parallel=ep)
def main():
"""Main function to create and test the reduced model."""
import argparse
parser = argparse.ArgumentParser(
description="Create a reduced-layer Maverick model")
parser.add_argument(
"--output-dir",
default="/tmp/reduced_maverick",
help="Output directory for the reduced model",
)
parser.add_argument(
"--text-layers",
type=int,
default=4,
help="Number of text transformer layers",
)
parser.add_argument("--num-experts",
type=int,
default=4,
help="Number of experts")
parser.add_argument(
"--vision-layers",
type=int,
default=2,
help="Number of vision transformer layers",
)
parser.add_argument(
"--force-recreate",
action="store_true",
help="Force recreation if output directory exists",
)
parser.add_argument("--test",
action="store_true",
help="Test the created model with vLLM")
parser.add_argument("--profile",
action="store_true",
help="Profile the created model with vLLM")
parser.add_argument(
"--test-original",
action="store_true",
help="Test the original model with vLLM",
)
parser.add_argument(
"--original-model",
default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
help="Original model name to base the reduction on",
)
args = parser.parse_args()
if args.test:
test_dummy_maverick(original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile)
if args.test_original:
run_maverick_serving(args.original_model)
if __name__ == "__main__":
exit(main())
......@@ -182,8 +182,7 @@ def test_chat(
) as vllm_model:
outputs = []
for msg in MSGS:
output = vllm_model.model.chat(msg,
sampling_params=SAMPLING_PARAMS)
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
......@@ -219,7 +218,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
max_model_len=8192,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = vllm_model.model.generate(prompt)
outputs = vllm_model.llm.generate(prompt)
assert len(outputs) == 1, f"{len(outputs)=}"
output: RequestOutput = outputs[0]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
TextChunk, UserMessage)
from vllm.transformers_utils.tokenizer import MistralTokenizer
from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
]
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
args = [
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}),
] + MISTRAL_FORMAT_ARGS
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
def _get_prompt(audio_assets, question):
tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(len(audio_assets))
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
text_chunk = TextChunk(text=question)
messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
return tokenizer.apply_chat_template(messages=messages)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tokenizer_mode="mistral",
)
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
def asset_to_chunk(asset):
audio = Audio.from_file(str(asset.get_local_path()), strict=False)
audio.format = "wav"
audio_dict = AudioChunk.from_audio(audio).to_openai()
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
messages = [{
"role":
"user",
"content": [
*audio_chunks,
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
......@@ -107,7 +107,7 @@ def run_test(
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
sampling_params = SamplingParams(
temperature=0,
......
......@@ -85,7 +85,7 @@ def run_test(
enforce_eager=enforce_eager,
task=task,
**vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None:
......
......@@ -97,7 +97,7 @@ def _run_test(
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
texts = [
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Union
import pytest
from transformers import AutoModel
from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner
model_name = "jinaai/jina-reranker-m0"
mm_processor_kwargs = {
"min_pixels": 3136,
"max_pixels": 602112,
}
limit_mm_per_prompt = {"image": 2}
def vllm_reranker(
vllm_runner: type[VllmRunner],
model_name: str,
dtype: str,
query_strs: list[str],
document_strs: list[str],
query_type: str = "text",
doc_type: str = "text",
):
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
return {"type": "image_url", "image_url": {"url": f"{url}"}}
query: Union[list[str], ScoreMultiModalParam]
if query_type == "text":
query = query_strs
elif query_type == "image":
query = ScoreMultiModalParam(
content=[create_image_param(url) for url in query_strs])
documents: Union[list[str], ScoreMultiModalParam]
if doc_type == "text":
documents = document_strs
elif doc_type == "image":
documents = ScoreMultiModalParam(
content=[create_image_param(url) for url in document_strs])
with vllm_runner(
model_name,
task="score",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model:
outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs]
def hf_reranker(
hf_runner: type[HfRunner],
model_name: str,
dtype: str,
query_strs: list[str],
document_strs: list[str],
query_type: str = "text",
doc_type: str = "text",
):
checkpoint_to_hf_mapper = {
"visual.": "model.visual.",
"model.": "model.language_model.",
}
data_pairs = [[query_strs[0], d] for d in document_strs]
with hf_runner(
model_name,
dtype=dtype,
trust_remote_code=True,
auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
) as hf_model:
return hf_model.model.compute_score(data_pairs,
max_length=2048,
query_type=query_type,
doc_type=doc_type)
# Visual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"text", "image")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "text", "image")
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# Textual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"text", "text")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "text", "text")
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# Image Querying for Textual Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
query = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
]
documents = [
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"image", "text")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "image", "text")
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# Image Querying for Image Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
query = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
"image", "image")
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
documents, "image", "image")
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.utils import set_default_torch_num_threads
from ....conftest import VllmRunner
def generate_test_mm_data():
mm_data = {
"pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16),
"location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
}
return mm_data
def _run_test(
vllm_runner: type[VllmRunner],
model: str,
) -> None:
prompt = [
{
# This model deals with no text input
"prompt_token_ids": [1],
"multi_modal_data": generate_test_mm_data(),
} for _ in range(10)
]
with (
set_default_torch_num_threads(1),
vllm_runner(
model,
task="embed",
dtype=torch.float16,
enforce_eager=True,
skip_tokenizer_init=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,
) as vllm_model,
):
vllm_model.encode(prompt)
MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
) -> None:
_run_test(
vllm_runner,
model,
)
......@@ -161,6 +161,7 @@ def _test_processing_correctness(
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"mllama": False,
"ovis": False,
"paligemma": False,
"ultravox": False,
"whisper": False,
}
......@@ -291,7 +292,8 @@ def _test_processing_correctness_one(
os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-VL-01"),
os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924"),
os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
os.path.join(models_path_prefix, "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"),
os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B"),
os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Llama3.2-3B"),
os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B"),
......@@ -302,7 +304,7 @@ def _test_processing_correctness_one(
os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"),
os.path.join(models_path_prefix, "mistral-community/pixtral-12b"),
os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"),
os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"),
os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"),
os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"),
os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B"),
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from ....conftest import ImageTestAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets, get_internvl_target_ratios)
width, height = image.size
blocks, _, _ = calculate_internvl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_internvl_target_ratios(
min_num,
max_num,
),
image_size=config.force_image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
image_processor = processor.info.get_image_processor()
config.use_thumbnail = image_processor.use_thumbnail
prompt = "<image>" * len(images)
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id",
["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: ImageTestAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
min_num,
max_num,
hf_processor_mm_kwargs,
)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf: disable
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id):
model_config = ModelConfig(
model=model_id,
model_impl="transformers",
)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
image_pil = ImageAsset('cherry_blossom').pil_image
mm_data = {"image": image_pil}
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_processed_inputs = mm_processor.apply(
prompt=str_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
ids_prompt = [
151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
30, 151645, 151644, 77091, 198
]
ids_processed_inputs = mm_processor.apply(
prompt=ids_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]
......@@ -17,8 +17,8 @@ from ..utils import models_path_prefix
from vllm.platforms import current_platform
from ..models.utils import check_embeddings_close
from ..utils import compare_two_settings, create_new_process_for_each_test
from ...utils import compare_two_settings, multi_gpu_test
from ..utils import check_embeddings_close, check_logprobs_close
models_4bit_to_test = [
(os.path.join(models_path_prefix, "facebook/opt-125m"), "quantize opt model inflight"),
......@@ -30,6 +30,10 @@ models_4bit_to_embedding_test = [
("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
]
models_4bit_to_moe_test = [
("allenai/OLMoE-1B-7B-0125-Instruct", "quantize moe model inflight"),
]
models_pre_qaunt_4bit_to_test = [
(os.path.join(models_path_prefix, 'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'),
'read pre-quantized 4-bit FP4 model'),
......@@ -46,7 +50,6 @@ models_pre_quant_8bit_to_test = [
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or current_platform.is_rocm(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -60,7 +63,6 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test)
@create_new_process_for_each_test()
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -72,7 +74,6 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test)
@create_new_process_for_each_test()
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -80,12 +81,11 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, True)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or current_platform.is_rocm(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -100,12 +100,10 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
vllm_tp_size=2)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
common_args = [
"--disable-log-stats",
......@@ -126,12 +124,40 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings(model_name, common_args, pp_args)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
))
with vllm_runner(model_name,
quantization='bitsandbytes',
enforce_eager=False) as llm:
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
max_tokens=32,
num_logprobs=5)
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
transformers_outputs = llm.generate_greedy_logprobs_limit(
example_prompts, max_tokens=32, num_logprobs=5)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
outputs_1_lst=vllm_outputs,
name_0="transformers",
name_1="vllm",
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_4bit_bnb_embedding_model(
model_name,
description,
......@@ -150,6 +176,13 @@ def test_4bit_bnb_embedding_model(
example_prompts = [str(s).strip() for s in example_prompts]
# Inflight 4bit quantization
with vllm_runner(model_name,
task="embed",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes") as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
load_in_4bit=True))
with hf_runner(
......@@ -160,12 +193,6 @@ def test_4bit_bnb_embedding_model(
) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model_name,
task="embed",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes") as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
......@@ -193,7 +220,8 @@ def validate_generated_texts(hf_runner,
model_name,
pre_quant=False,
hf_model_kwargs=None,
vllm_tp_size=1):
vllm_tp_size=1,
max_tokens=8):
# NOTE: run vLLM first, as it requires a clean process
# when using distributed inference
......@@ -201,7 +229,8 @@ def validate_generated_texts(hf_runner,
quantization=None if pre_quant else 'bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=False) as llm:
vllm_outputs = llm.generate_greedy(prompts, 8)
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
# Clean up the GPU memory for the next test
......@@ -213,19 +242,17 @@ def validate_generated_texts(hf_runner,
# Run with HF runner
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
hf_outputs = llm.generate_greedy(prompts, 8)
hf_outputs = llm.generate_greedy(prompts, max_tokens)
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
# Clean up the GPU memory for the next test
gc.collect()
torch.cuda.empty_cache()
# Compare the generated strings
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
hf_str = hf_log["generated_text"]
vllm_str = vllm_log["generated_text"]
prompt = hf_log["prompt"]
assert hf_str == vllm_str, (f"Model: {model_name}"
f"Mismatch between HF and vLLM outputs:\n"
f"Prompt: {prompt}\n"
......
......@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
llm = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
......@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
del llm
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]
......
......@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason="modelopt_fp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
llm = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
......@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
del llm
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]
......
......@@ -139,16 +139,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"AquilaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/AquilaChat2-7B"),
trust_remote_code=True),
"ArceeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "arcee-ai/AFM-4.5B-Base"),
is_available_online=False),
"ArcticForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-instruct"),
trust_remote_code=True),
"BaiChuanForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B"),
trust_remote_code=True),
"BaichuanForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baichuan-inc/Baichuan2-7B-chat"),
trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ibm-ai-platform/Bamba-9B"),
extras={"tiny": os.path.join(models_path_prefix,"hmellor/tiny-random-BambaForCausalLM")}), # noqa: E501
"BloomForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"bigscience/bloom-560m"),
{"1b": os.path.join(models_path_prefix,"bigscience/bloomz-1b1")}),
"BailingMoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "inclusionAI/Ling-lite-1.5"),
trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"),
extras={"tiny": os.path.join(models_path_prefix, "hmellor/tiny-random-BambaForCausalLM")}), # noqa: E501
"BloomForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "bigscience/bloom-560m"),
{"1b": os.path.join(models_path_prefix, "bigscience/bloomz-1b1")}),
"ChatGLMModel": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/chatglm3-6b"),
trust_remote_code=True,
max_transformers_version="4.48"),
......@@ -166,14 +170,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"DeepseekV3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V3"), # noqa: E501
trust_remote_code=True),
"Ernie4_5_ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-0.3B-PT"),
trust_remote_code=True),
"Ernie4_5_MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-21B-A3B-PT"),
trust_remote_code=True),
"ExaoneForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct")), # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"mgleize/fairseq2-dummy-Llama-3.2-1B")), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-7b")),
"FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
"Ernie4_5_ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baidu/ERNIE-4.5-0.3B-PT"),
min_transformers_version="4.54"),
"Ernie4_5_MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baidu/ERNIE-4.5-21B-A3B-PT"),
min_transformers_version="4.54"),
"ExaoneForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct")), # noqa: E501
"Exaone4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-4.0-32B")), # noqa: E501
"Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mgleize/fairseq2-dummy-Llama-3.2-1B")), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-7b")),
"FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/Falcon-H1-0.5B-Base"),
min_transformers_version="4.53"),
"GemmaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-1.1-2b-it")),
"Gemma2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-2-9b")),
......@@ -198,7 +203,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tencent/Hunyuan-A13B-Instruct"),
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"internlm/internlm-chat-7b"),
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix, "tencent/Hunyuan-7B-Instruct-0124"),
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm-chat-7b"),
trust_remote_code=True),
"InternLM2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"),
trust_remote_code=True),
......@@ -222,6 +229,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"MiniCPM3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
trust_remote_code=True),
"MiniMaxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01-hf"),
min_transformers_version="4.53"),
"MiniMaxText01ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
trust_remote_code=True,
revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501
......@@ -243,14 +252,14 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{"1b": os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b")}),
"OrionForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"),
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"adept/persimmon-8b-chat")),
"PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/phi-2")),
"Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-mini-4k-instruct")),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-small-8k-instruct"),
trust_remote_code=True,
v0_only=True),
"PhiMoEForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3.5-MoE-instruct"),
"PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/persimmon-8b-chat")),
"PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/phi-2")),
"Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-mini-4k-instruct")),
"Phi4FlashForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-4-mini-flash-reasoning"), # noqa: E501
trust_remote_code=True,
v0_only=True,
max_model_len=10240),
"PhiMoEForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
trust_remote_code=True),
"Plamo2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "pfnet/plamo-2-1b"),
trust_remote_code=True),
......@@ -258,16 +267,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"Qwen2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2-0.5B-Instruct"),
extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501
"Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen1.5-MoE-A2.7B-Chat")),
"Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-8B")),
"Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-30B-A3B")),
"Qwen3ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"tomaarsen/Qwen3-Reranker-0.6B-seq-cls")), # noqa: E501
"RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-40b")),
"StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-zephyr-3b")), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-3b-4e1t")),
"Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"bigcode/starcoder2-3b")),
"SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"upstage/solar-pro-preview-instruct")),
"TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Tele-AI/TeleChat2-3B"),
"Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat")),
"Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-8B")),
"Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B")),
"RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-40b")),
"StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-zephyr-3b")), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t")),
"Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),
"SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct")),
"TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Tele-AI/TeleChat2-3B"),
trust_remote_code=True),
"TeleFLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "CofeAI/FLM-2-52B-Instruct-2407"),
trust_remote_code=True),
......@@ -290,28 +298,27 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
_EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-base-en-v1.5"), v0_only=True),
"Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-multilingual-gemma2"), v0_only=True), # noqa: E501
"GPT2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"nie3e/sentiment-polish-gpt2-small")), # noqa: E501
"GritLM": _HfExamplesInfo(os.path.join(models_path_prefix,"parasail-ai/GritLM-7B-vllm")),
"GteModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Snowflake/snowflake-arctic-embed-m-v2.0"),
"BertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"), v0_only=True),
"Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"), v0_only=True), # noqa: E501
"GritLM": _HfExamplesInfo(os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")),
"GteModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-embed-m-v2.0"),
trust_remote_code=True),
"GteNewModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-base-en-v1.5"),
trust_remote_code=True,
hf_overrides={"architectures": ["GteNewModel"]}), # noqa: E501
"InternLM2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"),
trust_remote_code=True),
"JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-reward-dev")), # noqa: E501
"LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"llama"), is_available_online=False),
"MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct")),
"ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Alibaba-NLP/gte-modernbert-base"),
"JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")), # noqa: E501
"LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama"), is_available_online=False),
"MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
"ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"),
trust_remote_code=True, v0_only=True),
"NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"nomic-ai/nomic-embed-text-v2-moe"),
"NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "nomic-ai/nomic-embed-text-v2-moe"),
trust_remote_code=True, v0_only=True), # noqa: E501
"Qwen2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"ssmits/Qwen2-7B-Instruct-embed-base")),
"Qwen2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-RM-72B")),
"Qwen2ForProcessRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-PRM-7B")),
"Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"jason9693/Qwen2.5-1.5B-apeach")), # noqa: E501
"RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/stsb-roberta-base-v2"), v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/all-roberta-large-v1"), v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/multilingual-e5-small"), v0_only=True), # noqa: E501
......@@ -324,12 +331,27 @@ _EMBEDDING_EXAMPLE_MODELS = {
is_available_online=False), # noqa: E501
}
_CROSS_ENCODER_EXAMPLE_MODELS = {
# [Text-only]
_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
# [Decoder-only]
"GPT2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "nie3e/sentiment-polish-gpt2-small")), # noqa: E501
# [Cross-encoder]
"BertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), v0_only=True), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base"), v0_only=True), # noqa: E501
"RobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/quora-roberta-base"), v0_only=True), # noqa: E501
"XLMRobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), v0_only=True), # noqa: E501
"ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base"), v0_only=True), # noqa: E501
}
_AUTOMATIC_CONVERTED_MODELS = {
# Use as_seq_cls_model for automatic conversion
"GemmaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-gemma)", # noqa: E501
v0_only=True,
hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
"classifier_from_token": ["Yes"], # noqa: E501
"method": "no_post_processing"}), # noqa: E501
"LlamaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Skywork/Skywork-Reward-V2-Llama-3.2-1B")), # noqa: E501
"Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")), # noqa: E501
"Qwen3ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "tomaarsen/Qwen3-Reranker-0.6B-seq-cls")), # noqa: E501
}
_MULTIMODAL_EXAMPLE_MODELS = {
......@@ -350,12 +372,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GLM4VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/glm-4v-9b"),
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.1V-9B-Thinking"), min_transformers_version="4.53"), # noqa: E501
"Glm4MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.5"),
"Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/GLM-4.1V-9B-Thinking"), min_transformers_version="4.53"), # noqa: E501
"Glm4MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/GLM-4.5"),
min_transformers_version="4.54",
is_available_online=False), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
extras={"2b": os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-2b")}, # noqa: E501
"H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")}, # noqa: E501
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible."), # noqa: E501
"InternVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
......@@ -364,12 +386,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501
{"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")}), # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
"KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"), # noqa: E501
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"), # noqa: E501
extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")}, # noqa: E501
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
extras={"mistral": os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), # noqa: E501
......@@ -398,9 +420,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"NVLM_D": _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
trust_remote_code=True),
"Llama_Nemotron_Nano_VL" : _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"), # noqa: E501
trust_remote_code=True),
"PaliGemmaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/paligemma-3b-mix-224"), # noqa: E501
extras={"v2": os.path.join(models_path_prefix, "google/paligemma2-3b-ft-docci-448")}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
"Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True,
max_transformers_version="4.48",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
......@@ -418,7 +443,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501
max_model_len=4096),
"Qwen2_5OmniModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B")),
"Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-7B-AWQ")), # noqa: E501
"SkyworkR1VChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B")),
......@@ -429,6 +455,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
"VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507",
min_transformers_version="4.54",
# disable this temporarily until we support HF format
is_available_online=False,
),
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
......@@ -436,17 +468,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer=os.path.join(models_path_prefix,"Isotr0py/Florence-2-tokenizer"), # noqa: E501
trust_remote_code=True), # noqa: E501
"MllamaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")), # noqa: E501
"Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct")), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "openai/whisper-large-v3")), # noqa: E501
# [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo(os.path.join(models_path_prefix, "jinaai/jina-reranker-m0")), # noqa: E501
}
_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EAGLEModel": _HfExamplesInfo(os.path.join(models_path_prefix, "JackFram/llama-68m"),
speculative_model=os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random")), # noqa: E501
"MedusaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "JackFram/llama-68m"),
speculative_model=os.path.join(models_path_prefix, "abhigoyal/vllm-medusa-llama-68m-random")), # noqa: E501
"MLPSpeculatorPreTrainedModel": _HfExamplesInfo(os.path.join(models_path_prefix, "JackFram/llama-160m"),
speculative_model=os.path.join(models_path_prefix, "ibm-ai-platform/llama-160m-accelerator")), # noqa: E501
# Temporarily disabled.
# TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
# speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501
"DeepSeekMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random"),
speculative_model=os.path.join(models_path_prefix, "luccafong/deepseek_mtp_draft_random"), # noqa: E501
trust_remote_code=True),
......@@ -454,32 +488,39 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code=True,
speculative_model=os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
tokenizer=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct")), # noqa: E501
"Eagle3LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"), # noqa: E501
"Eagle3LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")), # noqa: E501
trust_remote_code=True,
speculative_model=os.path.join(models_path_prefix,"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"),
tokenizer=os.path.join(models_path_prefix,"meta-llama/Llama-3.1-8B-Instruct")),
"EagleMiniCPMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"openbmb/MiniCPM-1B-sft-bf16"),
speculative_model=os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"),
tokenizer=os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")),
"EagleLlama4ForCausalLM": _HfExamplesInfo(
os.path.join(models_path_prefix, "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"),
trust_remote_code=True,
speculative_model=os.path.join(models_path_prefix, "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"),
tokenizer=os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct")), # noqa: E501
"EagleMiniCPMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "openbmb/MiniCPM-1B-sft-bf16"),
trust_remote_code=True,
is_available_online=False,
speculative_model=os.path.join(models_path_prefix,"openbmb/MiniCPM-2B-sft-bf16"),
tokenizer=os.path.join(models_path_prefix,"openbmb/MiniCPM-2B-sft-bf16")),
"Glm4MoeMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.5"),
speculative_model=os.path.join(models_path_prefix,"THUDM/GLM-4.5"),
speculative_model=os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16"),
tokenizer=os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16")),
"Glm4MoeMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/GLM-4.5"),
speculative_model=os.path.join(models_path_prefix, "THUDM/GLM-4.5"),
min_transformers_version="4.54",
is_available_online=False),
"MiMoMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix,"XiaomiMiMo/MiMo-7B-RL"),
"MiMoMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
trust_remote_code=True,
speculative_model=os.path.join(models_path_prefix,"XiaomiMiMo/MiMo-7B-RL"))
}
_TRANSFORMERS_MODELS = {
"TransformersForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "ArthurZ/Ilama-3.2-1B"), trust_remote_code=True), # noqa: E501
"TransformersForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"), trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")),
}
_EXAMPLE_MODELS = {
**_TEXT_GENERATION_EXAMPLE_MODELS,
**_EMBEDDING_EXAMPLE_MODELS,
**_CROSS_ENCODER_EXAMPLE_MODELS,
**_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
**_MULTIMODAL_EXAMPLE_MODELS,
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
**_TRANSFORMERS_MODELS,
......@@ -511,4 +552,5 @@ class HfExampleModels:
raise ValueError(f"No example model defined for {model_id}")
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
\ No newline at end of file
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
AUTO_EXAMPLE_MODELS = HfExampleModels(_AUTOMATIC_CONVERTED_MODELS)
......@@ -12,20 +12,36 @@ from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
from vllm.v1.engine.core import EngineCore as V1EngineCore
from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
from ..utils import create_new_process_for_each_test
from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels
@create_new_process_for_each_test()
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
EXAMPLE_MODELS: HfExampleModels):
"""The reason for using create_new_process_for_each_test is to avoid
the WARNING:
"We must use the 'spawn' multiprocessing start method. Overriding
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'."
The spawn process causes the _initialize_kv_caches_v1 function below to
become ineffective.
"""
model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
# FIXME: Possible memory leak in the previous tests?
if model_arch in ("GraniteSpeechForConditionalGeneration",
if model_arch in ("Glm4vForConditionalGeneration",
"GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")
if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
from vllm.model_executor.models.llama4 import Llama4ForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
# Avoid OOM and reduce initialization time by only using 1 layer
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
hf_config.update(model_info.hf_overrides)
......@@ -33,13 +49,18 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config = hf_config.get_text_config()
# Ensure at least 2 expert per group
# Since `grouped_topk` assums top-2
# Since `grouped_topk` assumes top-2
n_group = getattr(text_config, 'n_group', None)
num_experts = n_group * 2 if n_group is not None else 2
# we use three layers for Gemma-3n to check
# both normal layer and kv_shared_layer
num_hidden_layers = (3 if model_arch
== "Gemma3nForConditionalGeneration" else 1)
text_config.update({
"num_layers": 1,
"num_hidden_layers": 1,
"num_hidden_layers": num_hidden_layers,
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
......@@ -47,6 +68,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
# For Gemma-3n
"num_kv_shared_layers": 1,
})
if hasattr(hf_config, "vision_config"):
......@@ -86,6 +109,9 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
_initialize_kv_caches_v1), monkeypatch.context() as m):
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
if model_arch == "Phi4FlashForCausalLM":
# Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
LLM(
model_info.default,
tokenizer=model_info.tokenizer,
......@@ -102,3 +128,15 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
load_format="dummy",
hf_overrides=hf_overrides,
)
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
@pytest.mark.parametrize("model_arch",
AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str,
monkeypatch: pytest.MonkeyPatch):
can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)
......@@ -80,11 +80,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
(os.path.join(models_path_prefix, "MLPSpeculatorPreTrainedModel"), False, False),
(os.path.join(models_path_prefix, "DeepseekV2ForCausalLM"), True, False),
(os.path.join(models_path_prefix, "Qwen2VLForConditionalGeneration"), True, True),
])
@pytest.mark.parametrize(
"model_arch,is_pp,init_cuda",
[
# TODO(woosuk): Re-enable this once the MLP Speculator is supported
# in V1.
# ("MLPSpeculatorPreTrainedModel", False, False),
(os.path.join(models_path_prefix, "DeepseekV2ForCausalLM"), True, False),
(os.path.join(models_path_prefix, "Qwen2VLForConditionalGeneration"), True, True),
])
def test_registry_is_pp(model_arch, is_pp, init_cuda):
assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
......
......@@ -57,8 +57,8 @@ def check_implementation(
@pytest.mark.parametrize(
"model,model_impl",
[
(os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"), "transformers"),
(os.path.join(models_path_prefix,"ArthurZ/Ilama-3.2-1B", "auto")), # CUSTOM CODE
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "transformers"),
(os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"), "auto"), # CUSTOM CODE
]) # trust_remote_code=True by default
def test_models(
hf_runner: type[HfRunner],
......@@ -105,7 +105,7 @@ def test_distributed(
reason="bitsandbytes quantization is currently not supported in rocm.")
@pytest.mark.parametrize("model, quantization_kwargs", [
(
os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
{
"quantization": "bitsandbytes",
},
......@@ -139,4 +139,39 @@ def test_quantization(
outputs_1_lst=vllm_outputs,
name_0="transformers",
name_1="vllm",
)
\ No newline at end of file
)
@pytest.mark.parametrize(
"model",
[os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")],
)
@pytest.mark.parametrize("dtype", ["float"])
def test_classify(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
import torch
from transformers import AutoModelForSequenceClassification
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
model_impl="transformers") as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment