Commit 38d80967 authored by zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
......@@ -11,8 +11,8 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
native_w8a8_block_matmul)
from vllm.config import VllmConfig
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor, per_token_group_quant_fp8,
w8a8_block_fp8_matmul)
cutlass_scaled_mm, get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
......@@ -98,6 +98,54 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001
@torch.inference_mode()
def test_w8a8_block_fp8_cutlass_matmul():
    """Compare the CUTLASS block-scaled FP8 matmul with the native reference.

    The weight has N = 576, which is not a multiple of the 128-wide block,
    like kv_a_proj_with_mqa in DSV3.
    """
    num_tokens, out_features, in_features = 32, 576, 7168
    block_size = [128, 128]
    block_n, block_k = block_size
    out_dtype = torch.bfloat16
    scale_factor = 1e-2

    torch.manual_seed(0)
    fp8_dtype = torch.float8_e4m3fn
    fp8_limits = torch.finfo(fp8_dtype)
    fp8_min, fp8_max = fp8_limits.min, fp8_limits.max

    # Random draws happen in a fixed order (activations, weights, weight
    # scales) so the seeded inputs stay reproducible.
    act_fp32 = (torch.rand(num_tokens, in_features, dtype=torch.float32) -
                0.5) * 2 * fp8_max
    weight_fp32 = (torch.rand(out_features, in_features, dtype=torch.float32)
                   - 0.5) * 2 * fp8_max
    weight_fp8 = weight_fp32.clamp(min=fp8_min, max=fp8_max).to(fp8_dtype)

    # Ceiling division: a partially filled block still needs its own scale.
    n_tiles = -(-out_features // block_n)
    k_tiles = -(-in_features // block_k)
    weight_scales = torch.rand(n_tiles, k_tiles,
                               dtype=torch.float32) * scale_factor

    # Hopper requires row-major format for scales
    if current_platform.is_device_capability(90):
        weight_scales_cutlass = weight_scales.T.contiguous()
    else:
        weight_scales_cutlass = weight_scales

    # Reference path quantizes activations with row-major scales.
    act_fp8, act_scales = per_token_group_quant_fp8(
        act_fp32, block_k, column_major_scales=False)
    # CUTLASS uses column-major format for scales
    act_fp8_cutlass, act_scales_cutlass = per_token_group_quant_fp8(
        act_fp32, block_k, column_major_scales=True)

    expected = native_w8a8_block_matmul(act_fp8, weight_fp8, act_scales,
                                        weight_scales, block_size, out_dtype)
    actual = cutlass_scaled_mm(act_fp8_cutlass, weight_fp8,
                               act_scales_cutlass, weight_scales_cutlass,
                               block_size, out_dtype)

    expected_f32 = expected.to(torch.float32)
    mean_abs_error = torch.mean(
        torch.abs(actual.to(torch.float32) - expected_f32))
    rel_diff = mean_abs_error / torch.mean(torch.abs(expected_f32))
    assert rel_diff < 0.001
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
......
......@@ -111,6 +111,49 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
def onednn_gemm_test_helper(primitive_cache_size: int,
                            m: int,
                            n: int,
                            k: int,
                            use_bias: bool,
                            use_stride: bool,
                            dtype: torch.dtype = torch.bfloat16,
                            device: str = "cpu"):
    """Check ``ops.onednn_mm`` against ``torch.nn.functional.linear``.

    Args:
        primitive_cache_size: Forwarded to ``ops.create_onednn_mm``.
        m: Number of rows of the activation matrix.
        n: Number of rows of the weight matrix (output features).
        k: Shared inner dimension.
        use_bias: Also pass a random bias, then re-run the same handler
            without a bias.
        use_stride: Build the activation as a column slice of a wider
            tensor, so it is a strided view.
        dtype: Element type of activation, weight and bias.
        device: Device on which the tensors are created.
    """
    # Allocate twice the width when a strided input is requested, then keep
    # only the first k columns.
    a_width = 2 * k if use_stride else k
    a = torch.rand((m, a_width), dtype=dtype, device=device) * 1.5
    if use_stride:
        a = a[:, :k]

    b = torch.rand((n, k), dtype=dtype, device=device) * 1.5

    bias = torch.rand((n, ), device=device, dtype=dtype) * 5 \
        if use_bias else None
    bias_f32 = bias.float() if use_bias else None

    handler = ops.create_onednn_mm(
        b.t(),
        primitive_cache_size,
    )

    def _compare(runtime_bias, reference_bias):
        # The float32 reference is cast back to the input dtype before
        # comparing.
        out = ops.onednn_mm(handler, a, runtime_bias)
        baseline = torch.nn.functional.linear(
            a.float(), b.float(), reference_bias).to(dtype=a.dtype)
        torch.testing.assert_close(out, baseline)

    _compare(bias, bias_f32)

    if use_bias:
        # To test runtime bias setting
        _compare(None, None)
@pytest.mark.parametrize("n,k", NK_FACTORS)
@pytest.mark.parametrize("m_list", M_FACTORS)
@pytest.mark.parametrize("per_tensor_a_scale", [True, False])
......@@ -142,3 +185,30 @@ def test_onednn_int8_scaled_gemm(
use_azp=use_azp,
out_dtype=output_type,
)
@pytest.mark.parametrize("n,k", NK_FACTORS)
@pytest.mark.parametrize("m_list", M_FACTORS)
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("use_stride", [True, False])
@pytest.mark.parametrize("dtype", DTYPE)
@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
def test_onednn_gemm(
    n: int,
    k: int,
    m_list: tuple[int],
    use_bias: bool,
    use_stride: bool,
    dtype: torch.dtype,
    primitive_cache_size: int,
):
    """Run the oneDNN GEMM helper once for every M in ``m_list``."""
    # Everything except the row count is shared across the M values.
    shared_kwargs = dict(
        primitive_cache_size=primitive_cache_size,
        n=n,
        k=k,
        use_bias=use_bias,
        use_stride=use_stride,
        dtype=dtype,
    )
    for num_rows in m_list:
        onednn_gemm_test_helper(m=num_rows, **shared_kwargs)
......@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
# We treat N-dimensional group scaling as extended numpy-style broadcasting
# in numpy simply stretches dimensions with an extent of 1 to match the
# in numpy simply stretches dimensions with an extent of 1 to match
# the target shape by repeating the data along that dimension (broadcasting)
# , we extend these semantics to say if the extent of a dimension in the
# source shape is not 1 and does not match the target shape we repeat each
......@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
# then we would expand a to:
# a = [[1, 1, 2, 2],
# [3, 3, 4, 4]]
# NOTE this function this function does not explicitly broadcast dimensions
# NOTE this function does not explicitly broadcast dimensions
# with an extent of 1, since this can be done implicitly by pytorch
def group_broadcast(t, shape):
for i, s in enumerate(shape):
......
......@@ -128,7 +128,7 @@ if __name__ == "__main__":
print(f"initialized! My rank is {my_rank}")
config = KVTransferConfig(
kv_connector='PyNcclConnector',
kv_connector='P2pNcclConnector',
kv_buffer_device='cuda',
kv_buffer_size=1e9,
kv_rank=my_rank,
......
......@@ -137,7 +137,7 @@ if __name__ == "__main__":
)
config = KVTransferConfig(
kv_connector='PyNcclConnector',
kv_connector='P2pNcclConnector',
kv_buffer_device='cuda',
kv_buffer_size=1e9,
kv_rank=my_rank,
......
......@@ -59,10 +59,10 @@ async def requests_processing_time(llm,
@pytest.mark.asyncio
async def test_add_lora(chatglm3_lora_files):
"""
The add_lora function is used to pre-load some LoRA adapters into the
The add_lora function is used to preload some LoRA adapters into the
engine in anticipation of future requests using these adapters. To test
this functionality, we use the async engine to process some requests - We
do it twice, once with add_lora() pre-loading and once without.
do it twice, once with add_lora() preloading and once without.
We measure the request processing time in both cases and expect the time
to be lesser in the case with add_lora() calls.
......
......@@ -11,21 +11,21 @@ import pytest
import torch
import torch.nn.functional as F
from vllm.config import LoRAConfig
from vllm.lora.fully_sharded_layers import (
ColumnParallelLinearWithShardedLoRA,
MergedColumnParallelLinearWithShardedLoRA,
MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA,
RowParallelLinearWithShardedLoRA)
from vllm.config.lora import LoRAConfig
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
ColumnParallelLinearWithShardedLoRA,
LogitsProcessorWithLoRA, LoRAMapping,
MergedColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithShardedLoRA,
MergedQKVParallelLinearWithLoRA,
MergedQKVParallelLinearWithShardedLoRA,
QKVParallelLinearWithLoRA,
QKVParallelLinearWithShardedLoRA,
ReplicatedLinearWithLoRA,
RowParallelLinearWithLoRA,
RowParallelLinearWithShardedLoRA,
VocabParallelEmbeddingWithLoRA)
# yapf: enable
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
......@@ -60,9 +60,9 @@ DEVICES = ([
# prefill stage(True) or decode stage(False)
STAGES = [True, False]
NUM_RANDOM_SEEDS = 6
NUM_RANDOM_SEEDS = 2
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 2
@pytest.fixture(autouse=True)
......
......@@ -3,8 +3,8 @@
import pytest
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
VllmConfig)
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
......@@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
adapters that define additional tokens.
"""
# Setup a base model compatible with the sql_lora_files adapter and
# Set up a base model compatible with the sql_lora_files adapter and
# a known number of tokens in the base model.
model_config = ModelConfig(
model=llama_2_7b_base_huggingface_id,
......@@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
adapters that do not define additional tokens.
"""
# Setup a base model compatible with the qwen25vl_lora_files adapter and
# Set up a base model compatible with the qwen25vl_lora_files adapter and
# a known number of tokens in the base model.
model_config = ModelConfig(
model=qwen25vl_base_huggingface_id,
......
......@@ -8,7 +8,7 @@ import torch
from safetensors.torch import load_file
from torch import nn
from vllm.config import LoRAConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA)
......
......@@ -7,7 +7,7 @@ import shutil
import pytest
from vllm.config import LoRAConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.peft_helper import PEFTHelper
ERROR_CASES = [
......
......@@ -6,9 +6,10 @@ import random
import tempfile
from unittest.mock import patch
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ParallelConfig, SchedulerConfig,
VllmConfig)
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker
......
......@@ -4,7 +4,8 @@
import pytest
from torch import nn
from vllm.config import LoadConfig, ModelConfig
from vllm.config import ModelConfig
from vllm.config.load import LoadConfig
from vllm.model_executor.model_loader import (get_model_loader,
register_model_loader)
from vllm.model_executor.model_loader.base_loader import BaseModelLoader
......
......@@ -13,13 +13,15 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
vllm_topk_softmax)
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
is_rocm_aiter_moe_enabled)
from vllm.model_executor.layers.layernorm import (
RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
from vllm.model_executor.layers.layernorm import (RMSNorm,
dispatch_rocm_rmsnorm_func,
fused_add_rms_norm, rms_norm)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
# Registered subclass for test
@CustomOp.register("relu3")
......@@ -149,24 +151,27 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
@pytest.mark.parametrize("add_residual", [True, False])
@pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
@pytest.mark.skipif(not current_platform.is_rocm(),
reason="AITER is a feature exclusive for ROCm")
def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
use_rocm_aiter_norm: str, monkeypatch):
def test_rms_norm_dispatch(add_residual: bool, dtype: torch.dtype,
use_rocm_aiter: str, use_rocm_aiter_norm: str,
monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)
if not add_residual:
if current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_norm):
assert rms_norm_func == rocm_aiter_rms_norm
else:
assert rms_norm_func == rms_norm
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_norm):
assert rms_norm_func == rocm_aiter_fused_add_rms_norm
else:
rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype)
should_use_rocm_aiter = current_platform.is_rocm() and int(use_rocm_aiter) \
and int(use_rocm_aiter_norm) and dtype in RMS_NORM_SUPPORTED_DTYPES
if add_residual and should_use_rocm_aiter:
assert rms_norm_func == torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
elif should_use_rocm_aiter:
assert rms_norm_func == torch.ops.vllm.rocm_aiter_rms_norm
elif add_residual:
assert rms_norm_func == fused_add_rms_norm
else:
assert rms_norm_func == rms_norm
......@@ -178,6 +178,7 @@ def run_test(
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.skip(reason="bart not supported in V1")
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
......@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
@pytest.mark.skip(reason="bart not supported in V1")
def test_models_distributed(hf_runner, vllm_runner,
example_encoder_decoder_prompts,
distributed_executor_backend, model, dtype,
......
......@@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
# These have unsupported head_dim for FA. We do not
# not have a clean way to fall back, so we fail with
# have a clean way to fall back, so we fail with
# a clear msg when it happens.
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
......@@ -93,7 +93,7 @@ AITER_MODEL_LIST = [
"allenai/OLMoE-1B-7B-0924-Instruct",
marks=[pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B"), # apertus
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
......
......@@ -25,8 +25,7 @@ SSM_MODELS = [
HYBRID_MODELS = [
"ai21labs/Jamba-tiny-dev",
# skipping until vLLM implementation issues are resolved
# "pfnet/plamo-2-1b",
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
"hmellor/tiny-random-BambaForCausalLM",
"ibm-granite/granite-4.0-tiny-preview",
......@@ -34,20 +33,10 @@ HYBRID_MODELS = [
"LiquidAI/LFM2-1.2B",
]
HF_UNSUPPORTED_MODELS = [
# The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
# doesn't compare vLLM output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"yujiepan/mamba2-codestral-v0.1-tiny-random",
# transformers 4.55 is still producing garbage for this model
# TODO(tdoublep): follow-up on transformers side
"ibm-granite/granite-4.0-tiny-preview"
]
V1_SUPPORTED_MODELS = [
"state-spaces/mamba-130m-hf",
"ai21labs/Jamba-tiny-dev",
"pfnet/plamo-2-1b",
"yujiepan/mamba2-codestral-v0.1-tiny-random",
"Zyphra/Zamba2-1.2B-instruct",
"hmellor/tiny-random-BambaForCausalLM",
......@@ -58,6 +47,7 @@ V1_SUPPORTED_MODELS = [
FULL_CUDA_GRAPH_MODELS = [
"ai21labs/Jamba-tiny-dev",
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
]
......@@ -65,6 +55,11 @@ V0_UNSUPPORTED_MODELS = [
"LiquidAI/LFM2-1.2B",
]
# Models used to parametrize test_fp32_state.
FP32_STATE_MODELS = [
    "state-spaces/mamba-130m-hf",
    "Zyphra/Zamba2-1.2B-instruct",
]
# Avoid OOM
MAX_NUM_SEQS = 4
......@@ -85,20 +80,13 @@ def test_models(
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
hf_version_check = model_info.check_transformers_version(
on_fail="return")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
hf_version_check = None
if hf_version_check is not None:
print(f"Skipping transformers comparison because: {hf_version_check}")
pass
with hf_runner(model) as hf_model:
if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
else:
hf_outputs = None
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
......@@ -116,7 +104,7 @@ def test_models(
else:
vllm_v1_outputs = None
if hf_outputs is not None and vllm_v0_outputs is not None:
if vllm_v0_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
......@@ -125,12 +113,10 @@ def test_models(
)
if model in V1_SUPPORTED_MODELS:
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
assert ref_outputs is not None
check_logprobs_close(
outputs_0_lst=ref_outputs,
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v1_outputs,
name_0="hf" if hf_outputs is not None else "vllm-v0",
name_0="hf",
name_1="vllm-v1",
)
......@@ -315,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
finished_requests_ids is larger than the maximum mamba block capacity.
This could generally happen due to the fact that hybrid does support
statelessness mechanism where it can cleanup new incoming requests in
statelessness mechanism where it can clean up new incoming requests in
a single step.
"""
try:
......@@ -336,7 +322,7 @@ def test_state_cleanup(
This test is for verifying that the Hybrid state is cleaned up between
steps.
If its not cleaned, an error would be expected.
If it's not cleaned, an error would be expected.
"""
try:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
......@@ -397,11 +383,8 @@ def test_full_cuda_graph(
pass
with hf_runner(model) as hf_model:
if model not in HF_UNSUPPORTED_MODELS:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
else:
hf_outputs = None
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
......@@ -416,7 +399,7 @@ def test_full_cuda_graph(
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if hf_outputs is not None and vllm_v0_outputs is not None:
if vllm_v0_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
......@@ -424,17 +407,15 @@ def test_full_cuda_graph(
name_1="vllm-v0",
)
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
assert ref_outputs is not None
check_logprobs_close(
outputs_0_lst=ref_outputs,
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v1_outputs,
name_0="hf" if hf_outputs is not None else "vllm-v0",
name_0="hf",
name_1="vllm-v1",
)
@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_fp32_state(
......@@ -455,11 +436,8 @@ def test_fp32_state(
pass
with hf_runner(model) as hf_model:
if model not in HF_UNSUPPORTED_MODELS:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
else:
hf_outputs = None
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
......@@ -475,18 +453,16 @@ def test_fp32_state(
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if hf_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
name_0="hf",
name_1="vllm-v0",
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
name_0="hf",
name_1="vllm-v0",
)
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
check_logprobs_close(
outputs_0_lst=ref_outputs,
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v1_outputs,
name_0="hf" if hf_outputs is not None else "vllm-v0",
name_0="hf",
name_1="vllm-v1",
)
......@@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3",
# uses the v3-Tekken tokenizer
"mistralai/Ministral-8B-Instruct-2410",
# Mistral-Nemo is to big for CI, but passes locally
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]
......@@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
def test_mistral_function_call_nested_json():
"""Ensure that the function-name regex captures the entire outer-most
"""Ensure that the function-name regex captures the entire outermost
JSON block, including nested braces."""
# Create a minimal stub tokenizer that provides the few attributes the
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/docs/transformers/perplexity
from typing import Optional, cast
import pytest
import torch
from datasets import load_dataset
from tests.models.utils import (GenerateModelInfo,
TokensTextLogprobsPromptLogprobs)
from vllm.logprobs import Logprob
# See #24485
# Default value of the `atol` parameter of wikitext_ppl_test.
PPL_TOL = 0.01
# Default value of the `max_length` parameter of wikitext_ppl_test.
MAX_LENGTH = 1024
@torch.inference_mode
def wikitext_ppl_test(hf_runner,
                      vllm_runner,
                      model_info: GenerateModelInfo,
                      max_length=MAX_LENGTH,
                      vllm_extra_kwargs=None,
                      atol=PPL_TOL):
    """Check that vLLM's WikiText-2 perplexity is not worse than Transformers'.

    The "wikitext-2-raw-v1" test split is tokenized with the vLLM tokenizer
    and cut into non-overlapping chunks. The vLLM perplexity is computed from
    the prompt logprobs of those chunks and compared with either a freshly
    computed Transformers perplexity or the constant ``model_info.hf_ppl``.

    Args:
        hf_runner: Factory used as ``with hf_runner(name, dtype=...)`` to
            obtain the Transformers model.
        vllm_runner: Factory used as ``with vllm_runner(name, ...)`` to
            obtain the vLLM model.
        model_info: Supplies the model name, dtypes, and the optional
            architecture, ``hf_overrides`` and ``hf_ppl`` values.
        max_length: Upper bound on the chunk length; it is additionally
            clamped to ``max_model_len - 1`` of the loaded model.
        vllm_extra_kwargs: Extra keyword arguments forwarded to
            ``vllm_runner``. The mapping is copied, never modified.
        atol: Largest accepted value of ``(vllm_ppl - hf_ppl) / hf_ppl``.
            Despite the name this is a relative, one-sided bound.
    """
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    # Work on a copy: writing "dtype"/"hf_overrides" into the caller's dict
    # would leak one model's settings into later calls sharing that dict.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs["dtype"] = model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     gpu_memory_utilization=0.7,
                     max_model_len=max_length,
                     max_num_seqs=1,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:
        # max_num_seqs=1 is used to avoid OOM.

        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)

        max_length = min(model_config.max_model_len - 1, max_length)
        # Stride equals the chunk length, so chunks do not overlap.
        stride = max_length

        tokenizer = vllm_model.llm.get_tokenizer()
        tokens = tokenizer.encode("\n\n".join(dataset["text"]))
        n_tokens = len(tokens)

        chunks = [
            tokens[begin_loc:min(begin_loc + max_length, n_tokens)]
            for begin_loc in range(0, n_tokens, stride)
        ]

        outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
                                                      max_tokens=1,
                                                      num_logprobs=None,
                                                      num_prompt_logprobs=0,
                                                      use_tqdm=False)
        nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
        n_tokens = 0
        for output in outputs:
            output = cast(TokensTextLogprobsPromptLogprobs, output)
            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])

            # The first prompt token has nothing to condition on.
            assert token_datas[0] is None
            token_log_probs = []
            for token_data in token_datas[1:]:
                assert token_data is not None
                # With num_prompt_logprobs=0 exactly one entry is expected.
                assert len(token_data) == 1
                token_log_probs.append(
                    next(iter(token_data.values())).logprob)

            neg_log_likelihood = -torch.tensor(
                token_log_probs, dtype=torch.float32, device="cpu").sum()
            nll_sum += neg_log_likelihood
            n_tokens += len(token_log_probs)
        vllm_ppl = float(torch.exp(nll_sum / n_tokens))
        vllm_dtype = model_config.dtype

    # Accelerate ppl test by setting Transformers ppl score to a constant
    if model_info.hf_ppl is None:
        with hf_runner(
                model_info.name,
                dtype=model_info.hf_dtype,
        ) as hf_model:
            nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
            n_tokens = 0
            for chunk in chunks:
                num_loss_tokens = len(chunk) - 1
                # A one-token chunk has no next-token target. Skip it so an
                # undefined (NaN) mean loss cannot poison the running sum,
                # which multiplying by zero would not prevent.
                if num_loss_tokens <= 0:
                    continue

                inputs = hf_model.wrap_device(
                    {"input_ids": torch.tensor([chunk])})
                input_ids = inputs["input_ids"]
                outputs = hf_model.model(input_ids, labels=input_ids)
                neg_log_likelihood = outputs.loss

                neg_log_likelihood = neg_log_likelihood.to(torch.float32).cpu()

                # The loss is a per-token mean; weight it back into a sum.
                nll_sum += neg_log_likelihood * num_loss_tokens
                n_tokens += num_loss_tokens

            hf_ppl = float(torch.exp(nll_sum / n_tokens))
            hf_dtype = next(hf_model.model.parameters()).dtype
    else:
        hf_ppl = model_info.hf_ppl
        hf_dtype = "Constant"

    differ = (vllm_ppl - hf_ppl) / hf_ppl
    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_ppl)
    print("Transformers:", hf_dtype, hf_ppl)
    print("Difference (%):", differ * 100)

    # Lower perplexity is better.
    # We are not concerned that the vllm PPL is less than Transformers,
    # so we only perform one-sided testing.
    assert differ < atol
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import GenerateModelInfo
from .ppl_utils import wikitext_ppl_test
# Model entries used to parametrize test_ppl.
MODELS = [
    GenerateModelInfo("google/gemma-2b"),
    GenerateModelInfo("google/gemma-2-2b"),
    GenerateModelInfo("google/gemma-3-4b-it"),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run the WikiText perplexity comparison for one entry of MODELS."""
    wikitext_ppl_test(hf_runner=hf_runner,
                      vllm_runner=vllm_runner,
                      model_info=model_info)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment