Commit 99324e25 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.2' into v0.9.2-ori

parents cc7f22a8 a5dd03c1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
# Register CustomRotaryEmbedding to CustomOP.
@RotaryEmbedding.register_oot
class DummyRotaryEmbedding(RotaryEmbedding):
"""Original rotary positional embedding."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.addition_config = True
def forward_oot(self, *args,
**kwargs) -> tuple[torch.Tensor, torch.Tensor]:
return super().forward_oot(*args, **kwargs)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.interface import Platform, PlatformEnum
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
from vllm import envs
class DummyPlatform(CudaPlatform):
class DummyPlatform(Platform):
_enum = PlatformEnum.OOT
device_name = "DummyDevice"
device_type: str = "privateuseone"
dispatch_key: str = "PrivateUse1"
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
compilation_config = vllm_config.compilation_config
# Activate custom ops for v1.
compilation_config.custom_ops = ["all"]
def get_attn_backend_cls(self, backend_name, head_size, dtype,
kv_cache_dtype, block_size, use_v1, use_mla):
return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501
return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501
\ No newline at end of file
......@@ -5,6 +5,7 @@ import pytest
import torch
from vllm.attention.selector import get_attn_backend
from vllm.plugins import load_general_plugins
from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
......@@ -32,3 +33,16 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
backend = get_attn_backend(16, torch.float16, "auto", 16, False)
assert backend.get_name() == "Dummy_Backend"
def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
# simulate workload by running an example
load_general_plugins()
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
layer = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)
assert layer.__class__.__name__ == "DummyRotaryEmbedding", (
f"Expected DummyRotaryEmbedding, got {layer.__class__.__name__}, "
"possibly because the custom op is not registered correctly.")
assert hasattr(layer, "addition_config"), (
"Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
"which is set by the custom op.")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
import os
import traceback
from typing import Callable
import torch
from torch.multiprocessing import (
spawn) # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
P = ParamSpec("P")
@dataclasses.dataclass
class ProcessGroupInfo:
world_size: int
world_local_size: int
rank: int
node_rank: int
local_rank: int
device: torch.device
def _worker_parallel_launch(
local_rank: int,
world_size: int,
world_local_size: int,
node_rank: int,
init_method: str,
worker: Callable[Concatenate[ProcessGroupInfo, P], None],
*args: P.args,
**kwargs: P.kwargs,
) -> None:
rank = node_rank * world_local_size + local_rank
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
torch.distributed.init_process_group(
backend="cpu:gloo,cuda:nccl",
init_method=init_method,
rank=rank,
world_size=world_size,
device_id=device,
)
barrier = torch.tensor([rank], device=device)
torch.distributed.all_reduce(barrier)
try:
worker(
ProcessGroupInfo(
world_size=world_size,
world_local_size=world_local_size,
rank=rank,
node_rank=node_rank,
local_rank=local_rank,
device=device,
),
*args,
**kwargs,
)
except Exception as ex:
print(ex)
traceback.print_exc()
raise
finally:
torch.distributed.destroy_process_group()
def parallel_launch(
world_size: int,
worker: Callable[Concatenate[ProcessGroupInfo, P], None],
*args: P.args,
**kwargs: P.kwargs,
) -> None:
assert not kwargs
spawn(
_worker_parallel_launch,
args=(
world_size,
world_size,
0,
"tcp://localhost:29500",
worker,
) + args,
nprocs=world_size,
join=True,
)
def parallel_launch_from_env(
worker: Callable[Concatenate[ProcessGroupInfo, P], None],
*args: P.args,
**kwargs: P.kwargs,
) -> None:
"""
Launches a worker function in parallel across all processes in the current
environment. The environment must have the following variables set:
- WORLD_SIZE: The total number of processes.
- WORLD_LOCAL_SIZE: The number of processes on the current node.
- NODE_RANK: The rank of the current
- MASTER_ADDR: The address of the master process.
- MASTER_PORT: The port of the master process.
"""
assert not kwargs
world_size = int(os.environ["WORLD_SIZE"])
world_local_size = int(os.environ["WORLD_LOCAL_SIZE"])
node_rank = int(os.environ["NODE_RANK"])
assert "MASTER_ADDR" in os.environ
assert "MASTER_PORT" in os.environ
spawn(
_worker_parallel_launch,
args=(
world_size,
world_local_size,
node_rank,
"env://",
worker,
) + args,
nprocs=world_local_size,
join=True,
)
......@@ -159,8 +159,9 @@ def test_4bit_bnb_embedding_model(
with vllm_runner(model_name,
task="embed",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes") as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
vllm_outputs = vllm_model.embed(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
......
......@@ -17,7 +17,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
CompressedTensorsWNA16)
CompressedTensorsWNA16, cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
sparse_cutlass_supported)
from vllm.platforms import current_platform
......@@ -667,7 +667,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, scheme)
if isinstance(qkv_proj.scheme, scheme) or isinstance(
qkv_proj.scheme,
CompressedTensorsW4A16Fp4) and not cutlass_fp4_supported():
assert True
else:
raise AssertionError("FP4 Scheme Mismatch")
assert qkv_proj.scheme.group_size == 16
llm.apply_model(check_model)
......
......@@ -53,6 +53,7 @@ class CustomQuantConfig(QuantizationConfig):
def __init__(self, num_bits: int = 8) -> None:
"""Initialize the quantization config."""
super().__init__()
self.num_bits = num_bits
def get_name(self) -> QuantizationMethods:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright © 2025, Oracle and/or its affiliates.
"""Tests RTN quantization startup and generation,
doesn't test correctness
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
reason="RTN is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_rtn_startup(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
with vllm_runner(model, dtype=dtype, quantization="rtn") as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
......@@ -60,5 +60,20 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
print(output)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
torch._dynamo.reset()
model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao"
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0") as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
assert output
print(output)
if __name__ == "__main__":
pytest.main([__file__])
......@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
size=(batch_size, 1),
dtype=torch.int64)
# The target probaility distribution is a temperature zero distribution
# with zero entroy. Since our draft token ids don't match the probability
# with zero entropy. Since our draft token ids don't match the probability
# 1.0 tokens in the target distribution we will reject all of them and
# fallback to the greedy sampling for selecting 1 token for each sequence.
# Verify the same.
......
......@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, EAGLE would not break the
correctess for the target model outputs.
correctness for the target model outputs.
"""
import pytest
......@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override": 2 + 256 // 16,
"max_model_len": (2 + 256 // 16) * 16,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override": 2 + 256 // 16,
"max_model_len": (2 + 256 // 16) * 16,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......
......@@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m"
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
# Verify equality when cuda graphs allowed.
"enforce_eager": False,
"model_name": "JackFram/llama-68m",
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
@pytest.mark.parametrize(
......@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
......@@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test
"model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test.
"enforce_eager": True
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
......@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, Medusa would not break the
correctess for the target model outputs.
correctness for the target model outputs.
"""
import pytest
......
......@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
......@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, mtp would not break the
correctess for the target model outputs.
correctness for the target model outputs.
"""
import pytest
......
......@@ -57,6 +57,9 @@ from .conftest import (get_output_from_llm_generator,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -139,6 +142,9 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Print spec metrics.
"disable_log_stats": False,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -216,6 +222,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Print spec metrics.
"disable_log_stats": False,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -279,6 +288,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -464,6 +476,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
......@@ -523,6 +537,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
......@@ -589,6 +605,8 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -655,6 +673,8 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -706,6 +726,8 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -763,6 +785,8 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
......@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under preemption
* Test greedy equality under various ngram sizes / speculative sizes
With those tests, we can say at least, ngram spec would not break the correctess
for the target model outputs.
With those tests, we can say at least, ngram spec would not break the
correctness for the target model outputs.
"""
import pytest
......@@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test
# Print spec metrics.
"disable_log_stats": False,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
......@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Print spec metrics.
"disable_log_stats": False,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
......@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
......@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
#!/bin/sh
# This script tests if the nightly torch packages are not overridden by the dependencies
set -e
set -x
cd /vllm-workspace/
rm -rf .venv
uv venv .venv
source .venv/bin/activate
# check the environment
uv pip freeze
echo ">>> Installing nightly torch packages"
uv pip install --quiet torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu128
echo ">>> Capturing torch-related versions before requirements install"
uv pip freeze | grep -E '^torch|^torchvision|^torchaudio' | sort > before.txt
echo "Before:"
cat before.txt
echo ">>> Installing requirements/nightly_torch_test.txt"
uv pip install --quiet -r requirements/nightly_torch_test.txt
echo ">>> Capturing torch-related versions after requirements install"
uv pip freeze | grep -E '^torch|^torchvision|^torchaudio' | sort > after.txt
echo "After:"
cat after.txt
echo ">>> Comparing versions"
if diff before.txt after.txt; then
echo "torch version not overridden."
else
echo "torch version overridden by nightly_torch_test.txt, \
if the dependency is not triggered by the pytroch nightly test,\
please add the dependency to the list 'white_list' in tools/generate_nightly_torch_test.py"
exit 1
fi
......@@ -2,49 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import MISSING, Field, asdict, dataclass, field
from typing import Literal, Union
import pytest
from vllm.compilation.backends import VllmBackend
from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
config, get_field)
get_field)
from vllm.model_executor.layers.pooler import PoolingType
from vllm.platforms import current_platform
class TestConfig1:
pass
@dataclass
class TestConfig2:
a: int
"""docstring"""
@dataclass
class TestConfig3:
a: int = 1
@dataclass
class TestConfig4:
a: Union[Literal[1], Literal[2]] = 1
"""docstring"""
@pytest.mark.parametrize(("test_config", "expected_error"), [
(TestConfig1, "must be a dataclass"),
(TestConfig2, "must have a default"),
(TestConfig3, "must have a docstring"),
(TestConfig4, "must use a single Literal"),
])
def test_config(test_config, expected_error):
with pytest.raises(Exception, match=expected_error):
config(test_config)
def test_compile_config_repr_succeeds():
# setup: VllmBackend mutates the config object
config = VllmConfig()
......@@ -57,23 +24,23 @@ def test_compile_config_repr_succeeds():
assert 'inductor_passes' in val
def test_get_field():
@dataclass
class _TestConfigFields:
a: int
b: dict = field(default_factory=dict)
c: str = "default"
@dataclass
class TestConfig:
a: int
b: dict = field(default_factory=dict)
c: str = "default"
def test_get_field():
with pytest.raises(ValueError):
get_field(TestConfig, "a")
get_field(_TestConfigFields, "a")
b = get_field(TestConfig, "b")
b = get_field(_TestConfigFields, "b")
assert isinstance(b, Field)
assert b.default is MISSING
assert b.default_factory is dict
c = get_field(TestConfig, "c")
c = get_field(_TestConfigFields, "c")
assert isinstance(c, Field)
assert c.default == "default"
assert c.default_factory is MISSING
......@@ -85,7 +52,7 @@ def test_get_field():
("distilbert/distilgpt2", "generate", "generate"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
("openai/whisper-small", "transcription", "transcription"),
],
......@@ -105,6 +72,32 @@ def test_auto_task(model_id, expected_runner_type, expected_task):
assert config.task == expected_task
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
("openai/whisper-small", "pooling", "embed"),
],
)
def test_score_task(model_id, expected_runner_type, expected_task):
config = ModelConfig(
model_id,
task="score",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
assert config.runner_type == expected_runner_type
assert config.task == expected_task
@pytest.mark.parametrize(("model_id", "bad_task"), [
("Qwen/Qwen2.5-Math-RM-72B", "generate"),
])
......@@ -438,3 +431,33 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
config = VllmConfig(load_config=load_config)
assert config.load_config.pt_load_map_location == pt_load_map_location
@pytest.mark.parametrize(
("model_id", "max_model_len", "expected_max_len", "should_raise"), [
("BAAI/bge-reranker-base", None, 512, False),
("BAAI/bge-reranker-base", 256, 256, False),
("BAAI/bge-reranker-base", 513, 512, True),
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", None, 131072, False),
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
])
def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
should_raise):
"""Test get_and_verify_max_len with different configurations."""
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
if should_raise:
with pytest.raises(ValueError):
model_config.get_and_verify_max_len(max_model_len)
else:
actual_max_len = model_config.get_and_verify_max_len(max_model_len)
assert actual_max_len == expected_max_len
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment