Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.utils import (
+    run_tool_extraction, run_tool_extraction_streaming)
+from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+
+# Test cases similar to pythonic parser but with Llama4 specific format.
+# Each *_OUTPUT constant is model text fed to the parser; the matching
+# *_CALL constant is the FunctionCall the tests compare the result against.
+SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
+SIMPLE_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "LA", "metric": "C"}',
+)
+# Python literals in the call text (None, True, dict, list) paired with
+# their JSON spellings (null, true, object, array) in the expected arguments.
+MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', "
+                              "age=9, "
+                              "address={'city': 'LA', 'state': 'CA'}, "
+                              "role=None, "
+                              "passed_test=True, "
+                              "aliases=['John', 'Johnny'])]")
+MORE_TYPES_FUNCTION_CALL = FunctionCall(
+    name="register_user",
+    arguments='{"name": "Doe", '
+    '"age": 9, '
+    '"address": {"city": "LA", "state": "CA"}, '
+    '"role": null, '
+    '"passed_test": true, '
+    '"aliases": ["John", "Johnny"]}',
+)
+# A call with no arguments maps to an empty JSON object.
+PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
+PARAMETERLESS_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{}',
+)
+# Empty container values.
+EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
+EMPTY_DICT_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"additional_data": {}}',
+)
+EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]"
+EMPTY_LIST_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"steps": []}',
+)
+# Raw string: the backslashes below are literally part of the model text.
+ESCAPED_STRING_FUNCTION_OUTPUT = (
+    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]")
+ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
+)
+# The same call text as SIMPLE_FUNCTION_OUTPUT, wrapped in
+# <|python_start|> / <|python_end|> tags.
+PYTHON_TAG_FUNCTION_OUTPUT = (
+    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>")
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_no_tool_call(streaming: bool):
+    """Text without a call is returned as content with no tool calls."""
+    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
+    tool_parser: ToolParser = parser_cls(MagicMock())
+    plain_text = "How can I help you today?"
+
+    content, tool_calls = run_tool_extraction(tool_parser,
+                                              plain_text,
+                                              streaming=streaming)
+
+    assert content == plain_text
+    assert len(tool_calls) == 0
+
+
+# Two tagged parallel calls with no closing <|python_end|> tag.
+test_str = ("<|python_start|>"
+            "[get_weather(city='LA', metric='C'),"
+            "register_user(name='Doe', age=9)]")
+
+# Each entry is (streaming, model_output, expected_tool_calls); every
+# scenario appears once per mode, and every id is unique.
+TEST_CASES = [
+    # The plain single-call case; the escaped-string input has its own
+    # "escaped_string_*" entries further down.
+    pytest.param(True,
+                 SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="simple_streaming"),
+    pytest.param(False,
+                 SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="simple_nonstreaming"),
+    pytest.param(True,
+                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
+                 id="more_types_streaming"),
+    pytest.param(False,
+                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
+                 id="more_types_nonstreaming"),
+    pytest.param(True,
+                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
+                 id="parameterless_streaming"),
+    pytest.param(False,
+                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
+                 id="parameterless_nonstreaming"),
+    pytest.param(True,
+                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
+                 id="empty_dict_streaming"),
+    pytest.param(False,
+                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
+                 id="empty_dict_nonstreaming"),
+    pytest.param(True,
+                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
+                 id="empty_list_streaming"),
+    pytest.param(False,
+                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
+                 id="empty_list_nonstreaming"),
+    pytest.param(True,
+                 ESCAPED_STRING_FUNCTION_OUTPUT,
+                 [ESCAPED_STRING_FUNCTION_CALL],
+                 id="escaped_string_streaming"),
+    pytest.param(False,
+                 ESCAPED_STRING_FUNCTION_OUTPUT,
+                 [ESCAPED_STRING_FUNCTION_CALL],
+                 id="escaped_string_nonstreaming"),
+    pytest.param(
+        True,
+        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
+        [
+            SIMPLE_FUNCTION_CALL,
+            FunctionCall(name="register_user",
+                         arguments='{"name": "Doe", "age": 9}')
+        ],
+        id="parallel_calls_streaming"),
+    pytest.param(
+        False,
+        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
+        [
+            SIMPLE_FUNCTION_CALL,
+            FunctionCall(name="register_user",
+                         arguments='{"name": "Doe", "age": 9}')
+        ],
+        id="parallel_calls_nonstreaming"),
+    pytest.param(True,
+                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="python_tag_streaming"),
+    pytest.param(False,
+                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="python_tag_nonstreaming"),
+    # Tagged variants of the parallel-call cases; the ids carry a
+    # "python_tag_" prefix so they do not collide with the untagged ones.
+    pytest.param(True,
+                 test_str, [
+                     SIMPLE_FUNCTION_CALL,
+                     FunctionCall(name="register_user",
+                                  arguments='{"name": "Doe", "age": 9}')
+                 ],
+                 id="python_tag_parallel_calls_streaming"),
+    pytest.param(False,
+                 "<|python_start|>[get_weather(city='LA', metric='C'), " +
+                 "register_user(name='Doe', age=9)]", [
+                     SIMPLE_FUNCTION_CALL,
+                     FunctionCall(name="register_user",
+                                  arguments='{"name": "Doe", "age": 9}')
+                 ],
+                 id="python_tag_parallel_calls_nonstreaming"),
+]
+
+
+@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
+                         TEST_CASES)
+def test_tool_call(streaming: bool, model_output: str,
+                   expected_tool_calls: list[FunctionCall]):
+    """Check that the parser yields exactly the expected function calls."""
+    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
+    tool_parser: ToolParser = parser_cls(MagicMock())
+
+    # Only the extracted calls are checked here; the text content is unused.
+    _, tool_calls = run_tool_extraction(tool_parser,
+                                        model_output,
+                                        streaming=streaming)
+
+    assert len(tool_calls) == len(expected_tool_calls)
+    for parsed_call, wanted in zip(tool_calls, expected_tool_calls):
+        assert parsed_call.type == "function"
+        assert parsed_call.function == wanted
+
+
+def test_streaming_tool_call_with_large_steps():
+    """Stream three tagged calls to the parser as one single delta."""
+    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
+    tool_parser: ToolParser = parser_cls(MagicMock())
+
+    # Adjacent string literals concatenate, so this is one delta that
+    # carries the complete output.
+    whole_output = ("<|python_start|>[get_weather(city='LA', metric='C'), "
+                    "get_weather(), "
+                    "do_something_cool(steps=[])]<|python_end|>")
+
+    reconstructor = run_tool_extraction_streaming(
+        tool_parser, [whole_output], assert_one_tool_per_delta=False)
+
+    assert reconstructor.other_content == ""
+    assert len(reconstructor.tool_calls) == 3
+    wanted_calls = [
+        SIMPLE_FUNCTION_CALL,
+        PARAMETERLESS_FUNCTION_CALL,
+        EMPTY_LIST_FUNCTION_CALL,
+    ]
+    for position, wanted in enumerate(wanted_calls):
+        assert reconstructor.tool_calls[position].function == wanted
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -148,6 +148,11 @@ def test_paged_attention(
            or (version == "rocm" and head_size not in (64, 128))):
        pytest.skip()

+    if (version == "rocm" and current_platform.is_navi()
+            and (kv_cache_dtype == "fp8" or head_size != 128
+                 or block_size != 16 or use_alibi)):
+        pytest.skip()
+
    global PARTITION_SIZE

    current_platform.seed_everything(seed)
@@ -275,6 +280,7 @@ def test_paged_attention(
                scale,
                block_tables,
                seq_lens,
+                None,
                block_size,
                max_seq_len,
                alibi_slopes,
@@ -286,7 +292,7 @@ def test_paged_attention(
            opcheck(torch.ops._rocm_C.paged_attention,
                    (output, exp_sums, max_logits, tmp_output, query,
                     key_cache, value_cache, num_kv_heads, scale, block_tables,
-                     seq_lens, block_size, max_seq_len, alibi_slopes,
+                     seq_lens, None, block_size, max_seq_len, alibi_slopes,
                     kv_cache_dtype, k_scale, v_scale),
                    cond=(head_size == HEAD_SIZES[0]
                          and block_size == BLOCK_SIZES[0]))

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -575,3 +575,21 @@ def test_moe_align_block_size_opcheck():
    opcheck(torch.ops._moe_C.moe_align_block_size,
            (topk_ids, num_experts, block_size, sorted_ids, expert_ids,
             num_tokens_post_pad))
+
+
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
+def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
+    """Compare the `moe_sum` custom op with `Tensor.sum` over dim 1."""
+    src = torch.randn((m, topk, k), device="cuda", dtype=dtype)
+    # Output buffer the op is called with; compared to the reference below.
+    dst = torch.empty((m, k), device="cuda", dtype=dtype)
+
+    reference = src.sum(dim=1)
+    torch.ops._moe_C.moe_sum(src, dst)
+
+    torch.testing.assert_close(dst, reference, atol=2e-2, rtol=0)
+
+    # Run the same call through opcheck as well.
+    opcheck(torch.ops._moe_C.moe_sum, (src, dst))
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -13,7 +13,7 @@ import torch
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    moe_permute, moe_unpermute)
+    moe_permute, moe_permute_unpermute_supported, moe_unpermute)
 from vllm.platforms import current_platform

 NUM_EXPERTS = [16, 64]
@@ -167,6 +167,8 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
 def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
                               n_expert: int, ep_size: int, dtype: torch.dtype,
                               align_block_size: Optional[int]):
+    if not moe_permute_unpermute_supported():
+        pytest.skip("moe_permute_unpermute is not supported on this platform.")
    fill_invalid_expert = 0
    ep_rank = np.random.randint(0, ep_size)
    expert_map = None

--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192

 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
-NUM_TOKENS = [7, 83, 2048]
+NUM_TOKENS = [7, 2050]
 D = [512, 4096, 5120, 13824]
-GROUP_SIZE = [64, 128, 256, 512]
-M = [1, 7, 8, 83, 84, 512, 2048, 4096]
-N = [128, 512, 1024, 4096, 7168, 7748, 13824]
-K = [256, 4096, 5120, 3884, 13824, 16384]
+GROUP_SIZE = [64, 128, 512]
+M = [1, 7, 8, 83, 84, 4096]
+N = [128, 512, 7168, 7748, 13824]
+K = [256, 3884, 4096, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M_moe = [1, 2, 7, 83, 128, 512, 2048]
-M_moe_dg = [128, 192, 512, 1335, 2048]
+M_moe = [1, 2, 7, 83, 128, 2048]
+M_moe_dg = [128, 192, 1335, 2048]
 N_moe = [128, 256, 1024, 4608]  # [13824]
 K_moe = [256, 512, 7168]  # [13824]
 BLOCK_SIZE = [[128, 128]]

--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -8,7 +8,6 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
 from huggingface_hub import snapshot_download

 import vllm._custom_ops as ops
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
 from vllm.platforms import current_platform
@@ -35,11 +34,11 @@ def get_gguf_MoE_tensors(
    return GGUFReader(sample_file).tensors


-DTYPES = [torch.half, torch.bfloat16, torch.float32]
+DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # Hidden_size for testing, must match the sample file in HF repo,
 # we have `hidden_size = 256, 1024` for test in HF repo currently.
 HIDDEN_SIZES = [256, 1024]
-NUM_TOKENS = [7, 83, 128, 2048]  # Arbitrary values for testing
+NUM_TOKENS = [7, 2050]  # Arbitrary values for testing
 SEEDS = [0]
 QUANT_TYPES = [
    # i-matrix
@@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype,

    w2_dequant = torch.tensor(dequantize(w2.data, quant_type),
                              device="cuda").to(dtype)
-    act = SiluAndMul()

    output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"),
                             torch.tensor(w2.data,
                                          device="cuda"), topk_weights,
-                             topk_ids, quant_type, quant_type, act)
+                             topk_ids, quant_type, quant_type, "silu")

    ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights,
                               topk_ids).reshape(output.shape)

--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -13,8 +13,13 @@ from vllm.platforms import current_platform

 device = "cuda"

+triton_scaled_mm_module = importlib.import_module(
+    "vllm.model_executor.layers.quantization.compressed_tensors."
+    "triton_scaled_mm")
+triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm

-def scaled_mm_torch(a: torch.Tensor,
+
+def torch_scaled_mm(a: torch.Tensor,
                    b: torch.Tensor,
                    scale_a: torch.Tensor,
                    scale_b: torch.Tensor,
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
    if use_bias:
        bias = torch.rand((N, ), device=device, dtype=out_dtype)

-    triton_scaled_mm_module = importlib.import_module(
-        "vllm.model_executor.layers.quantization.compressed_tensors."
-        "triton_scaled_mm")
-    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-
    c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    a_cpu = a.cpu()
-    b_cpu = b.cpu()
-    scale_a_cpu = scale_a.cpu()
-    scale_b_cpu = scale_b.cpu()
-    bias_cpu = None if bias is None else bias.cpu()
-
-    c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
-                               out_dtype, bias_cpu)
+    c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    c_check_cpu = c_check.cpu()
-    torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
+    torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
 # SPDX-License-Identifier: Apache-2.0
+import subprocess
+import sys
+from typing import Union

 import pytest
 import ray

 import vllm
+from vllm import LLM
 from vllm.lora.request import LoRARequest
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

-from ..utils import create_new_process_for_each_test, multi_gpu_test
+from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test

 MODEL_PATH = "meta-llama/Llama-2-7b-hf"

@@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora):
    pass


-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
+def do_sample(llm: vllm.LLM,
+              lora_path: str,
+              lora_id: int,
+              tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
@@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]
+
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
                                          skip_special_tokens=False,
                                          stop=["[/assistant]"])
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+
+    if tensorizer_config_dict is not None:
+        outputs = llm.generate(
+            prompts,
+            sampling_params,
+            lora_request=LoRARequest(
+                str(lora_id),
+                lora_id,
+                lora_path,
+                tensorizer_config_dict=tensorizer_config_dict)
+            if lora_id else None)
+    else:
+        outputs = llm.generate(
+            prompts,
+            sampling_params,
+            lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+            if lora_id else None)
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


-def generate_and_test(llm, sql_lora_files):
+def generate_and_test(llm,
+                      sql_lora_files,
+                      tensorizer_config_dict: Union[dict, None] = None):
    print("lora adapter created")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 1")
-    assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=1) == EXPECTED_LORA_OUTPUT

    print("no lora")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 2")
-    assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=2) == EXPECTED_LORA_OUTPUT

    print("removing lora")

@@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
        enable_chunked_prefill=True,
    )
    generate_and_test(llm, sql_lora_files)
+
+
+@multi_gpu_test(num_gpus=2)
+@create_new_process_for_each_test()
+def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
+                                            sql_lora_huggingface_id):
+    """Tensorize the model and LoRA adapter, reload them, and sample.
+
+    Serialization is done by the tensorize_vllm_model.py example script;
+    the result is loaded with load_format="tensorizer" and the generations
+    are compared with the expected no-LoRA and LoRA outputs.
+    """
+    tp_size = 2
+    suffix = "test"
+    model_ref = MODEL_PATH
+    shard_pattern = "model-rank-%03d.tensors"
+
+    # Run the tensorizing of the LoRA adapter and the model in a subprocess
+    # to guarantee cleanup.
+    serialize_cmd = [
+        sys.executable,
+        f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
+        "--model",
+        model_ref,
+        "--lora-path",
+        sql_lora_huggingface_id,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "serialize",
+        "--serialized-directory",
+        str(tmp_path),
+        "--suffix",
+        suffix,
+    ]
+    try:
+        completed = subprocess.run(serialize_cmd,
+                                   check=True,
+                                   capture_output=True,
+                                   text=True)
+    except subprocess.CalledProcessError as err:
+        # Surface the script's output before re-raising the failure.
+        print("Tensorizing failed.")
+        print("STDOUT:\n", err.stdout)
+        print("STDERR:\n", err.stderr)
+        raise
+
+    print("STDOUT:\n", completed.stdout)
+
+    model_uri = tmp_path / "vllm" / model_ref / suffix / shard_pattern
+    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
+    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
+
+    loaded_vllm_model = LLM(model=model_ref,
+                            load_format="tensorizer",
+                            enable_lora=True,
+                            enforce_eager=True,
+                            model_loader_extra_config=tensorizer_config,
+                            max_num_seqs=13,
+                            tensor_parallel_size=tp_size,
+                            max_loras=2)
+
+    tensorizer_config_dict = tensorizer_config.to_dict()
+
+    # (banner, lora_id, expected generations), checked in this order.
+    checks = (
+        ("lora adapter created", 0, EXPECTED_NO_LORA_OUTPUT),
+        ("lora 1", 1, EXPECTED_LORA_OUTPUT),
+    )
+    for banner, lora_id, expected in checks:
+        print(banner)
+        assert do_sample(loaded_vllm_model,
+                         sql_lora_files,
+                         tensorizer_config_dict=tensorizer_config_dict,
+                         lora_id=lora_id) == expected
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -69,7 +69,7 @@ def test_lora_functions_sync():
    run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
    run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])

-    # Remove all LoRAs
+    # Remove all LoRAs.
    run_check(llm.remove_lora, 13, [12, 10, 11])
    run_check(llm.remove_lora, 12, [10, 11])
    run_check(llm.remove_lora, 11, [10])

--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -31,7 +31,7 @@ HYBRID_MODELS = [
    # not compatible with pip-compile.
    "pfnet/plamo-2-1b",
    "Zyphra/Zamba2-1.2B-instruct",
-    "hmellor/bamba-tiny-random",
+    "hmellor/tiny-random-BambaForCausalLM",
 ]

 # Avoid OOM

--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
 # SPDX-License-Identifier: Apache-2.0
-import math
 from collections.abc import Sequence

 import mteb
@@ -115,4 +114,4 @@ def mteb_test_embed_models(hf_runner,
    print("SentenceTransformer:", model_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

-    assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -15,13 +15,12 @@ from ...utils import check_embeddings_close
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
        pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
        # [Decoder-only]
        pytest.param("BAAI/bge-multilingual-gemma2",
                     marks=[pytest.mark.core_model]),
        pytest.param("intfloat/e5-mistral-7b-instruct",
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
        # [Cross-Encoder]
        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
@@ -47,9 +46,6 @@ def test_models(
        vllm_extra_kwargs["override_pooler_config"] = \
            PoolerConfig(pooling_type="MEAN")

-    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
-
    # The example_prompts has ending "\n", for example:
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers will strip the input texts, see:

--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -2,7 +2,6 @@
 from __future__ import annotations

 import importlib.util
-import math
 from array import array

 import openai
@@ -104,16 +103,16 @@ def get_test_data():

 def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
    cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
-    assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
+    assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001)

    cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1])
-    assert math.isclose(cosine_sim_q0_d1, 0.101, abs_tol=0.001)
+    assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001)

    cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0])
-    assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
+    assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001)

    cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
-    assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
+    assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001)


 def test_gritlm_offline_embedding(vllm_runner):

--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -45,9 +45,6 @@ MODELS = [
    EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                   architecture="Qwen2ForCausalLM",
                   enable_test=True),
-    EmbedModelInfo("Alibaba-NLP/gte-Qwen2-7B-instruct",
-                   architecture="Qwen2ForCausalLM",
-                   enable_test=False),
    ########## ModernBertModel
    EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
                   architecture="ModernBertModel",
@@ -58,14 +55,9 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
-
    from .mteb_utils import mteb_test_embed_models

    vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
-
    if model_info.architecture == "GteNewModel":
        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}

@@ -83,9 +75,6 @@ def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
    example_prompts = [str(s).strip() for s in example_prompts]

    vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
-
    if model_info.architecture == "GteNewModel":
        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}


--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
 # SPDX-License-Identifier: Apache-2.0
-import math
-
 import pytest

 from vllm import PoolingParams
@@ -60,7 +58,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)


 @pytest.mark.parametrize("dtype", ["half"])
@@ -78,8 +76,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
    assert len(vllm_outputs) == 10
    assert len(hf_outputs) == 10

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)


 @pytest.fixture(scope="module", params=EMBEDDING_MODELS)

--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -23,7 +23,6 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)


--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
 # SPDX-License-Identifier: Apache-2.0
-import math
-
 import pytest
 import torch
 import torch.nn.functional as F
@@ -45,7 +43,7 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)


 def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
@@ -64,8 +62,8 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)


 def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
@@ -84,8 +82,8 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)


 @pytest.fixture(scope="module", params=EMBEDDING_MODELS)
@@ -112,7 +110,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)


 def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
@@ -140,8 +138,8 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)


 def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
@@ -169,5 +167,5 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -46,7 +46,6 @@ def test_models_mteb(
    vllm_runner,
    model_info: EmbedModelInfo,
 ) -> None:
-    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)


--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -349,6 +349,17 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
+    "intern_vl-video": VLMTestInfo(
+        models=[
+            "OpenGVLab/InternVL3-1B",
+        ],
+        test_type=VLMTestType.VIDEO,
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<video>",
+        max_model_len=8192,
+        use_tokenizer_eos=True,
+        patch_hf_runner=model_utils.internvl_patch_hf_runner,
+    ),
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -4,6 +4,7 @@ import pytest

 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode

 models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]

@@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
    give the same result.
    """

-    image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-    image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
+    image_cherry = convert_image_mode(
+        ImageAsset("cherry_blossom").pil_image, "RGB")
+    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
    images = [image_cherry, image_stop]
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays