[tests] Skip TPU and weight_loading tests; fix worker tests

5e078c69 · zhuwenwen · ced28510
Commit 5e078c69 authored Jun 03, 2025 by zhuwenwen
20 changed files
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
        {
            # Speculative max model len > overridden max model len should raise.
            "speculative_config": {
-                "model": "JackFram/llama-68m",
+                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
                "max_model_len": 129,
            },
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
            # Speculative max model len > draft max model len should raise.
            # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
            "speculative_config": {
-                "model": "JackFram/llama-68m",
+                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
                "max_model_len": 2048 + 1,
            },
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
            # Speculative max model len > target max model len should raise.
            # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
            "speculative_config": {
-                "model": "JackFram/llama-68m",
+                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
                "max_model_len": 131072 + 1,
            },

--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
        "dtype": "float16",

        # Main model
-        "model_name": "meta-llama/Llama-2-7b-chat-hf",
+        "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_config": {
-            "model": "yuhuili/EAGLE-llama2-chat-7B",
+            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
            "num_speculative_tokens": MAX_SPEC_TOKENS,
        },
    },
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
        "dtype": "float16",

        # Main model
-        "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_config": {
-            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
            "num_speculative_tokens": MAX_SPEC_TOKENS,
        },
    },
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
        "dtype": "float16",

        # Main model
-        "model_name": "Qwen/Qwen2-7B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_config": {
-            "model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
+            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
            "num_speculative_tokens": MAX_SPEC_TOKENS,
        },
    },

--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
 correctess for the target model outputs.
 """

+import os
 import pytest

 from .conftest import run_equality_correctness_test
+from ...utils import models_path_prefix

 # main model
-MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
+MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")

 # max. number of speculative tokens: this corresponds to
 # num_nextn_predict_layers in the config.json of the speculator model.

--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model_name": "JackFram/llama-68m",
+        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
    assert (num_mismatch > 0)


-@torch.inference_mode()
-@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
-# The choice of backends forces the multi_step_worker to choose between
-# the vanilla model_runner and TP1DraftModelRunner and that we can test
-# both code paths.
-@pytest.mark.parametrize('attn_backend',
-                         [_Backend.XFORMERS, _Backend.FLASH_ATTN])
-def test_multi_step_correct_kvcache(num_steps, attn_backend):
-    """Verify that the KV cache of the draft model 
-    is correctly updated for sequences with bonus token.
-    """
-    seed = 100
-    model_name = "JackFram/llama-68m"
-
-    block_size = 16
-    num_gpu_blocks = 2048 // block_size
-    batch_size = 1
-
-    with global_force_attn_backend_context_manager(attn_backend):
-        dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
-        multi_step_worker = create_worker(MultiStepWorker,
-                                          model_name,
-                                          block_size,
-                                          num_gpu_blocks,
-                                          seed,
-                                          model_runner_cls=TP1DraftModelRunner,
-                                          dtype=dtype)
-        multi_step_worker.set_include_gpu_probs_tensor()
-        worker = create_worker(Worker,
-                               model_name,
-                               block_size,
-                               num_gpu_blocks,
-                               seed,
-                               dtype=dtype)
-
-        prompts = [[0] for _ in range(batch_size)]
-        # Already generate two tokens for the sequence
-        # so that we can simulate the bonus token case
-        multi_step_continuations = [[
-            random.randint(0, 1000),
-            random.randint(0, 1000)
-        ] for _ in prompts]
-        final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
-
-        seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
-        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            continuations=multi_step_continuations,
-            final_prompt_lens=final_prompt_lens)
-
-        # Run multi-step.
-        zero_kv_cache(multi_step_worker.cache_engine)
-        multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
-            seq_group_metadata_list=seq_group_metadata_list),
-                                         sample_len=num_steps,
-                                         seq_ids_with_bonus_token_in_last_step=
-                                         seq_ids_with_bonus_token_in_last_step)
-
-        # Run single-step repeatedly.
-        zero_kv_cache(worker.cache_engine)
-        # Generate the kv cache for the bonus token first
-        single_step_continuations = [c[:1] for c in multi_step_continuations]
-        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            continuations=single_step_continuations,
-            final_prompt_lens=final_prompt_lens)
-        single_step_output = worker.execute_model(
-            execute_model_req=ExecuteModelRequest(
-                seq_group_metadata_list=seq_group_metadata_list))
-        for _ in range(num_steps):
-            seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-                prompts,
-                num_gpu_blocks,
-                block_size,
-                continuations=multi_step_continuations,
-                final_prompt_lens=final_prompt_lens)
-
-            single_step_output = worker.execute_model(
-                execute_model_req=ExecuteModelRequest(
-                    seq_group_metadata_list=seq_group_metadata_list))
-
-            for i, seq_group_output in enumerate(single_step_output[-1]):
-                multi_step_continuations[i].append(
-                    seq_group_output.samples[0].output_token)
-
-        # Verify that the KV cache of the single-step and
-        # multi-step workers are the same.
-        single_step_gpu_cache = worker.cache_engine[0].gpu_cache
-        multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
-        num_layers = len(single_step_gpu_cache)
-        allclose = lambda a, b: torch.allclose(
-            a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
-        for i in range(num_layers):
-            assert allclose(single_step_gpu_cache[i][0],
-                            multi_step_gpu_cache[i][0])
-            assert allclose(single_step_gpu_cache[i][1],
-                            multi_step_gpu_cache[i][1])
+# @torch.inference_mode()
+# @pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
+# # The choice of backends forces the multi_step_worker to choose between
+# # the vanilla model_runner and TP1DraftModelRunner and that we can test
+# # both code paths.
+# @pytest.mark.parametrize('attn_backend',
+#                          [_Backend.XFORMERS, _Backend.FLASH_ATTN])
+# def test_multi_step_correct_kvcache(num_steps, attn_backend):
+#     """Verify that the KV cache of the draft model 
+#     is correctly updated for sequences with bonus token.
+#     """
+#     seed = 100
+#     model_name = "JackFram/llama-68m"
+
+#     block_size = 16
+#     num_gpu_blocks = 2048 // block_size
+#     batch_size = 1
+
+#     with global_force_attn_backend_context_manager(attn_backend):
+#         dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
+#         multi_step_worker = create_worker(MultiStepWorker,
+#                                           model_name,
+#                                           block_size,
+#                                           num_gpu_blocks,
+#                                           seed,
+#                                           model_runner_cls=TP1DraftModelRunner,
+#                                           dtype=dtype)
+#         multi_step_worker.set_include_gpu_probs_tensor()
+#         worker = create_worker(Worker,
+#                                model_name,
+#                                block_size,
+#                                num_gpu_blocks,
+#                                seed,
+#                                dtype=dtype)
+
+#         prompts = [[0] for _ in range(batch_size)]
+#         # Already generate two tokens for the sequence
+#         # so that we can simulate the bonus token case
+#         multi_step_continuations = [[
+#             random.randint(0, 1000),
+#             random.randint(0, 1000)
+#         ] for _ in prompts]
+#         final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
+
+#         seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
+#         seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+#             prompts,
+#             num_gpu_blocks,
+#             block_size,
+#             continuations=multi_step_continuations,
+#             final_prompt_lens=final_prompt_lens)
+
+#         # Run multi-step.
+#         zero_kv_cache(multi_step_worker.cache_engine)
+#         multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
+#             seq_group_metadata_list=seq_group_metadata_list),
+#                                          sample_len=num_steps,
+#                                          seq_ids_with_bonus_token_in_last_step=
+#                                          seq_ids_with_bonus_token_in_last_step)
+
+#         # Run single-step repeatedly.
+#         zero_kv_cache(worker.cache_engine)
+#         # Generate the kv cache for the bonus token first
+#         single_step_continuations = [c[:1] for c in multi_step_continuations]
+#         seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+#             prompts,
+#             num_gpu_blocks,
+#             block_size,
+#             continuations=single_step_continuations,
+#             final_prompt_lens=final_prompt_lens)
+#         single_step_output = worker.execute_model(
+#             execute_model_req=ExecuteModelRequest(
+#                 seq_group_metadata_list=seq_group_metadata_list))
+#         for _ in range(num_steps):
+#             seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+#                 prompts,
+#                 num_gpu_blocks,
+#                 block_size,
+#                 continuations=multi_step_continuations,
+#                 final_prompt_lens=final_prompt_lens)
+
+#             single_step_output = worker.execute_model(
+#                 execute_model_req=ExecuteModelRequest(
+#                     seq_group_metadata_list=seq_group_metadata_list))
+
+#             for i, seq_group_output in enumerate(single_step_output[-1]):
+#                 multi_step_continuations[i].append(
+#                     seq_group_output.samples[0].output_token)
+
+#         # Verify that the KV cache of the single-step and
+#         # multi-step workers are the same.
+#         single_step_gpu_cache = worker.cache_engine[0].gpu_cache
+#         multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
+#         num_layers = len(single_step_gpu_cache)
+#         allclose = lambda a, b: torch.allclose(
+#             a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
+#         for i in range(num_layers):
+#             assert allclose(single_step_gpu_cache[i][0],
+#                             multi_step_gpu_cache[i][0])
+#             assert allclose(single_step_gpu_cache[i][1],
+#                             multi_step_gpu_cache[i][1])


 @torch.inference_mode()

--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -5,6 +5,7 @@ from collections import defaultdict
 from types import SimpleNamespace
 from unittest.mock import MagicMock

+import os
 import pytest
 import torch

@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
 from .test_utils import mock_spec_decode_sampler
 from .utils import (create_batch, create_sampler_output_list, create_worker,
                    mock_worker)
+from ..utils import models_path_prefix


 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
    num_gpu_blocks = 8096 // block_size
    target_worker = create_worker(
        Worker,
-        "JackFram/llama-68m",
+        os.path.join(models_path_prefix, "JackFram/llama-68m"),
        block_size,
        num_gpu_blocks,
        seed,
    )
    draft_worker = create_worker(
        MultiStepWorker,
-        "abhigoyal/vllm-eagle-llama-68m-random",
+        os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random"),
        block_size,
        num_gpu_blocks,
        seed,

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -7,6 +7,7 @@ import pathlib
 import subprocess
 from functools import partial
 from unittest.mock import MagicMock, patch
+from typing import List, Tuple, Optional

 import openai
 import pytest
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+from vllm.lora.request import LoRARequest
 # yapf conflicts with isort for this docstring
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
    )

-    model_ref = "meta-llama/Llama-2-7b-hf"
+    model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
    # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
    lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
    test_prompts = multilora_inference.create_test_prompts(lora_path)

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama": {
        "model":
-        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "llama3_json", "--chat-template",
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama3.2": {
        "model":
-        "meta-llama/Llama-3.2-3B-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-3B-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "llama3_json", "--chat-template",
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama4": {
        "model":
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "pythonic", "--chat-template",
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama4_json": {
        "model":
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
            "--distributed-executor-backend", "mp", "--tool-call-parser",
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
    # },
    "granite-3.0-8b": {
        "model":
-        "ibm-granite/granite-3.0-8b-instruct",
+        os.path.join(models_path_prefix, "ibm-granite/granite-3.0-8b-instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "granite", "--chat-template",
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "granite-3.1-8b": {
        "model":
-        "ibm-granite/granite-3.1-8b-instruct",
+        os.path.join(models_path_prefix, "ibm-granite/granite-3.1-8b-instruct"),
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "internlm": {
        "model":
-        "internlm/internlm2_5-7b-chat",
+        os.path.join(models_path_prefix, "internlm/internlm2_5-7b-chat"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "internlm", "--chat-template",
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "toolACE": {
        "model":
-        "Team-ACE/ToolACE-8B",
+        os.path.join(models_path_prefix, "Team-ACE/ToolACE-8B"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "pythonic", "--chat-template",

--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -4,6 +4,8 @@ from dataclasses import dataclass

 import lm_eval
 import pytest
+import os
+from ..utils import models_path_prefix

 TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
 # NOTE: Accuracy scores measured on GPUs.
 ACCURACY_CONFIGS = [
    GSM8KAccuracyTestConfig(
-        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        model_name=os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"),
        excepted_value=0.76),  # no bias
    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As

--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
--- a/tests/v1/tpu/test_mha_attn.py
+++ b/tests/v1/tpu/test_mha_attn.py
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
--- a/tests/weight_loading/__init__.py
+++ b/tests/weight_loading/__init__.py
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:

 def test_deepseek_mla_attn_backend_module():
    model_runner = _create_model_runner(
-        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
        trust_remote_code=True,
        enable_chunked_prefill=False,
    )