update tests

2ff1c360 · zhuwenwen · 201768d5 · 2ff1c360 · 2ff1c360 · 2ff1c360
Commit 2ff1c360 authored Nov 28, 2024 by zhuwenwen
15 changed files
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -37,7 +37,7 @@ from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                        identity, is_cpu)
-from utils import models_path_prefix
+from .utils import models_path_prefix

 logger = init_logger(__name__)


--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -2,6 +2,7 @@ from http import HTTPStatus

 import openai
 import pytest
+import os
 import pytest_asyncio
 import requests


--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
 # imports for guided decoding tests
 import re
+import os

 import openai
 import pytest

--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -24,12 +24,15 @@ MAX_MODEL_LEN = 1024

 MODELS = [
    # act_order==False, group_size=channelwise
-    (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
+    # (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
+    (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")),
    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
+    # ("TheBloke/Llama-2-7B-GPTQ", "main"),
+    ("TheBloke/Llama-2-7B-GPTQ"),

    # act_order==True, group_size=128
-    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
+    # (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ")),
    # act_order==True, group_size=64
    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"),
    # act_order==True, group_size=32
@@ -43,7 +46,8 @@ MODELS = [
    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"),

    # 4-bit, act_order==True, group_size=128
-    (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
+    # (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
+    (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"))
 ]



--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -12,6 +12,7 @@ import torch
 from tests.quantization.utils import is_quant_method_supported

 from ..utils import fork_new_process_for_each_test, models_path_prefix
+from vllm.utils import is_hip


 models_4bit_to_test = [
@@ -30,7 +31,7 @@ models_pre_quant_8bit_to_test = [
 ]


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
 @fork_new_process_for_each_test
@@ -42,7 +43,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, hf_model_kwargs)


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                         models_pre_qaunt_4bit_to_test)
@@ -54,7 +55,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name)


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                         models_pre_quant_8bit_to_test)
@@ -68,7 +69,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,

 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
 @fork_new_process_for_each_test

--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -7,9 +7,10 @@ import os
 from tests.quantization.utils import is_quant_method_supported

 from ..utils import compare_two_settings, models_path_prefix
+from vllm.utils import is_hip


-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
    # Test quantization of an unquantized checkpoint
@@ -23,7 +24,7 @@ def test_cpu_offload_fp8():
                         max_wait_seconds=480)


-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
                    reason="gptq_marlin is not supported on this GPU type.")
 def test_cpu_offload_gptq():
    # Test GPTQ Marlin
@@ -37,7 +38,7 @@ def test_cpu_offload_gptq():
                         max_wait_seconds=480)


-@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or is_hip(),
                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
    # Test AWQ Marlin
@@ -51,7 +52,7 @@ def test_cpu_offload_awq():
                         max_wait_seconds=480)


-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
                    reason="gptq_marlin is not supported on this GPU type.")
 def test_cpu_offload_compressed_tensors():
    # Test wNa16

--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -7,11 +7,12 @@ import os

 from tests.quantization.utils import is_quant_method_supported
 from ..utils import models_path_prefix
+from vllm.utils import is_hip

 MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]


-@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
+@pytest.mark.skipif(not is_quant_method_supported("experts_int8") or is_hip(),
                    reason="ExpertsInt8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])

--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,6 +12,7 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
                                                         Fp8LinearMethod)
 from vllm.platforms import current_platform
 from ..utils import models_path_prefix
+from vllm.utils import is_hip

 MODELS = [
    os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
@@ -20,7 +21,7 @@ MODELS = [
 ]


-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", MODELS)
 @pytest.mark.parametrize("force_marlin", [False, True])
@@ -45,7 +46,7 @@ KV_CACHE_MODELS = [
 ]


-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
 def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
@@ -66,7 +67,7 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
        print(outputs[0][1])


-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 @pytest.mark.parametrize("force_marlin", [False, True])
@@ -97,7 +98,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
            assert fc1.weight.dtype == torch.int32


-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_scaled_fp8_quant(dtype) -> None:

--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -20,8 +20,8 @@ PROMPT = "On the surface of Mars, we found"

 MODELS_QUANT = [(
    os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"),
-    True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False),
-                (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
+    True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False)]
+                # (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]


 @pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -7,6 +7,7 @@ import torch.nn.functional as F

 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.utils import set_random_seed
+from vllm.utils import is_hip

 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
@@ -43,8 +44,7 @@ def mock_causal_accepted_tensor(
    "which_tokens_accepted",
    ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-@pytest.mark.parametrize("use_flashinfer", [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
 @torch.inference_mode()
 def test_correct_output_format(which_tokens_accepted: str, seed: int,
                               device: str, use_flashinfer: bool):
@@ -128,8 +128,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
 @pytest.mark.parametrize("batch_size", list(range(1, 32)))
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-@pytest.mark.parametrize("use_flashinfer", [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
 @torch.inference_mode()
 def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
                                    device: str, use_flashinfer: bool):
@@ -161,8 +160,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
 @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
 @pytest.mark.parametrize("n_rep", [100])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-@pytest.mark.parametrize("use_flashinfer", [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
 @torch.inference_mode()
 def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                                   frac_seeded: float, n_rep: int, device: str,
@@ -240,8 +238,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
            for i in range(batch_size)
        }

-    # for use_flashinfer in [True, False]:
-    for use_flashinfer in [False]:
+    for use_flashinfer in [True, False] if not is_hip() else [False]:
        rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
        rejection_sampler.init_gpu_tensors(device=device)
        # We use seeded sequences to ensure the same tokens are accepted
@@ -262,8 +259,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
 @pytest.mark.parametrize("which_token_ids",
                         ["bonus_token_ids", "draft_token_ids"])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-@pytest.mark.parametrize("use_flashinfer", [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
 @torch.inference_mode()
 def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
                               which_token_ids: str, device: str,
@@ -315,8 +311,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,

 @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
 @pytest.mark.parametrize("seed", list(range(5)))
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-@pytest.mark.parametrize("use_flashinfer", [False])
+@pytest.mark.parametrize("use_flashinfer", [True, False]  if not is_hip() else [False])
 @torch.inference_mode()
 def test_rejection_sampling_approximates_target_distribution(
        seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):

--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -89,7 +89,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
 @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
 @pytest.mark.parametrize("model, test_llm_kwargs",
-                         [("JackFram/llama-68m", [
+                         [(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
                             "--speculative-model",
                             os.path.join(models_path_prefix, "JackFram/llama-68m"),
                             "--num_speculative-tokens",
@@ -97,9 +97,11 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
                             "--speculative-draft-tensor-parallel-size",
                             "1",
                         ]),
-                          (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
+                        #   (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
+                          (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct-2k"), [
                              "--speculative-model",
-                              os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
+                            #   os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
+                              os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct-2k"),
                              "--num_speculative-tokens",
                              "5",
                              "--speculative-draft-tensor-parallel-size",

--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -261,7 +261,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
            "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
        },
        {
-            "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m")",
+            "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
        },
    ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -10,7 +10,7 @@ import pytest
 import torch
 from tensorizer import EncryptionParams

-from vllm import SamplingParams
+from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
@@ -25,6 +25,9 @@ from ..conftest import VllmRunner
 from ..utils import RemoteOpenAIServer, models_path_prefix
 from .conftest import retry_until_skip

+from typing import List, Optional, Tuple
+from vllm.lora.request import LoRARequest
+
 # yapf conflicts with isort for this docstring


@@ -155,11 +158,92 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
        assert outputs == deserialized_outputs


+def create_test_prompts(
+        lora_path: str
+) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
+    """Create a list of test prompts with their sampling parameters.
+
+    2 requests for base model, 4 requests for the LoRA. We define 2
+    different LoRA adapters (using the same model for demo purposes).
+    Since we also set `max_loras=1`, the expectation is that the requests
+    with the second LoRA adapter will be run after all requests with the
+    first adapter have finished.
+    """
+    return [
+        ("A robot may not injure a human being",
+         SamplingParams(temperature=0.0,
+                        logprobs=1,
+                        prompt_logprobs=1,
+                        max_tokens=128), None),  # None: no LoRA request
+        ("To be or not to be,",
+         SamplingParams(temperature=0.8,
+                        top_k=5,
+                        presence_penalty=0.2,
+                        max_tokens=128), None),  # None: no LoRA request
+        (
+            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
+            SamplingParams(temperature=0.0,
+                           logprobs=1,
+                           prompt_logprobs=1,
+                           max_tokens=128,
+                           stop_token_ids=[32003]),
+            LoRARequest("sql-lora", 1, lora_path)),  # first adapter
+        (
+            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
+            SamplingParams(n=3,
+                           best_of=3,
+                           use_beam_search=True,
+                           temperature=0,
+                           max_tokens=128,
+                           stop_token_ids=[32003]),
+            LoRARequest("sql-lora", 1, lora_path)),  # first adapter
+        (
+            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
+            SamplingParams(temperature=0.0,
+                           logprobs=1,
+                           prompt_logprobs=1,
+                           max_tokens=128,
+                           stop_token_ids=[32003]),
+            LoRARequest("sql-lora2", 2, lora_path)),  # second adapter, same lora_path
+        (
+            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
+            SamplingParams(n=3,
+                           best_of=3,
+                           use_beam_search=True,
+                           temperature=0,
+                           max_tokens=128,
+                           stop_token_ids=[32003]),
+            LoRARequest("sql-lora", 1, lora_path)),  # back to the first adapter
+    ]
+
+
+def process_requests(engine: LLMEngine,
+                     test_prompts: List[Tuple[str, SamplingParams,
+                                              Optional[LoRARequest]]]):
+    """Submit one prompt per engine step and print each finished output."""
+    request_id = 0  # counter; passed to add_request as a string
+
+    while test_prompts or engine.has_unfinished_requests():
+        if test_prompts:  # at most one new request is queued per iteration
+            prompt, sampling_params, lora_request = test_prompts.pop(0)  # drains the caller's list
+            engine.add_request(str(request_id),
+                               prompt,
+                               sampling_params,
+                               lora_request=lora_request)
+            request_id += 1
+
+        request_outputs: List[RequestOutput] = engine.step()
+
+        for request_output in request_outputs:
+            if request_output.finished:  # unfinished outputs are ignored
+                print(request_output)
+
+
 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    # from huggingface_hub import snapshot_download

-    from examples.multilora_inference import (create_test_prompts,
-                                              process_requests)
+    # from examples.multilora_inference import (create_test_prompts,
+    #                                           process_requests)

    model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
    # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")

--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
@@ -14,7 +14,8 @@ TOKENIZER_NAMES = [
 @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
 def test_tokenizer_revision(tokenizer_name: str):
    # Assume that "main" branch always exists
-    tokenizer = get_tokenizer(tokenizer_name, revision="main")
+    # tokenizer = get_tokenizer(tokenizer_name, revision="main")
+    tokenizer = get_tokenizer(tokenizer_name)
    assert isinstance(tokenizer, PreTrainedTokenizerBase)

    # Assume that "never" branch always does not exist

--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -15,7 +15,7 @@ def test_weight_loading(vllm_runner):
    Test parameter weight loading with tp>1.
    """
    with vllm_runner(model_name=MODEL_NAME,
-                     revision=REVISION,
+                    #  revision=REVISION,
                     dtype=torch.half if QUANTIZATION == "gptq" else "auto",
                     quantization=QUANTIZATION,
                     max_model_len=MAX_MODEL_LEN,