merge v0.5.0

f48954a4 · zhuwenwen · 1dba29d3 · 8f89d720 · f48954a4 · f48954a4
Commit f48954a4 authored Jun 12, 2024 by zhuwenwen
20 changed files
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -102,22 +102,21 @@ def batched_generate(
    return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
-@pytest.fixture
+@pytest.fixture(scope="module")
 def lora_llm(long_context_infos):
    scaling_factors = [
        context_len_to_scaling_factor[info["context_length"]]
        for info in long_context_infos.values()
    ]
-    llm = vllm.LLM(
+    llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
-        "meta-llama/Llama-2-13b-chat-hf",
+                   enable_lora=True,
-        enable_lora=True,
+                   max_num_seqs=16,
-        max_num_seqs=16,
+                   max_loras=2,
-        max_loras=2,
+                   long_lora_scaling_factors=tuple(scaling_factors),
-        long_lora_scaling_factors=tuple(scaling_factors),
+                   max_num_batched_tokens=4096 * 8,
-        max_num_batched_tokens=4096 * 8,
+                   tensor_parallel_size=4,
-        tensor_parallel_size=4,
+                   distributed_executor_backend="mp")
-    )
    yield llm
    del llm
@@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init):
    assert rotary_emb_count == 32
+@pytest.mark.skip_global_cleanup
 def test_batched_rope_kernel(lora_llm, long_context_infos):
    """We test the batched kernel by comparing the results of batched and
        non-batched generation.
@@ -188,6 +188,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
            f"same:\n{batched}\n{non_batched}")
+@pytest.mark.skip_global_cleanup
 def test_self_consistency(lora_llm, long_context_infos):
    """We test consistency of the batched kernel by permuting batched
    inputs and comparing the results to the non-permuted batched results.
@@ -227,6 +228,7 @@ def test_self_consistency(lora_llm, long_context_infos):
                f"\n{permutated_batched_results[permutation[i]]}")
+@pytest.mark.skip_global_cleanup
 def test_quality(lora_llm, long_context_infos):
    """We test the quality of the answers given by the LoRA model by
        comparing the generated text to the merged model's outputs.
@@ -257,6 +259,7 @@ def test_quality(lora_llm, long_context_infos):
    assert np.mean(scores) > 0.5
+@pytest.mark.skip_global_cleanup
 def test_max_len(lora_llm, long_context_infos):
    """Test that we raise a ValueError when the input of a given LoRA
        model exceeds the maximum length."""

--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
 from collections import OrderedDict
+import pytest
 from torch import nn
 from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
 from vllm.utils import LRUCache
-def test_parse_fine_tuned_lora_name():
+def test_parse_fine_tuned_lora_name_valid():
    fixture = {
        ("base_model.model.lm_head.lora_A.weight", "lm_head", True),
        ("base_model.model.lm_head.lora_B.weight", "lm_head", False),
@@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name():
        assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
+def test_parse_fine_tuned_lora_name_invalid():
+    fixture = {
+        "weight",
+        "base_model.weight",
+        "base_model.model.weight",
+    }
+    for name in fixture:
+        with pytest.raises(ValueError, match="unsupported LoRA weight"):
+            parse_fine_tuned_lora_name(name)
 def test_replace_submodule():
    model = nn.Sequential(
        OrderedDict([

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
-                             dtype=dtype,
+                     dtype=dtype,
-                             disable_log_stats=False,
+                     disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
+                     gpu_memory_utilization=0.4) as vllm_model:
-    tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.model.get_tokenizer()
-    prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
+        prompt_token_counts = [
-    # This test needs at least 2 prompts in a batch of different lengths to
+            len(tokenizer.encode(p)) for p in example_prompts
-    # verify their token count is correct despite padding.
+        ]
-    assert len(example_prompts) > 1, "at least 2 prompts are required"
+        # This test needs at least 2 prompts in a batch of different lengths to
-    assert prompt_token_counts[0] != prompt_token_counts[1], (
+        # verify their token count is correct despite padding.
-        "prompts of different lengths are required")
+        assert len(example_prompts) > 1, "at least 2 prompts are required"
-    vllm_prompt_token_count = sum(prompt_token_counts)
+        assert prompt_token_counts[0] != prompt_token_counts[1], (
+            "prompts of different lengths are required")
-    _ = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_prompt_token_count = sum(prompt_token_counts)
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
+        _ = vllm_model.generate_greedy(example_prompts, max_tokens)
-        **stat_logger.labels)._value.get()
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
+            **stat_logger.labels)._value.get()
    assert vllm_prompt_token_count == metric_count, (
        f"prompt token count: {vllm_prompt_token_count!r}\n"
@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
-                             dtype=dtype,
+                     dtype=dtype,
-                             disable_log_stats=False,
+                     disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
+                     gpu_memory_utilization=0.4) as vllm_model:
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.model.get_tokenizer()
-    stat_logger = vllm_model.model.llm_engine.stat_logger
+        stat_logger = vllm_model.model.llm_engine.stat_logger
-    metric_count = stat_logger.metrics.counter_generation_tokens.labels(
+        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
-        **stat_logger.labels)._value.get()
+            **stat_logger.labels)._value.get()
-    vllm_generation_count = 0
+        vllm_generation_count = 0
-    for i in range(len(example_prompts)):
+        for i in range(len(example_prompts)):
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+            vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        prompt_ids = tokenizer.encode(example_prompts[i])
+            prompt_ids = tokenizer.encode(example_prompts[i])
-        # vllm_output_ids contains both prompt tokens and generation tokens.
+            # vllm_output_ids contains both prompt tokens and generation tokens.
-        # We're interested only in the count of the generation tokens.
+            # We're interested only in the count of the generation tokens.
-        vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
+            vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
    assert vllm_generation_count == metric_count, (
        f"generation token count: {vllm_generation_count!r}\n"
@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
 def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                                   served_model_name: List[str]) -> None:
-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
-                             dtype=dtype,
+                     dtype=dtype,
-                             disable_log_stats=False,
+                     disable_log_stats=False,
-                             gpu_memory_utilization=0.3,
+                     gpu_memory_utilization=0.3,
-                             served_model_name=served_model_name)
+                     served_model_name=served_model_name) as vllm_model:
-    stat_logger = vllm_model.model.llm_engine.stat_logger
+        stat_logger = vllm_model.model.llm_engine.stat_logger
-    metrics_tag_content = stat_logger.labels["model_name"]
+        metrics_tag_content = stat_logger.labels["model_name"]
-    del vllm_model
    if served_model_name is None or served_model_name == []:
        assert metrics_tag_content == model, (

--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -8,10 +8,13 @@ import torch
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-capability = torch.cuda.get_device_capability()
+aqlm_not_supported = True
-capability = capability[0] * 10 + capability[1]
-aqlm_not_supported = (capability <
+if torch.cuda.is_available():
-                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    aqlm_not_supported = (capability <
+                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
@@ -79,10 +82,9 @@ def test_models(
    num_logprobs: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
-                                                       max_tokens,
+            example_prompts, max_tokens, num_logprobs)
-                                                       num_logprobs)
    # loop through the prompts to compare against the ground truth generations
    for prompt_idx in range(len(example_prompts)):

--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -5,6 +5,7 @@ This tests bigger models and use half precision.
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
+import torch
 MODELS = [
    "meta-llama/Llama-2-7b-hf",
@@ -16,9 +17,14 @@ MODELS = [
    # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]
+# TODO: remove this after CPU float16 support is ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [32])
 def test_models(
    hf_runner,
@@ -28,13 +34,11 @@ def test_models(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype)
+    with hf_runner(model, dtype=dtype) as hf_model:
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@@ -46,15 +50,14 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    # This test is for verifying whether the model's extra_repr
+        # This test is for verifying whether the model's extra_repr
-    # can be printed correctly.
+        # can be printed correctly.
-    print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-          model_runner.model)
+              model_runner.model)
-    del vllm_model
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
@@ -28,13 +28,11 @@ def test_models(
    model: str,
    dtype: str,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype)
+    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
-    hf_outputs = hf_model.encode(example_prompts)
+        hf_outputs = hf_model.encode(example_prompts)
-    del hf_model
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.encode(example_prompts)
-    del vllm_model
    similarities = compare_embeddings(hf_outputs, vllm_outputs)
    all_similarities = torch.stack(similarities)

--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -67,10 +67,13 @@ EXPECTED_STRS_MAP = {
    },
 }
-capability = torch.cuda.get_device_capability()
+fp8_not_supported = True
-capability = capability[0] * 10 + capability[1]
-fp8_not_supported = (capability <
+if torch.cuda.is_available():
-                     QUANTIZATION_METHODS["fp8"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    fp8_not_supported = (capability <
+                         QUANTIZATION_METHODS["fp8"].get_min_capability())
 @pytest.mark.skipif(fp8_not_supported,

--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -22,10 +22,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
-capability = torch.cuda.get_device_capability()
+gptq_marlin_not_supported = True
-capability = capability[0] * 10 + capability[1]
-gptq_marlin_not_supported = (
+if torch.cuda.is_available():
-    capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    gptq_marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
 MODELS = [
    # act_order==False, group_size=channelwise
@@ -67,32 +70,29 @@ def test_models(
    model_name, revision = model
    # Run marlin.
-    gptq_marlin_model = vllm_runner(model_name=model_name,
+    with vllm_runner(model_name=model_name,
-                                    revision=revision,
+                     revision=revision,
-                                    dtype=dtype,
+                     dtype=dtype,
-                                    quantization="marlin",
+                     quantization="marlin",
-                                    max_model_len=MAX_MODEL_LEN,
+                     max_model_len=MAX_MODEL_LEN,
-                                    tensor_parallel_size=1)
+                     tensor_parallel_size=1) as gptq_marlin_model:
-    gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
+        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
-        example_prompts[:-1], max_tokens, num_logprobs)
+            example_prompts[:-1], max_tokens, num_logprobs)
-    del gptq_marlin_model
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error
    # Run gptq.
    # The naive gptq kernel doesn't support bf16 yet.
    # Here we always compare fp16/bf16 gptq marlin kernel
    # to fp16 gptq kernel.
-    gptq_model = vllm_runner(model_name=model_name,
+    with vllm_runner(model_name=model_name,
-                             revision=revision,
+                     revision=revision,
-                             dtype="half",
+                     dtype="half",
-                             quantization="gptq",
+                     quantization="gptq",
-                             max_model_len=MAX_MODEL_LEN,
+                     max_model_len=MAX_MODEL_LEN,
-                             tensor_parallel_size=1)
+                     tensor_parallel_size=1) as gptq_model:
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
-                                                       max_tokens,
+            example_prompts[:-1], max_tokens, num_logprobs)
-                                                       num_logprobs)
-    del gptq_model
    check_logprobs_close(
        outputs_0_lst=gptq_outputs,

--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -14,10 +14,13 @@ import torch
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-capability = torch.cuda.get_device_capability()
+marlin_not_supported = True
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
+if torch.cuda.is_available():
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass
@@ -58,20 +61,16 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    marlin_24_model = vllm_runner(model_pair.model_marlin,
+    with vllm_runner(model_pair.model_marlin,
-                                  dtype=dtype,
+                     dtype=dtype,
-                                  quantization="gptq_marlin_24")
+                     quantization="gptq_marlin_24") as marlin_24_model:
-    marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
+        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs)
-    del marlin_24_model
-    gptq_model = vllm_runner(model_pair.model_gptq,
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                             dtype=dtype,
+                     quantization="gptq") as gptq_model:
-                             quantization="gptq")
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
+            example_prompts, max_tokens, num_logprobs)
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
    check_logprobs_close(
        outputs_0_lst=gptq_outputs,

--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
-import gc
+from typing import List, Tuple
-from dataclasses import fields
-from enum import Enum
-from typing import Dict, List, Tuple
 import pytest
-import torch
 from transformers import AutoTokenizer
 from vllm.config import VisionLanguageConfig
-model_and_vl_config = [
+from ..conftest import IMAGE_FILES
-    ("llava-hf/llava-1.5-7b-hf",
-     VisionLanguageConfig(
+pytestmark = pytest.mark.llava
-         image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-         image_feature_size=576,
+# The image token is placed before "user" on purpose so that the test can pass
-         image_token_id=32000,
+HF_IMAGE_PROMPTS = [
-         image_input_shape=(1, 3, 336, 336))),
+    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
-    ("llava-hf/llava-1.5-7b-hf",
+    "<image>\nUSER: What is the season?\nASSISTANT:",
-     VisionLanguageConfig(
-         image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
-         image_feature_size=576,
-         image_token_id=32000,
-         image_input_shape=(1, 576, 1024)))
 ]
+assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
-def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
-    """Flatten vision language config to pure args.
-    Compatible with what llm entrypoint expects.
+def iter_llava_configs(model_name: str):
-    """
+    image_hw_to_feature_size = {
-    result = {}
+        (336, 336): 576,
-    for field in fields(vision_language_config):
+    }
-        value = getattr(vision_language_config, field.name)
-        if isinstance(value, Enum):
+    for (h, w), f in image_hw_to_feature_size.items():
-            result[field.name] = value.name.lower()
+        for input_type, input_shape in [
-        elif isinstance(value, tuple):
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-            result[field.name] = ",".join([str(item) for item in value])
+            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
-        else:
+        ]:
-            result[field.name] = value
+            yield (model_name,
-    return result
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
+                                        image_token_id=32000,
-def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
+                                        image_input_shape=input_shape,
-                         vision_language_config: VisionLanguageConfig,
+                                        image_processor=model_name,
-                         model_id: str):
+                                        image_processor_revision=None))
+model_and_vl_config = [
+    *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
+]
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+                      vlm_config: VisionLanguageConfig, model_id: str):
    """Sanitize vllm output to be comparable with hf output.
    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
    It also reduces `output_str` from "<image><image>bla" to "bla".
    """
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(vision_language_config.image_token_id)
-    image_token_str_len = len(image_token_str)
    input_ids, output_str = vllm_output
-    sanitized_input_ids = input_ids[0:2] + input_ids[2 + vision_language_config
+    image_token_id = vlm_config.image_token_id
-                                                     .image_feature_size - 1:]
-    sanitzied_output_str = output_str[vision_language_config.
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
-                                      image_feature_size *
+    image_token_str = tokenizer.decode(image_token_id)
-                                      image_token_str_len:]
-    return sanitized_input_ids, sanitzied_output_str
+    hf_input_ids = [
+        input_id for idx, input_id in enumerate(input_ids)
+        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    ]
+    hf_output_str = output_str \
+        .replace(image_token_str * vlm_config.image_feature_size, "")
-@pytest.mark.parametrize("worker_use_ray", [False])
+    return hf_input_ids, hf_output_str
+# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
+def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
-                vllm_image_prompts, vllm_images, model_and_config: tuple,
+                model_and_config, dtype: str, max_tokens: int) -> None:
-                dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
    """Inference result should be the same between hf and vllm.
    All the image fixtures for the test are under tests/images.
-    For huggingface runner, we provide the raw images as input.
+    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide image tensors and corresponding
+    For vllm runner, we provide MultiModalData objects and corresponding
    vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
-    model_id, vision_language_config = model_and_config
+    model_id, vlm_config = model_and_config
-    hf_model = hf_runner(model_id, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(hf_image_prompts,
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-                                          max_tokens,
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                          images=hf_images)
-    del hf_model
-    vllm_model = vllm_runner(model_id,
-                             dtype=dtype,
-                             worker_use_ray=worker_use_ray,
-                             **as_dict(vision_language_config))
-    vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                              max_tokens,
-                                              images=vllm_images)
+                                              images=hf_images)
-    del vllm_model
+    vllm_image_prompts = [
+        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+        for p in HF_IMAGE_PROMPTS
+    ]
-    gc.collect()
+    with vllm_runner(model_id,
-    torch.cuda.empty_cache()
+                     dtype=dtype,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
-    for i in range(len(hf_image_prompts)):
+    for i in range(len(HF_IMAGE_PROMPTS)):
        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = sanitize_vllm_output(
+        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vision_language_config, model_id)
+            vllm_outputs[i], vlm_config, model_id)
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (

--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
+from typing import List, Tuple
+import pytest
+from transformers import AutoTokenizer
+from vllm.config import VisionLanguageConfig
+from ..conftest import IMAGE_FILES
+pytestmark = pytest.mark.llava
+_PREFACE = (
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's "
+    "questions.")
+# The image token is placed before "user" on purpose so that the test can pass
+HF_IMAGE_PROMPTS = [
+    f"{_PREFACE} <image>\nUSER: What's the content of the image? ASSISTANT:",
+    f"{_PREFACE} <image>\nUSER: What is the season? ASSISTANT:",
+]
+assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+def iter_llava_next_configs(model_name: str):
+    image_hw_to_feature_size = {
+        (336, 336): 1176,
+        (672, 672): 2928,
+        (1344, 336): 1944,
+        (336, 1344): 1890,
+    }
+    for (h, w), f in image_hw_to_feature_size.items():
+        for input_type, input_shape in [
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
+        ]:
+            yield (model_name,
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
+                                        image_token_id=32000,
+                                        image_input_shape=input_shape,
+                                        image_processor=model_name,
+                                        image_processor_revision=None))
+model_and_vl_config = [
+    *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
+]
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+                      vlm_config: VisionLanguageConfig, model_id: str):
+    """Sanitize vllm output to be comparable with hf output.
+    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
+    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
+    It also reduces `output_str` from "<image><image>bla" to "bla".
+    """
+    input_ids, output_str = vllm_output
+    image_token_id = vlm_config.image_token_id
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    image_token_str = tokenizer.decode(image_token_id)
+    hf_input_ids = [
+        input_id for idx, input_id in enumerate(input_ids)
+        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    ]
+    hf_output_str = output_str \
+        .replace(image_token_str * vlm_config.image_feature_size, " ")
+    return hf_input_ids, hf_output_str
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
+                model_and_config, dtype: str, max_tokens: int) -> None:
+    """Inference result should be the same between hf and vllm.
+    All the image fixtures for the test are under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalData objects and corresponding
+    vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    model_id, vlm_config = model_and_config
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
+    vllm_image_prompts = [
+        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+        for p in HF_IMAGE_PROMPTS
+    ]
+    with vllm_runner(
+            model_id,
+            dtype=dtype,
+            # should be greater than image_feature_size
+            max_model_len=4096,
+            enforce_eager=True,
+            **vlm_config.as_cli_args_dict(),
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
+    for i in range(len(HF_IMAGE_PROMPTS)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+            vllm_outputs[i], vlm_config, model_id)
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -19,10 +19,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from .utils import check_logprobs_close
-capability = torch.cuda.get_device_capability()
+marlin_not_supported = True
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
+if torch.cuda.is_available():
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass
@@ -56,20 +59,16 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    marlin_model = vllm_runner(model_pair.model_marlin,
+    with vllm_runner(model_pair.model_marlin,
-                               dtype=dtype,
+                     dtype=dtype,
-                               quantization="marlin")
+                     quantization="marlin") as marlin_model:
-    marlin_outputs = marlin_model.generate_greedy_logprobs(
+        marlin_outputs = marlin_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs)
-    del marlin_model
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-    gptq_model = vllm_runner(model_pair.model_gptq,
+                     quantization="gptq") as gptq_model:
-                             dtype=dtype,
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
-                             quantization="gptq")
+            example_prompts, max_tokens, num_logprobs)
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
    check_logprobs_close(
        outputs_0_lst=gptq_outputs,

--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -26,16 +26,13 @@ def test_models(
    num_logprobs: int,
 ) -> None:
    # TODO(sang): Sliding window should be tested separately.
-    hf_model = hf_runner(model, dtype=dtype)
+    with hf_runner(model, dtype=dtype) as hf_model:
-    hf_outputs = hf_model.generate_greedy_logprobs_limit(
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-        example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs)
-    del hf_model
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
-                                                       max_tokens,
+            example_prompts, max_tokens, num_logprobs)
-                                                       num_logprobs)
-    del vllm_model
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,

--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -34,13 +34,11 @@ def test_models(
    # To pass the small model tests, we need full precision.
    assert dtype == "float"
-    hf_model = hf_runner(model, dtype=dtype)
+    with hf_runner(model, dtype=dtype) as hf_model:
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@@ -58,9 +56,8 @@ def test_model_print(
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    # This test is for verifying whether the model's extra_repr
+        # This test is for verifying whether the model's extra_repr
-    # can be printed correctly.
+        # can be printed correctly.
-    print(vllm_model.model.llm_engine.model_executor.driver_worker.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-          model_runner.model)
+              model_runner.model)
-    del vllm_model
--- a/tests/multimodal/__init__.py
+++ b/tests/multimodal/__init__.py
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
+import numpy as np
+import pytest
+from transformers import CLIPImageProcessor, LlavaNextImageProcessor
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import ImagePixelData
+from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
+# Checks that MULTIMODAL_REGISTRY.process_input produces the same keys,
+# array shapes and (np.allclose) values as HuggingFace's CLIPImageProcessor
+# for every image in the hf_images fixture, for both "half" and "float".
+@pytest.mark.parametrize("dtype", ["half", "float"])
+def test_clip_image_processor(hf_images, dtype):
+    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 560
+    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
+    assert isinstance(hf_processor, CLIPImageProcessor)
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    # The vLLM side loads its image processor from the same MODEL_NAME.
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=32000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=576,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+    for image in hf_images:
+        # Reference result: HF preprocessing, cast to the dtype under test.
+        hf_result = hf_processor.preprocess(
+            image,
+            return_tensors="pt",
+        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        # Result under test: the same image routed through vLLM's registry.
+        vllm_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+        assert hf_result.keys() == vllm_result.keys()
+        # Compare key by key as numpy arrays: shapes first, then values.
+        for key, hf_tensor in hf_result.items():
+            hf_arr: np.ndarray = hf_tensor.numpy()
+            vllm_arr: np.ndarray = vllm_result[key].numpy()
+            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
+            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
+# Same comparison as the CLIP processor test, but against HuggingFace's
+# LlavaNextImageProcessor. Marked xfail for the reason given below.
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("dtype", ["half", "float"])
+def test_llava_next_image_processor(hf_images, dtype):
+    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 560
+    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
+    assert isinstance(hf_processor, LlavaNextImageProcessor)
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    # The vLLM side loads its image processor from the same MODEL_NAME.
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=64000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=2928,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+    for image in hf_images:
+        # Reference result: HF preprocessing, cast to the dtype under test.
+        hf_result = hf_processor.preprocess(
+            image,
+            return_tensors="pt",
+        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        # Result under test: the same image routed through vLLM's registry.
+        vllm_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+        assert hf_result.keys() == vllm_result.keys()
+        # Compare key by key as numpy arrays: shapes first, then values.
+        for key, hf_tensor in hf_result.items():
+            hf_arr: np.ndarray = hf_tensor.numpy()
+            vllm_arr: np.ndarray = vllm_result[key].numpy()
+            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
+            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
+# Checks that MULTIMODAL_REGISTRY.process_input gives matching results
+# whether ImagePixelData wraps an entry of hf_images or the paired entry of
+# vllm_image_tensors. Marked xfail for the reason given below.
+@pytest.mark.xfail(
+    reason="Example image pixels were not processed using HuggingFace")
+@pytest.mark.parametrize("dtype", ["float"])
+def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
+    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 560
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=32000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=576,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+    # The two fixtures are walked in lockstep; zip stops at the shorter one.
+    for image, tensor in zip(hf_images, vllm_image_tensors):
+        image_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+        tensor_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(tensor),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+        assert image_result.keys() == tensor_result.keys()
+        # NOTE(review): image_arr is used as returned (no .numpy() call),
+        # unlike tensor_arr; presumably both expose a compatible .shape and
+        # np.allclose accepts the mix. Confirm against process_input.
+        for key, image_arr in image_result.items():
+            tensor_arr: np.ndarray = tensor_result[key].numpy()
+            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
+            assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
+import base64
+import mimetypes
+from tempfile import NamedTemporaryFile
+from typing import Dict, Tuple
+import numpy as np
+import pytest
+import pytest_asyncio
+from PIL import Image
+from vllm.multimodal.utils import ImageFetchAiohttp
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+# These URLs are fetched by the url_images fixture and also parametrize
+# test_fetch_image_base64, so every entry adds one set of test cases.
+TEST_IMAGE_URLS = [
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+@pytest_asyncio.fixture(scope="session")
+async def url_images() -> Dict[str, Image.Image]:
+    # Session-scoped: fetch each test image in turn and key it by its URL.
+    fetched: Dict[str, Image.Image] = {}
+    for image_url in TEST_IMAGE_URLS:
+        fetched[image_url] = await ImageFetchAiohttp.fetch_image(image_url)
+    return fetched
+def get_supported_suffixes() -> Tuple[str, ...]:
+    # Suffixes used to parametrize the base64 image fetch test.
+    return (
+        # We should at least test the file types mentioned in GPT-4 with
+        # Vision
+        '.png', '.jpeg', '.jpg', '.webp', '.gif',
+        # Additional file types that are supported by us
+        '.bmp', '.tiff',
+    )
+def _image_equals(a: Image.Image, b: Image.Image) -> bool:
+    # Compare element-wise after converting b to a's mode.
+    pixels_a = np.asarray(a)
+    pixels_b = np.asarray(b.convert(a.mode))
+    return (pixels_a == pixels_b).all()
+# Saves a fetched image to a temp file with the given suffix, re-encodes
+# the file as a base64 data URL and checks that ImageFetchAiohttp can fetch
+# it. Pixel equality is only asserted when the saved file itself still
+# equals the original image.
+@pytest.mark.asyncio
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("suffix", get_supported_suffixes())
+async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
+                                  image_url: str, suffix: str):
+    url_image = url_images[image_url]
+    # Prefer Pillow's MIME table; fall back to the stdlib mimetypes map and
+    # skip when neither knows the suffix.
+    try:
+        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
+    except KeyError:
+        try:
+            mime_type = mimetypes.types_map[suffix]
+        except KeyError:
+            pytest.skip('No MIME type')
+    with NamedTemporaryFile(suffix=suffix) as f:
+        try:
+            url_image.save(f.name)
+        except Exception as e:
+            # Guard on e.args: indexing args[0] of an exception raised
+            # without arguments would raise IndexError here and hide the
+            # real save failure instead of re-raising it.
+            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
+                pytest.skip('Conversion not supported')
+            raise
+        base64_image = base64.b64encode(f.read()).decode("utf-8")
+        data_url = f"data:{mime_type};base64,{base64_image}"
+        data_image = await ImageFetchAiohttp.fetch_image(data_url)
+        # Lossy format: the saved file no longer equals the original, so
+        # only the successful fetch above is checked in that case.
+        if _image_equals(url_image, Image.open(f)):
+            assert _image_equals(url_image, data_image)
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
+'''Tests whether bitsandbytes computation is enabled correctly.
+Run `pytest tests/quantization/test_bitsandbytes.py`.
+'''
+import pytest
+import torch
+from vllm import SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+# Only query the device when CUDA is present, so that importing this module
+# (e.g. during test collection) does not fail on CUDA-less hosts. The -1
+# sentinel is below any real capability, so the skipif that compares
+# `capability` against the minimum then skips the test.
+capability = -1
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+# Loads huggyllama/llama-7b with bitsandbytes quantization and checks
+# (1) which first-layer weights are stored as torch.uint8, and
+# (2) the first line of the greedy completions for two prompts.
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
+    reason='bitsandbytes is not supported on this GPU type.')
+def test_load_bnb_model(vllm_runner) -> None:
+    with vllm_runner('huggyllama/llama-7b',
+                     quantization='bitsandbytes',
+                     load_format='bitsandbytes',
+                     enforce_eager=True) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        first_layer = model.model.layers[0]
+        # check the weights in MLP & SelfAttention are quantized to torch.uint8
+        quantized_modules = {
+            'gate_up_proj': first_layer.mlp.gate_up_proj,
+            'down_proj': first_layer.mlp.down_proj,
+            'o_proj': first_layer.self_attn.o_proj,
+            'qkv_proj': first_layer.self_attn.qkv_proj,
+        }
+        for name, module in quantized_modules.items():
+            qweight = module.qweight
+            assert qweight.dtype == torch.uint8, (
+                f'Expected {name} dtype torch.uint8 but got {qweight.dtype}')
+        # some weights should not be quantized
+        # (the name is interpolated into the message so each failure names
+        # the module that was actually checked)
+        unquantized_modules = {
+            'lm_head': model.lm_head,
+            'embed_tokens': model.model.embed_tokens,
+            'input_layernorm': first_layer.input_layernorm,
+            'post_attention_layernorm': first_layer.post_attention_layernorm,
+        }
+        for name, module in unquantized_modules.items():
+            assert module.weight.dtype != torch.uint8, (
+                f'{name} weight dtype should not be torch.uint8')
+        # check the output of the model is expected
+        sampling_params = SamplingParams(temperature=0.0,
+                                         logprobs=1,
+                                         prompt_logprobs=1,
+                                         max_tokens=8)
+        prompts = ['That which does not kill us', 'To be or not to be,']
+        expected_outputs = [
+            'That which does not kill us makes us stronger.',
+            'To be or not to be, that is the question.'
+        ]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+        assert len(outputs) == len(prompts)
+        for output, expected in zip(outputs, expected_outputs):
+            # compare the first line of the output
+            actual_output = output[1][0].split('\n', 1)[0]
+            expected_output = expected.split('\n', 1)[0]
+            assert actual_output == expected_output, (
+                f'Expected: {expected_output}, but got: {actual_output}')
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,32 +5,58 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
 import torch
+from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsW8A8StaticTensor)
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
-    model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
+    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
-    llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
+    with vllm_runner(model_path, enforce_eager=True) as llm:
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-    layer = model.model.layers[0]
+        layer = model.model.layers[0]
-    qkv_proj = layer.self_attn.qkv_proj
+        qkv_proj = layer.self_attn.qkv_proj
-    o_proj = layer.self_attn.o_proj
+        o_proj = layer.self_attn.o_proj
-    gate_up_proj = layer.mlp.gate_up_proj
+        gate_up_proj = layer.mlp.gate_up_proj
-    down_proj = layer.mlp.down_proj
+        down_proj = layer.mlp.down_proj
-    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(gate_up_proj.quant_method,
-    assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
+                          CompressedTensorsLinearMethod)
+        assert isinstance(down_proj.quant_method,
-    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
+                          CompressedTensorsLinearMethod)
-    assert qkv_proj.weight.dtype is torch.int8
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
-    assert o_proj.weight.dtype is torch.int8
-    assert gate_up_proj.weight.dtype is torch.int8
+        assert qkv_proj.weight.dtype is torch.int8
+        assert o_proj.weight.dtype is torch.int8
-    assert qkv_proj.weight_scale.shard_splitter is not None
+        assert gate_up_proj.weight.dtype is torch.int8
-    assert qkv_proj.weight_scale.logical_widths is not None
-    assert qkv_proj.input_scale.dtype is torch.float32
+        assert qkv_proj.weight_scale.shard_splitter is not None
+        assert qkv_proj.weight_scale.logical_widths is not None
+        assert qkv_proj.input_scale.dtype is torch.float32
+def test_compressed_tensors_no_enforce_eager(vllm_runner):
+    # Run the static W8A8 checkpoint without enforce_eager and make sure
+    # generation with default sampling parameters returns something truthy.
+    with vllm_runner("nm-testing/tinyllama-oneshot-w8a8-static-v2") as llm:
+        default_params = SamplingParams()
+        result = llm.generate("Hello world!", sampling_params=default_params)
+        assert result
+def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
+    # Load the dynamic-token W8A8 checkpoint and inspect qkv_proj of the
+    # first layer: quant method, scheme, and weight dtype.
+    with vllm_runner("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+                     enforce_eager=True,
+                     dtype=torch.float16) as llm:
+        loaded = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        first_layer = loaded.model.layers[0]
+        qkv = first_layer.self_attn.qkv_proj
+        assert isinstance(qkv.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv.scheme, CompressedTensorsW8A8DynamicToken)
+        assert qkv.weight.dtype is torch.int8
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
-    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-    fc1 = model.model.decoder.layers[0].fc1
+        fc1 = model.model.decoder.layers[0].fc1
-    assert isinstance(fc1.quant_method, Fp8LinearMethod)
+        assert isinstance(fc1.quant_method, Fp8LinearMethod)
-    assert fc1.weight.dtype == torch.float8_e4m3fn
+        assert fc1.weight.dtype == torch.float8_e4m3fn