Merge tag 'v0.6.1.post2' into v0.6.1.post2-dev

ad58e9b3 · zhuwenwen · 408f663a · 9ba0817f · ad58e9b3 · ad58e9b3
Commit ad58e9b3 authored Sep 18, 2024 by zhuwenwen
20 changed files
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -6,11 +6,13 @@ prefill requests are chunked.
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
+import os
 from contextlib import nullcontext
 import pytest
 from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..utils import multi_gpu_test
 MODELS = [
    "facebook/opt-125m",
@@ -66,6 +68,59 @@ def test_models(
    )
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", MODELS)
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+) -> None:
+    if (model == "meta-llama/Llama-2-7b-hf"
+            and distributed_executor_backend == "ray"):
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+    # Add a chunked prefill config.
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    assert chunked_prefill_token_size != -1
+    enable_chunked_prefill = True
+    max_num_batched_tokens = chunked_prefill_token_size
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 @pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -19,7 +19,10 @@ MODELS = [
    "facebook/opt-125m",
 ]
-assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+@pytest.fixture(scope="module", autouse=True)
+def check_settings():
+    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
        "tests/basic_correctness/test_preemption.py`")
@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_seqs=max_num_seqs,
            worker_use_ray=worker_use_ray,
+            disable_log_stats=False,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt

--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -16,5 +16,7 @@ def test_full_graph(model):
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B")
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B",
+              enforce_eager=True,
+              load_format="dummy")
    llm.generate(prompts, sampling_params)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,8 +6,8 @@ import sys
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
+from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
-                    TypeVar, Union)
+                    TypedDict, TypeVar, Union)
 import numpy as np
 import pytest
@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                          BatchFeature)
+from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
@@ -260,7 +261,7 @@ class HfRunner:
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
        is_embedding_model: bool = False,
-        auto_cls=AutoModelForCausalLM,
+        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[[BatchEncoding],
                                     BatchEncoding] = identity,
    ) -> None:
@@ -292,7 +293,6 @@ class HfRunner:
            trust_remote_code=True,
        )
-        try:
        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
@@ -301,11 +301,6 @@ class HfRunner:
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
-        except Exception as exc:
-            logger.warning(
-                "Unable to auto-load HuggingFace processor for model (%s). "
-                "Using tokenizer instead. Reason: %s", model_name, exc)
-            self.processor = self.tokenizer
        self.postprocess_inputs = postprocess_inputs
@@ -658,8 +653,8 @@ class VllmRunner:
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs
+    @staticmethod
    def _final_steps_generate_w_logprobs(
-        self,
        req_outputs: List[RequestOutput],
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []

--- a/tests/distributed/test_basic_distributed_correctness.py
+++ /dev/null
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-cd $VLLM_PATH/tests
-pytest distributed/test_basic_distributed_correctness.py
-```
-"""
-import os
-import pytest
-from vllm.utils import cuda_device_count_stateless
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, "
-    "test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
-    ])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-    attention_backend: str,
-    test_suite: str,
-) -> None:
-    if test_suite != TARGET_TEST_SUITE:
-        pytest.skip(f"Skip test for {test_suite}")
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-    dtype = "half"
-    max_tokens = 5
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py
+++ /dev/null
-"""For encoder/decoder models only:
-Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-cd $VLLM_PATH/tests
-pytest distributed/test_basic_distributed_correctness_enc_dec.py
-```
-"""
-import pytest
-from transformers import AutoModelForSeq2SeqLM
-from vllm.utils import cuda_device_count_stateless
-from ..conftest import DecoderPromptType
-from ..models.utils import check_logprobs_close
-from ..utils import fork_new_process_for_each_test
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/bart-large-cnn", "ray"),
-    ("facebook/bart-large-cnn", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    model: str,
-    distributed_executor_backend: str,
-    hf_runner,
-    vllm_runner,
-    example_encoder_decoder_prompts,
-) -> None:
-    '''
-    Test vLLM BART inference on more than one GPU, comparing
-    outputs against HF as a baseline.
-    Fork a new process for each test, to prevent CUDA from
-    being re-initialized by successive tests within the same
-    process.
-    Arguments:
-    * model: the HF ID of the specific BART variant under test
-    * distributed_executor_backend
-    * hf_runner: HuggingFace (HF) test model runner
-    * vllm_runner: vLLM test model runner
-    * example_encoder_decoder_prompts: test fixture which provides a 
-                                        dictionary of dummy prompts
-    '''
-    dtype = "float"
-    max_tokens = 64
-    num_logprobs = 5
-    # Example inputs with non-trivial (i.e. not None/empty) encoder &
-    # decoder prompts.
-    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-            test_prompts, max_tokens, num_logprobs)
-    # Configuration settings for HF baseline
-    hf_kwargs = {
-        "top_k": None,
-        "num_beams": 1,
-        "repetition_penalty": 1.0,
-        "top_p": 1.0,
-        "length_penalty": 1.0,
-        "early_stopping": False,
-        "no_repeat_ngram_size": None,
-        "min_length": 0
-    }
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
-        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-            test_prompts,
-            max_tokens,
-            num_logprobs,
-            **hf_kwargs,
-        ))
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ /dev/null
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-pytest test_chunked_prefill_distributed.py
-```
-"""
-import os
-import pytest
-from vllm.utils import cuda_device_count_stateless
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/opt-125m", "ray"),
-    ("meta-llama/Llama-2-7b-hf", "ray"),
-    ("facebook/opt-125m", "mp"),
-    ("meta-llama/Llama-2-7b-hf", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-) -> None:
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray":  # noqa
-        assert distributed_executor_backend == "ray"
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
+        # NOTE: InternVL2 multi-node tests are flaky,
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
+        # use mp backend to skip the multi-node tests
-        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
+        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
+        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
+        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
    ],
 )
 @fork_new_process_for_each_test

--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
 import os
-import torch
+import torch.distributed as dist
 from vllm.distributed.parallel_state import in_the_same_node_as
-torch.distributed.init_process_group(backend="gloo")
+if __name__ == "__main__":
-test_result = all(
+    dist.init_process_group(backend="gloo")
-    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
+    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
-expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-assert test_result == expected, f"Expected {expected}, got {test_result}"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
-print("Same node test passed!")
+    print("Same node test passed!")
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
    # token ids.
    llm = LLM(model=model, skip_tokenizer_init=True)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
-    with pytest.raises(ValueError) as err:
+    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        llm.generate("abc", sampling_params)
-    assert "prompts must be None if" in str(err.value)
    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
                           sampling_params=sampling_params)
    assert len(outputs) > 0

--- a/tests/entrypoints/offline_mode/__init__.py
+++ b/tests/entrypoints/offline_mode/__init__.py
--- /dev/null
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
+"""Tests for HF_HUB_OFFLINE mode"""
+import importlib
+import sys
+import weakref
+import pytest
+from vllm import LLM
+from ...conftest import cleanup
+MODEL_NAME = "facebook/opt-125m"
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME,
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1,
+              gpu_memory_utilization=0.10,
+              enforce_eager=True)
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+        del llm
+    cleanup()
+@pytest.mark.skip_global_cleanup
+def test_offline_mode(llm: LLM, monkeypatch):
+    # we use the llm fixture to ensure the model files are in-cache
+    del llm
+    # Set HF to offline mode and ensure we can still construct an LLM
+    try:
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+        # Need to re-import huggingface_hub and friends to setup offline mode
+        _re_import_modules()
+        # Cached model files should be used in offline mode
+        LLM(model=MODEL_NAME,
+            max_num_batched_tokens=4096,
+            tensor_parallel_size=1,
+            gpu_memory_utilization=0.10,
+            enforce_eager=True)
+    finally:
+        # Reset the environment after the test
+        # NB: Assuming tests are run in online mode
+        monkeypatch.delenv("HF_HUB_OFFLINE")
+        _re_import_modules()
+        pass
+def _re_import_modules():
+    hf_hub_module_names = [
+        k for k in sys.modules if k.startswith("huggingface_hub")
+    ]
+    transformers_module_names = [
+        k for k in sys.modules if k.startswith("transformers")
+        and not k.startswith("transformers_modules")
+    ]
+    reload_exception = None
+    for module_name in hf_hub_module_names + transformers_module_names:
+        try:
+            importlib.reload(sys.modules[module_name])
+        except Exception as e:
+            reload_exception = e
+            # Try to continue clean up so that other tests are less likely to
+            # be affected
+    # Error this test if reloading a module failed
+    if reload_exception is not None:
+        raise reload_exception
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -10,7 +10,6 @@ import pytest
 import torch
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersBackend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                        make_tensor_with_pad)
@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
    * Backend instance
    '''
    if backend_name == STR_XFORMERS_ATTN_VAL:
+        # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
+        from vllm.attention.backends.xformers import XFormersBackend
        return XFormersBackend()
    raise AssertionError(
        f"Unrecognized backend_name {backend_name} for unit test")

--- a/tests/models/decoder_only/__init__.py
+++ b/tests/models/decoder_only/__init__.py
--- a/tests/models/decoder_only/audio_language/__init__.py
+++ b/tests/models/decoder_only/audio_language/__init__.py
--- a/tests/models/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ..conftest import HfRunner, VllmRunner
+from ....conftest import HfRunner, VllmRunner
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 MODEL_NAME = "fixie-ai/ultravox-v0_3"

--- a/tests/models/decoder_only/language/__init__.py
+++ b/tests/models/decoder_only/language/__init__.py
--- a/tests/models/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
--- a/tests/models/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
 import pytest
 import torch
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 MODELS = [
    "meta-llama/Llama-2-7b-hf",

--- a/tests/models/test_danube3_4b.py
+++ b/tests/models/decoder_only/language/test_danube3_4b.py
@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
 """
 import pytest
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 MODELS = ["h2oai/h2o-danube3-4b-base"]