[CI/Build] Reorganize models tests (#7820)

a84e598e · Cyrus Leung · GitHub · 0a4806f0 · a84e598e · a84e598e
Unverified Commit a84e598e authored Sep 14, 2024 by Cyrus Leung Committed by GitHub Sep 13, 2024
20 changed files
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
+  pytest -v -s tests/models/decoder_only/language \
-      --ignore=tests/models/test_oot_registration.py \
+    --ignore=tests/models/test_fp8.py \
-      --ignore=tests/models/test_registry.py \
+    --ignore=tests/models/decoder_only/language/test_jamba.py \
-      --ignore=tests/models/test_fp8.py \
+    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU
-      --ignore=tests/models/test_jamba.py \
-      --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 # Run compressed-tensor test
 docker exec cpu-test bash -c "

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -94,7 +94,6 @@ steps:
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@@ -164,15 +163,6 @@ steps:
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference_encoder_decoder.py
- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
-    - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
 - label: torch compile integration test
  source_file_dependencies:
  - vllm/
@@ -180,14 +170,6 @@ steps:
    - pytest -v -s ./compile/test_full_graph.py
    - pytest -v -s ./compile/test_wrapper.py
- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-    - pytest -v -s models -m vlm
 - label: Prefix Caching Test # 7min
  #mirror_hardwares: [amd]
  source_file_dependencies:
@@ -286,6 +268,45 @@ steps:
  commands:
    - pytest -v -s tool_use
+#####  models test  #####
+- label: Basic Models Test # 3min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+    - pip install -e ./plugins/vllm_add_dummy_model
+    - pytest -v -s models/test_oot_registration.py # it needs a clean process
+    - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+- label: Decoder-only Language Models Test # 1h3min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  commands:
+    - pytest -v -s models/decoder_only/language
+- label: Decoder-only Multi-Modal Models Test # 56min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pytest -v -s models/decoder_only/audio_language
+    - pytest -v -s models/decoder_only/vision_language
+- label: Other Models Test # 5min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+    - pytest -v -s models/embedding/language
+    - pytest -v -s models/encoder_decoder/language
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -311,11 +332,11 @@ steps:
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 - label: Distributed Tests (2 GPUs) # 28min
  #mirror_hardwares: [amd]
@@ -328,11 +349,10 @@ steps:
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
+  # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest -v -s distributed/test_multimodal_broadcast.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py

--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore
 We have the following levels of testing for models:
-1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/tree/main/tests/models>`_ for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,5 +85,6 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
    "skip_global_cleanup",
-    "vlm: run tests for vision language models only",
+    "core_model: run this model test in each PR instead of just daily",
+    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
 ]
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -15,12 +15,15 @@ from vllm.utils import is_hip
 from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
 from ..models.utils import check_outputs_equal
+from ..utils import multi_gpu_test
 MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
 ]
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
@@ -70,6 +73,65 @@ def test_models(
    )
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model, distributed_executor_backend, attention_backend, "
+    "test_suite", [
+        ("facebook/opt-125m", "ray", "", "L4"),
+        ("facebook/opt-125m", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("facebook/opt-125m", "ray", "", "A100"),
+        ("facebook/opt-125m", "mp", "", "A100"),
+        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    ])
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+    attention_backend: str,
+    test_suite: str,
+) -> None:
+    if test_suite != TARGET_TEST_SUITE:
+        pytest.skip(f"Skip test for {test_suite}")
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+    if attention_backend:
+        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+    dtype = "half"
+    max_tokens = 5
+    # NOTE: The order matters: run vLLM first, and then run HF.
+    # vLLM needs a fresh process in which CUDA has not been initialized.
+    # If HF runs first, CUDA gets initialized, which interferes with the
+    # multiprocessing backend when it uses fork (the default start method).
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 def test_model_with_failure(vllm_runner) -> None:
    try:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -6,11 +6,13 @@ prefill requests are chunked.
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
+import os
 from contextlib import nullcontext
 import pytest
 from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..utils import multi_gpu_test
 MODELS = [
    "facebook/opt-125m",
@@ -66,6 +68,59 @@ def test_models(
    )
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", MODELS)
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+) -> None:
+    if (model == "meta-llama/Llama-2-7b-hf"
+            and distributed_executor_backend == "ray"):
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+    # Add a chunked prefill config.
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    assert chunked_prefill_token_size != -1
+    enable_chunked_prefill = True
+    max_num_batched_tokens = chunked_prefill_token_size
+    # NOTE: The order matters: run vLLM first, and then run HF.
+    # vLLM needs a fresh process in which CUDA has not been initialized.
+    # If HF runs first, CUDA gets initialized, which interferes with the
+    # multiprocessing backend when it uses fork (the default start method).
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 @pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -19,10 +19,13 @@ MODELS = [
    "facebook/opt-125m",
 ]
-assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-    "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+@pytest.fixture(scope="module", autouse=True)
-    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+def check_settings():
-    "tests/basic_correctness/test_preemption.py`")
+    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+        "tests/basic_correctness/test_preemption.py`")
 @pytest.fixture

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,8 +6,8 @@ import sys
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
+from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
-                    TypeVar, Union)
+                    TypedDict, TypeVar, Union)
 import numpy as np
 import pytest
@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                          BatchFeature)
+from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
@@ -260,7 +261,7 @@ class HfRunner:
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
        is_embedding_model: bool = False,
-        auto_cls=AutoModelForCausalLM,
+        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[[BatchEncoding],
                                     BatchEncoding] = identity,
    ) -> None:
@@ -292,20 +293,14 @@ class HfRunner:
            trust_remote_code=True,
        )
-        try:
+        # don't put this import at the top level
-            # don't put this import at the top level
+        # it will call torch.cuda.device_count()
-            # it will call torch.cuda.device_count()
+        from transformers import AutoProcessor  # noqa: F401
-            from transformers import AutoProcessor  # noqa: F401
+        self.processor = AutoProcessor.from_pretrained(
-            self.processor = AutoProcessor.from_pretrained(
+            model_name,
-                model_name,
+            torch_dtype=torch_dtype,
-                torch_dtype=torch_dtype,
+            trust_remote_code=True,
-                trust_remote_code=True,
+        )
-            )
-        except Exception as exc:
-            logger.warning(
-                "Unable to auto-load HuggingFace processor for model (%s). "
-                "Using tokenizer instead. Reason: %s", model_name, exc)
-            self.processor = self.tokenizer
        self.postprocess_inputs = postprocess_inputs

--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-cd $VLLM_PATH/tests
-pytest distributed/test_basic_distributed_correctness.py
-```
-"""
-import os
-import pytest
-from vllm.utils import cuda_device_count_stateless
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, "
-    "test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
-    ])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-    attention_backend: str,
-    test_suite: str,
-) -> None:
-    if test_suite != TARGET_TEST_SUITE:
-        pytest.skip(f"Skip test for {test_suite}")
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-    dtype = "half"
-    max_tokens = 5
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py
+++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py
-"""For encoder/decoder models only:
-Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-cd $VLLM_PATH/tests
-pytest distributed/test_basic_distributed_correctness_enc_dec.py
-```
-"""
-import pytest
-from transformers import AutoModelForSeq2SeqLM
-from vllm.utils import cuda_device_count_stateless
-from ..conftest import DecoderPromptType
-from ..models.utils import check_logprobs_close
-from ..utils import fork_new_process_for_each_test
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/bart-large-cnn", "ray"),
-    ("facebook/bart-large-cnn", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    model: str,
-    distributed_executor_backend: str,
-    hf_runner,
-    vllm_runner,
-    example_encoder_decoder_prompts,
-) -> None:
-    '''
-    Test vLLM BART inference on more than one GPU, comparing
-    outputs against HF as a baseline.
-    Fork a new process for each test, to prevent CUDA from
-    being re-initialized by successive tests within the same
-    process.
-    Arguments:
-    * model: the HF ID of the specific BART variant under test
-    * distributed_executor_backend
-    * hf_runner: HuggingFace (HF) test model runner
-    * vllm_runner: vLLM test model runner
-    * example_encoder_decoder_prompts: test fixture which provides a 
-                                        dictionary of dummy prompts
-    '''
-    dtype = "float"
-    max_tokens = 64
-    num_logprobs = 5
-    # Example inputs with non-trivial (i.e. not None/empty) encoder &
-    # decoder prompts.
-    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-            test_prompts, max_tokens, num_logprobs)
-    # Configuration settings for HF baseline
-    hf_kwargs = {
-        "top_k": None,
-        "num_beams": 1,
-        "repetition_penalty": 1.0,
-        "top_p": 1.0,
-        "length_penalty": 1.0,
-        "early_stopping": False,
-        "no_repeat_ngram_size": None,
-        "min_length": 0
-    }
-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
-        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-            test_prompts,
-            max_tokens,
-            num_logprobs,
-            **hf_kwargs,
-        ))
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-pytest test_chunked_prefill_distributed.py
-```
-"""
-import os
-import pytest
-from vllm.utils import cuda_device_count_stateless
-from ..models.utils import check_outputs_equal
-from ..utils import fork_new_process_for_each_test
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("facebook/opt-125m", "ray"),
-    ("meta-llama/Llama-2-7b-hf", "ray"),
-    ("facebook/opt-125m", "mp"),
-    ("meta-llama/Llama-2-7b-hf", "mp"),
-])
-@fork_new_process_for_each_test
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    distributed_executor_backend: str,
-) -> None:
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray":  # noqa
-        assert distributed_executor_backend == "ray"
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
 import os
-import torch
+import torch.distributed as dist
 from vllm.distributed.parallel_state import in_the_same_node_as
-torch.distributed.init_process_group(backend="gloo")
+if __name__ == "__main__":
-test_result = all(
+    dist.init_process_group(backend="gloo")
-    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
+    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
-expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-assert test_result == expected, f"Expected {expected}, got {test_result}"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
-print("Same node test passed!")
+    print("Same node test passed!")
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -10,7 +10,6 @@ import pytest
 import torch
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersBackend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                        make_tensor_with_pad)
@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
    * Backend instance
    '''
    if backend_name == STR_XFORMERS_ATTN_VAL:
+        # NOTE: The xFormers backend cannot be imported on CPU or AMD GPUs.
+        from vllm.attention.backends.xformers import XFormersBackend
        return XFormersBackend()
    raise AssertionError(
        f"Unrecognized backend_name {backend_name} for unit test")

--- a/tests/models/decoder_only/__init__.py
+++ b/tests/models/decoder_only/__init__.py
--- a/tests/models/decoder_only/audio_language/__init__.py
+++ b/tests/models/decoder_only/audio_language/__init__.py
--- a/tests/models/test_ultravox.py
+++ b/tests/models/test_ultravox.py
@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ..conftest import HfRunner, VllmRunner
+from ....conftest import HfRunner, VllmRunner
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 MODEL_NAME = "fixie-ai/ultravox-v0_3"

--- a/tests/models/decoder_only/language/__init__.py
+++ b/tests/models/decoder_only/language/__init__.py
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
 import pytest
 import torch
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 MODELS = [
    "meta-llama/Llama-2-7b-hf",

--- a/tests/models/test_danube3_4b.py
+++ b/tests/models/test_danube3_4b.py
@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
 """
 import pytest
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 MODELS = ["h2oai/h2o-danube3-4b-base"]