[CI/Build] Update CPU tests to include all "standard" tests (#5481)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[CI/Build] Update CPU tests to include all "standard" tests (#5481)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
b489fc3c · Cyrus Leung · GitHub · 208ce622 · b489fc3c · b489fc3c
Unverified commit b489fc3c, authored Nov 08, 2024 by Cyrus Leung; committed by GitHub on Nov 08, 2024
14 changed files
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" \
-    --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_mamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
+  pytest -v -s tests/models/encoder_decoder/language
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill is not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # online inference
 docker exec cpu-test bash -c "
+  set -e
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  python3 benchmarks/benchmark_serving.py \

--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

 # offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "
+  set -e
+  python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
  pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/decoder_only/language/test_jamba.py \
-    --ignore=tests/models/decoder_only/language/test_mamba.py \
-    --ignore=tests/models/decoder_only/language/test_granitemoe.py \
-    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill is not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # Run compressed-tensor test
 docker exec cpu-test bash -c "
+  set -e
  pytest -s -v \
  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

 # Run AWQ test
 docker exec cpu-test bash -c "
+  set -e
  pytest -s -v \
  tests/quantization/test_ipex_quant.py"

 # online inference
 docker exec cpu-test bash -c "
+  set -e
  export VLLM_CPU_KVCACHE_SPACE=10 
  export VLLM_CPU_OMP_THREADS_BIND=48-92 
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -269,7 +269,6 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - pip install aiohttp
  - bash run-benchmarks.sh

 - label: Quantization Test # 33min
@@ -331,7 +330,7 @@ steps:
  commands:
    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
+- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,7 +93,8 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
    "skip_global_cleanup",
-    "core_model: run this model test in each PR instead of just daily",
+    "core_model: enable this model test in each PR instead of only nightly",
+    "cpu_model: enable this model test in CPU tests",
    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
    "skip_v1: do not run this test with v1",
 ]
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -12,9 +12,7 @@ decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
-opencv-python # required for video tests
 peft
-requests
 ray[adag]==2.35
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

-# Benchmarking
-aiohttp
-
 # quantization
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.9

--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,11 +5,11 @@ import pytest
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding

-from tests.utils import RemoteOpenAIServer
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import HfRunner, VllmRunner
+from ....utils import RemoteOpenAIServer
 from ...utils import check_logprobs_close

 MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
    return AudioAsset(request.param)


-@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+@pytest.fixture(params=[
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def server(request, audio_assets):
    args = [
        "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                     max_tokens: int, num_logprobs: int,
                                     vllm_kwargs: dict) -> None:

--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -14,7 +14,6 @@ models = [
    "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
    "h2oai/h2ovl-mississippi-2b",
 ]
-target_dtype = "bfloat16"


 def run_preprocessing_test(

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,7 @@ VLM_TEST_SETTINGS = {
            ),
            limit_mm_per_prompt={"image": 4},
        )],
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@ VLM_TEST_SETTINGS = {
            "pixel_values"
        ),
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="half" if current_platform.is_rocm() else ("half", "float"),
+        dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
+               else ("half", "float")),
        marks=[pytest.mark.core_model],
    ),
    "qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@ VLM_TEST_SETTINGS = {
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
    #### Extended model tests
@@ -172,7 +173,6 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
    "glm4": VLMTestInfo(
@@ -245,7 +245,6 @@ VLM_TEST_SETTINGS = {
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
-        dtype="half",
        num_video_frames=16,
        max_model_len=16384,
        postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@ VLM_TEST_SETTINGS = {
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
@@ -419,7 +417,6 @@ VLM_TEST_SETTINGS = {
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
-        dtype="half",
        postprocess_inputs=model_utils.get_key_type_post_processor(
            "pixel_values"
        ),

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


 target_dtype = "half"
-if current_platform.is_cpu():
-    target_dtype = "bfloat16"

 # ROCm Triton FA can run into shared memory issues with these models,
 # use other backends in the meantime

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -5,7 +5,6 @@ import torch

 from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
-from vllm.platforms import current_platform
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

 TokensText = Tuple[List[int], str]
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
    if tokenizer_name is None:
        tokenizer_name = model_name
    if dtype is None:
-        dtype = "bfloat16" if current_platform.is_cpu() else "half"
+        dtype = "half"

    model_config = ModelConfig(
        model_name,

--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -27,4 +27,4 @@ class ImageAsset:
        """
        image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                            s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path)
+        return torch.load(image_path, map_location="cpu")
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
        if sr != feature_extractor.sampling_rate:
            try:
                import librosa
-            except ImportError:
+            except ImportError as exc:
                raise ImportError(
-                    "Please install vllm[audio] for audio support.") from None
+                    "Please install vllm[audio] for audio support.") from exc
            audio = librosa.resample(audio,
                                     orig_sr=sr,
                                     target_sr=feature_extractor.sampling_rate)

--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
    try:
        import librosa
        import soundfile
-    except ImportError:
+    except ImportError as exc:
        raise ImportError(
-            "Please install vllm[audio] for audio support.") from None
+            "Please install vllm[audio] for audio support.") from exc
    return librosa, soundfile


@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
    try:
        import cv2
        import decord
-    except ImportError:
+    except ImportError as exc:
        raise ImportError(
-            "Please install vllm[video] for video support.") from None
+            "Please install vllm[video] for video support.") from exc
    return cv2, decord



--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -151,7 +151,11 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
            self.local_omp_cpuid = omp_cpuids.split("|")[rank]

        ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
-        if self.model_config.is_encoder_decoder:
+        if self.model_config.task == "embedding":
+            raise NotImplementedError(
+                "Embedding models are not supported for CPU backend")
+            # ModelRunnerClass = CPUEmbeddingModelRunner
+        elif self.model_config.is_encoder_decoder:
            ModelRunnerClass = CPUEncoderDecoderModelRunner
        self.model_runner: CPUModelRunner = ModelRunnerClass(
            vllm_config=vllm_config,