[CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

[CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
67cee40d · Li, Jiang · GitHub · d99c3a4f · 67cee40d · 67cee40d
Unverified commit 67cee40d, authored Aug 28, 2025 by Li, Jiang; committed by GitHub on Aug 28, 2025.
Showing 2 changed files with 14 additions and 14 deletions

.buildkite/scripts/hardware_ci/run-cpu-test.sh (+10 -10)

vllm/model_executor/models/utils.py (+4 -4)

No files found.
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -49,23 +49,23 @@ function cpu_tests() {
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

    # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py

-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"
@@ -73,20 +73,20 @@ function cpu_tests() {
  # Run compressed-tensor test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -s -v \
+    pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
  #   tests/quantization/test_ipex_quant.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -s -v \
+    pytest -x -s -v \
    tests/lora/test_qwen2vl.py"

  # online serving

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -507,9 +507,9 @@ def merge_multimodal_embeddings(
        This updates ``inputs_embeds`` in place.
    """
    if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(placeholder_token_id,
-                                            pin_memory=True).to(
-                                                device=input_ids.device,
+        placeholder_token_id = torch.tensor(
+            placeholder_token_id,
+            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
                                                     non_blocking=True)
        return _merge_multimodal_embeddings(
            inputs_embeds,