[CI/Build] Improve stability of CPU tests (#39966)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

[CI/Build] Improve stability of CPU tests (#39966)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
324a3d2b · Li, Jiang · GitHub · 4269b794 · 324a3d2b · 324a3d2b
Unverified commit 324a3d2b, authored Apr 16, 2026 by Li, Jiang; committed by GitHub on Apr 16, 2026.
4 changed files
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -46,7 +46,7 @@ steps:
  - tests/models/language/pooling/
  commands:
    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 40m "
      pytest -x -v -s tests/models/language/generation -m cpu_model
      pytest -x -v -s tests/models/language/pooling -m cpu_model"

@@ -99,7 +99,7 @@ steps:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
-  parallelism: 2
+  parallelism: 3

 - label: "Arm CPU Test"
  depends_on: []

--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -100,7 +100,7 @@ AITER_MODEL_LIST = [
        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
        pytest.param(
            "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
        ),
        pytest.param("swiss-ai/Apertus-8B-Instruct-2509"),  # apertus
        pytest.param(
@@ -143,9 +143,9 @@ def test_models(
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

-    if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
-        # This untrained model is sensitive to the rounding error
-        # Fuse ops to reduce bfloat16 rounding
+    if current_platform.is_cpu() and model in ("openai-community/gpt2",):
+        # These models are sensitive to the rounding error
+        # Fuse ops to reduce rounding
        monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")

    with hf_runner(model) as hf_model:

--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
@@ -15,6 +15,7 @@ MODELS = [
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.cpu_model
 def test_models(
    hf_runner,
    vllm_runner,

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -242,6 +242,7 @@ class CpuPlatform(Platform):
                    "cpp.dynamic_threads": True,
                }
            )
+            compilation_config.ir_enable_torch_wrap = False

        if vllm_config.lora_config is not None:
            compilation_config.mode = CompilationMode.NONE