[CI][DCP][Perf] reduce DCP CI execution time (#29858)

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>

[CI][DCP][Perf] reduce DCP CI execution time (#29858)
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
46cbbca0 · Qiu · GitHub · b286a311 · 46cbbca0 · 46cbbca0
Unverified Commit 46cbbca0 authored Dec 05, 2025 by Qiu Committed by GitHub Dec 04, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 100 additions and 94 deletions

tests/distributed/test_context_parallel.py tests/distributed/test_context_parallel.py +95 -93

tests/models/registry.py tests/models/registry.py +5 -1

No files found.
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -16,16 +16,35 @@ from typing import Literal, NamedTuple
 import pytest
 import torch
+from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test
 from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
 from ..models.registry import HF_EXAMPLE_MODELS
-from ..utils import compare_two_settings, create_new_process_for_each_test
 logger = init_logger("test_context_parallel")
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+CP_TEST_MODELS = [
+    # TODO support other models
+    # [LANGUAGE GENERATION]
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+    "Qwen/Qwen2.5-1.5B-Instruct",
+]
+# GSM8K eval configuration
+NUM_QUESTIONS = 256  # Fast eval for CI
+NUM_SHOTS = 5  # Few-shot examples
+# tp accuracy with 2% buffer
+MIN_ACCURACY = {
+    # .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": 0.64,
+    # .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+    "Qwen/Qwen2.5-1.5B-Instruct": 0.52,
+}
 class ParallelSetup(NamedTuple):
    tp_size: int
@@ -38,7 +57,6 @@ class ParallelSetup(NamedTuple):
 class CPTestOptions(NamedTuple):
    multi_node_only: bool
-    load_format: str | None = None
    attn_backend: str | None = None
@@ -54,17 +72,20 @@ class CPTestSettings:
        *,
        tp_base: int = 4,
        pp_base: int = 1,
-        dcp_base: int = 1,
+        dcp_multipliers: list[float] | None = None,
        cp_kv_cache_interleave_size: int = 1,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
-        load_format: str | None = None,
        attn_backend: str | None = None,
    ):
        parallel_setups = []
+        if dcp_multipliers is None:
+            dcp_multipliers = [
+                0.5,
+            ]
        for eager_mode_val in [False]:
            for pp_multiplier in [1]:
-                for dcp_multiplier in [0.5, 1]:
+                for dcp_multiplier in dcp_multipliers:
                    for chunked_prefill_val in [True]:
                        parallel_setups.append(
                            ParallelSetup(
@@ -82,7 +103,6 @@ class CPTestSettings:
            runner=runner,
            test_options=CPTestOptions(
                multi_node_only=multi_node_only,
-                load_format=load_format,
                attn_backend=attn_backend,
            ),
        )
@@ -101,7 +121,24 @@ class CPTestSettings:
                )
-def _compare_cp_with_tp(
+CP_TEXT_GENERATION_MODELS = {
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        CPTestSettings.detailed(
+            dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64
+        ),
+    ],
+    "Qwen/Qwen2.5-1.5B-Instruct": [
+        CPTestSettings.detailed(
+            cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN"
+        ),
+        CPTestSettings.detailed(
+            cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER"
+        ),
+    ],
+}
+def _test_cp_gsm8k(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
@@ -121,7 +158,7 @@ def _compare_cp_with_tp(
        chunked_prefill,
    ) = parallel_setup
-    multi_node_only, load_format, attn_backend = test_options
+    multi_node_only, attn_backend = test_options
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")
@@ -130,22 +167,7 @@ def _compare_cp_with_tp(
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides
-    if load_format == "dummy":
+    model_info.check_available_online(on_fail="skip")
-        # Avoid OOM
-        text_overrides = {
-            "num_hidden_layers": 4,
-            "hidden_size": 512,
-            "intermediate_size": 800,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 1,
-        }
-        if is_multimodal:
-            hf_overrides.update({"text_config": text_overrides})
-        else:
-            hf_overrides.update(text_overrides)
-    else:
-        model_info.check_available_online(on_fail="skip")
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
@@ -157,90 +179,70 @@ def _compare_cp_with_tp(
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")
-    common_args = [
+    server_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
-        "2048",
+        "4096",
        "--max-num-seqs",
-        "8",
+        "64",
    ]
    if chunked_prefill:
-        common_args.append("--enable-chunked-prefill")
+        server_args.append("--enable-chunked-prefill")
    if eager_mode:
-        common_args.append("--enforce-eager")
+        server_args.append("--enforce-eager")
    if runner != "auto":
-        common_args.extend(["--runner", runner])
+        server_args.extend(["--runner", runner])
    if trust_remote_code:
-        common_args.append("--trust-remote-code")
+        server_args.append("--trust-remote-code")
    if tokenizer_mode:
-        common_args.extend(["--tokenizer-mode", tokenizer_mode])
+        server_args.extend(["--tokenizer-mode", tokenizer_mode])
-    if load_format:
-        common_args.extend(["--load-format", load_format])
    if hf_overrides:
-        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+        server_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
-    if not attn_backend:
+    server_args.extend(
-        cp_env = tp_env = {}
+        [
-    else:
+            "--tensor-parallel-size",
-        cp_env = tp_env = {
+            str(tp_size),
-            "VLLM_ATTENTION_BACKEND": attn_backend,
+            "--pipeline-parallel-size",
-        }
+            str(pp_size),
+            "--decode-context-parallel-size",
-    cp_args = [
+            str(dcp_size),
-        *common_args,
+            "--dcp-kv-cache-interleave-size",
-        "--tensor-parallel-size",
+            str(cp_kv_cache_interleave_size),
-        str(tp_size),
+            "--distributed-executor-backend",
-        "--pipeline-parallel-size",
+            distributed_backend,
-        str(pp_size),
+        ]
-        "--decode-context-parallel-size",
+    )
-        str(dcp_size),
-        "--dcp-kv-cache-interleave-size",
-        str(cp_kv_cache_interleave_size),
-        "--distributed-executor-backend",
-        distributed_backend,
-    ]
-    tp_args = [
+    server_env = {}
-        *common_args,
+    if attn_backend:
-        "--tensor-parallel-size",
+        server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
-        str(tp_size),
-        "--pipeline-parallel-size",
-        str(pp_size),
-        "--distributed-executor-backend",
-        distributed_backend,
-    ]
-    compare_two_settings(
+    with RemoteOpenAIServer(
        model_id,
-        cp_args,
+        server_args,
-        tp_args,
+        env_dict=server_env,
-        cp_env,
-        tp_env,
-        method=method,
        max_wait_seconds=720,
-    )
+    ) as remote_server:
+        host = f"http://{remote_server.host}"
+        port = remote_server.port
-CP_TEXT_GENERATION_MODELS = {
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        # Run GSM8K evaluation
-        CPTestSettings.detailed(),
+        results = evaluate_gsm8k(
-        CPTestSettings.detailed(tp_base=2),
+            num_questions=NUM_QUESTIONS,
-        CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
+            num_shots=NUM_SHOTS,
-    ],
+            host=host,
-    "bigcode/gpt_bigcode-santacoder": [
+            port=port,
-        CPTestSettings.detailed(),
+        )
-        CPTestSettings.detailed(tp_base=2),
-    ],
-}
-CP_TEST_MODELS = [
+        # Validate accuracy is reasonable
-    # TODO support other models
+        accuracy = results["accuracy"]
-    # [LANGUAGE GENERATION]
+        min_accuracy = MIN_ACCURACY[model_id]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+        assert accuracy >= min_accuracy, (
-    "bigcode/gpt_bigcode-santacoder",
+            f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}"
-]
+        )
 @pytest.mark.parametrize(
@@ -274,12 +276,12 @@ def test_cp_generation(
    ):
        pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
    if (
-        model_id == "bigcode/gpt_bigcode-santacoder"
+        model_id == "Qwen/Qwen2.5-1.5B-Instruct"
        and torch.cuda.get_device_capability() != (9, 0)
    ):
        pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
-    _compare_cp_with_tp(
+    _test_cp_gsm8k(
        model_id,
        parallel_setup,
        distributed_backend,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        trust_remote_code=True,
    ),
    "Qwen2ForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}
+        "Qwen/Qwen2-0.5B-Instruct",
+        extras={
+            "2.5": "Qwen/Qwen2.5-0.5B-Instruct",
+            "2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
+        },
    ),
    "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
    "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),