[CI] Add Decode Context Parallelism (DCP) test to CI (#24487)

Signed-off-by: Ming Yang <minos.future@gmail.com>

[CI] Add Decode Context Parallelism (DCP) test to CI (#24487)
Signed-off-by: Ming Yang <minos.future@gmail.com>
4e5affea · Ming Yang · GitHub · e4f0b4cd · 4e5affea · 4e5affea
Unverified Commit 4e5affea authored Sep 16, 2025 by Ming Yang Committed by GitHub Sep 16, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 7 deletions

.buildkite/test-pipeline.yaml .buildkite/test-pipeline.yaml +14 -3

tests/distributed/test_context_parallel.py tests/distributed/test_context_parallel.py +7 -4

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -946,7 +946,6 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
-  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 - label: LoRA TP Test (Distributed) # 17 min
  timeout_in_minutes: 30
@@ -1020,9 +1019,21 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/distributed/test_context_parallel.py
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -71,12 +71,13 @@ class CPTestSettings:
        parallel_setups = []
        for eager_mode_val in [False]:
            for pp_multiplier in [1]:
-                for dcp_multiplier in [2, 4]:
+                for dcp_multiplier in [0.5, 1]:
                    for chunked_prefill_val in [True]:
                        parallel_setups.append(
                            ParallelSetup(tp_size=tp_base,
                                          pp_size=pp_multiplier * pp_base,
-                                          dcp_size=dcp_multiplier * dcp_base,
+                                          dcp_size=int(dcp_multiplier *
+                                                       tp_base),
                                          eager_mode=eager_mode_val,
                                          chunked_prefill=chunked_prefill_val))
        return CPTestSettings(
@@ -223,7 +224,9 @@ def _compare_cp_with_tp(
 CP_TEXT_GENERATION_MODELS = {
    # [MLA attention only]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat":
+    [CPTestSettings.detailed(),
+     CPTestSettings.detailed(tp_base=2)],
 }
 CP_TEST_MODELS = [
@@ -238,7 +241,7 @@ CP_TEST_MODELS = [
     "runner", "test_options"),
    [
        params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_id)
+        for setting in settings for params in setting.iter_params(model_id)
        if model_id in CP_TEST_MODELS
    ],
 )