[CI/Testing] Add basic single node dual batch overlap test (#27235)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>

[CI/Testing] Add basic single node dual batch overlap test (#27235)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
4bc400f4 · Lucas Wilkinson · GitHub · cac4c10e · 4bc400f4 · 4bc400f4
Unverified Commit 4bc400f4 authored Nov 04, 2025 by Lucas Wilkinson Committed by GitHub Nov 03, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 91 additions and 0 deletions

.buildkite/test-pipeline.yaml .buildkite/test-pipeline.yaml +2 -0

tests/v1/distributed/test_dbo.py tests/v1/distributed/test_dbo.py +89 -0

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1223,6 +1223,7 @@ steps:
    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py  

 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1233,6 +1234,7 @@ steps:
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min

--- a/tests/v1/distributed/test_dbo.py
+++ b/tests/v1/distributed/test_dbo.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism.
+
+DBO is specifically designed for DP+EP scenarios to hide communication latency
+by overlapping computation of two batches. This test validates that DBO works
+correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
+"""
+
+import pytest
+
+from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+DP_SIZE = 2
+
+# GSM8K eval configuration
+NUM_QUESTIONS = 256  # Fast eval for CI; but must be large enough to hit dbo thresholds
+NUM_SHOTS = 5  # Few-shot examples
+MIN_ACCURACY = 0.62  # Expected 0.64 with 2% buffer (based on vLLM test data)
+
+# Increase max_num_seqs to trigger DBO for decode batches
+# With 64 seqs, decode batches should exceed the 32 token threshold
+MAX_NUM_SEQS = 64  # Increased from 16 to trigger decode DBO
+
+# DeepEP backends to test
+DEEPEP_BACKENDS = [
+    "deepep_low_latency",
+    "deepep_high_throughput",
+]
+
+
+@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
+def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
+    """
+    Test DBO with DP+EP using GSM8K evaluation.
+    """
+    required_gpus = DP_SIZE
+
+    if num_gpus_available < required_gpus:
+        pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})")
+
+    # Server arguments for DBO + DP + EP
+    server_args = [
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),  # Use larger batch to trigger decode DBO
+        "--trust-remote-code",
+        # Note: Not using --enforce-eager to test DBO's alternate CUDA graph dispatching
+        "--data-parallel-size",
+        str(DP_SIZE),
+        "--enable-expert-parallel",
+        "--enable-dbo",
+        # Fix threshold so we know we trigger DBO
+        "--dbo-decode-token-threshold",
+        "16",
+        "--dbo-prefill-token-threshold",
+        "256",
+        "--all2all-backend",
+        all2all_backend,
+    ]
+
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        server_args,
+        max_wait_seconds=600,  # Allow time for model loading with DP+EP
+    ) as remote_server:
+        # Use host and port directly from RemoteOpenAIServer
+        host = f"http://{remote_server.host}"
+        port = remote_server.port
+
+        # Run GSM8K evaluation
+        results = evaluate_gsm8k(
+            num_questions=NUM_QUESTIONS,
+            num_shots=NUM_SHOTS,
+            host=host,
+            port=port,
+        )
+
+        # Validate accuracy is reasonable
+        accuracy = results["accuracy"]
+        assert accuracy >= MIN_ACCURACY, (
+            f"DBO+DP+EP accuracy too low ({all2all_backend}): "
+            f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
+            f"(correct: {results['num_correct']}/{results['num_questions']})"
+        )