Bug/test eagle dp v0 (#38938)

Signed-off-by: Monishver Chandrasekaran <monishverchandrasekaran@gmail.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>

Bug/test eagle dp v0 (#38938)
Signed-off-by: Monishver Chandrasekaran <monishverchandrasekaran@gmail.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
8213e8f8 · Monishver · GitHub · 3693f922 · 8213e8f8 · 8213e8f8
Unverified Commit 8213e8f8 authored Apr 13, 2026 by Monishver Committed by GitHub Apr 13, 2026
3 changed files
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -196,6 +196,7 @@ steps:
    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s tests/v1/distributed/test_eagle_dp.py

 - label: Distributed Tests (2 GPUs)(B200)
  device: b200

--- a/tests/v1/distributed/test_eagle_dp.py
+++ b/tests/v1/distributed/test_eagle_dp.py
@@ -20,16 +20,14 @@ if current_platform.is_rocm():
 else:
    ATTN_BACKENDS = ["FLASH_ATTN"]

+# On SM<90 (e.g., L4), batch invariance does not support CUDA graphs.
+# See https://github.com/vllm-project/vllm/pull/30018 and
+# tests/v1/determinism/utils.py for the documented limitation.
+IS_DEVICE_CAPABILITY_BELOW_90 = not current_platform.has_device_capability(90)
+

 @pytest.mark.asyncio
 @pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
-@pytest.mark.xfail(
-    not current_platform.is_rocm(),
-    reason="EAGLE + DP > 1 produces wrong outputs when async spec decode "
-    "correction is active. Root cause under investigation. "
-    "See: https://github.com/vllm-project/vllm/issues/31913",
-    strict=False,
-)
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="Test may fail on ROCm until batch invariance is enabled. "
@@ -57,7 +55,7 @@ async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch, attn_backend: str):
    engine_args = AsyncEngineArgs(
        model=target_model,
        tokenizer_mode="auto",
-        enforce_eager=False,
+        enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
        tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
        data_parallel_size=DP_SIZE,
        data_parallel_backend="mp",  # ray takes more time

--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -8,6 +8,7 @@ import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter

+import vllm.envs as envs
 from vllm.distributed import (
    divide,
    get_tensor_model_parallel_rank,
@@ -15,6 +16,9 @@ from vllm.distributed import (
    tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.custom_op import PluggableLayer
+from vllm.model_executor.layers.batch_invariant import (
+    linear_batch_invariant,
+)
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
@@ -66,6 +70,8 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
+        if envs.VLLM_BATCH_INVARIANT and current_platform.is_cuda_alike():
+            return linear_batch_invariant(x, layer.weight, bias)
        return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)

    def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: