[All Reduce] Change default backend of Flashinfer All Reduce to trtllm (#35793)

Signed-off-by: hjjq <hanjieq@nvidia.com>

[All Reduce] Change default backend of Flashinfer All Reduce to trtllm (#35793)
Signed-off-by: hjjq <hanjieq@nvidia.com>
96fc0950 · Hanjie Qiu · GitHub · 1b82b433 · 96fc0950
Unverified Commit 96fc0950 authored Mar 02, 2026 by Hanjie Qiu Committed by GitHub Mar 02, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

vllm/envs.py vllm/envs.py +5 -2

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -168,7 +168,7 @@ if TYPE_CHECKING:
    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
        "latency"
    )
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "trtllm"
    VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
    VLLM_XGRAMMAR_CACHE_MB: int = 0
    VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
@@ -1297,9 +1297,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Flashinfer fused allreduce backend.
    # "auto" will default to "mnnvl", which performs mostly same/better than "trtllm".
    # But "mnnvl" backend does not support fuse with quantization.
+    # TODO: The default is "trtllm" for now because "mnnvl" has issues with
+    # CUDA graphs: https://github.com/vllm-project/vllm/issues/35772
+    # Switch the default back to "auto" once that issue is resolved.
    "VLLM_FLASHINFER_ALLREDUCE_BACKEND": env_with_choices(
        "VLLM_FLASHINFER_ALLREDUCE_BACKEND",
-        "auto",
+        "trtllm",
        ["auto", "trtllm", "mnnvl"],
    ),
    # Control the workspace buffer size for the FlashInfer backend.