Disable FlashInfer sampler by default (#26859)

Signed-off-by: mgoin <mgoin64@gmail.com>

Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
e66d787b · Michael Goin · GitHub · bfad142e · e66d787b
Unverified commit e66d787b, authored Oct 14, 2025 by Michael Goin; committed by GitHub on Oct 15, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 14 deletions

vllm/v1/sample/ops/topk_topp_sampler.py vllm/v1/sample/ops/topk_topp_sampler.py +6 -14

No files found.
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module):
                        "Falling back to default sampling implementation."
                    )
                    self.forward = self.forward_native
-                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER:
-                    # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
+                    # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
-                    # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
-                    # default it is unused). For backward compatibility, we set
-                    # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
-                    # interpret it differently in V0 and V1 samplers: In V0,
-                    # None means False, while in V1, None means True. This is
-                    # why we use the condition
-                    # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
                    logger.info_once("Using FlashInfer for top-p & top-k sampling.")
                    self.forward = self.forward_cuda
                else:
-                    logger.warning_once(
+                    logger.debug_once(
-                        "FlashInfer is available, but it is not enabled. "
+                        "FlashInfer top-p/top-k sampling is available but disabled "
-                        "Falling back to the PyTorch-native implementation of "
+                        "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
-                        "top-p & top-k sampling. For the best performance, "
+                        "after verifying accuracy for your workloads."
-                        "please set VLLM_USE_FLASHINFER_SAMPLER=1."
                    )
                    self.forward = self.forward_native
            else: