[Logs] Change flashinfer sampler logs to once (#21759)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by: mgoin <mgoin64@gmail.com>
34a20c49 · Michael Goin · GitHub · 31084b3b · 34a20c49
Unverified commit 34a20c49, authored Jul 28, 2025 by Michael Goin; committed by GitHub on Jul 28, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 8 additions and 7 deletions

vllm/v1/sample/ops/topk_topp_sampler.py vllm/v1/sample/ops/topk_topp_sampler.py +8 -7

No files found.
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -33,7 +33,7 @@ class TopKTopPSampler(nn.Module):
            if is_flashinfer_available:
                flashinfer_version = flashinfer.__version__
                if flashinfer_version < "0.2.3":
-                    logger.warning(
+                    logger.warning_once(
                        "FlashInfer version >= 0.2.3 required. "
                        "Falling back to default sampling implementation.")
                    self.forward = self.forward_native
@@ -46,17 +46,18 @@ class TopKTopPSampler(nn.Module):
                    # None means False, while in V1, None means True. This is
                    # why we use the condition
                    # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
-                    logger.info("Using FlashInfer for top-p & top-k sampling.")
+                    logger.info_once(
+                        "Using FlashInfer for top-p & top-k sampling.")
                    self.forward = self.forward_cuda
                else:
-                    logger.warning(
+                    logger.warning_once(
                        "FlashInfer is available, but it is not enabled. "
                        "Falling back to the PyTorch-native implementation of "
                        "top-p & top-k sampling. For the best performance, "
                        "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
                    self.forward = self.forward_native
            else:
-                logger.warning(
+                logger.warning_once(
                    "FlashInfer is not available. Falling back to the PyTorch-"
                    "native implementation of top-p & top-k sampling. For the "
                    "best performance, please install FlashInfer.")
@@ -97,9 +98,9 @@ class TopKTopPSampler(nn.Module):
            probs = logits.softmax(dim=-1, dtype=torch.float32)
            return random_sample(probs, generators)
        if generators:
-            logger.warning("FlashInfer 0.2.3+ does not support "
+            logger.warning_once("FlashInfer 0.2.3+ does not support "
-                           "per-request generators. Falling back to "
+                                "per-request generators. Falling back to "
-                           "PyTorch-native implementation.")
+                                "PyTorch-native implementation.")
            return self.forward_native(logits, generators, k, p)
        # flashinfer sampling functions expect contiguous logits.
        # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous