Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
34a20c49
Unverified
Commit
34a20c49
authored
Jul 28, 2025
by
Michael Goin
Committed by
GitHub
Jul 28, 2025
Browse files
[Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
31084b3b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
8 additions
and
7 deletions
+8
-7
vllm/v1/sample/ops/topk_topp_sampler.py
vllm/v1/sample/ops/topk_topp_sampler.py
+8
-7
No files found.
vllm/v1/sample/ops/topk_topp_sampler.py
View file @
34a20c49
...
...
@@ -33,7 +33,7 @@ class TopKTopPSampler(nn.Module):
if
is_flashinfer_available
:
flashinfer_version
=
flashinfer
.
__version__
if
flashinfer_version
<
"0.2.3"
:
logger
.
warning
(
logger
.
warning
_once
(
"FlashInfer version >= 0.2.3 required. "
"Falling back to default sampling implementation."
)
self
.
forward
=
self
.
forward_native
...
...
@@ -46,17 +46,18 @@ class TopKTopPSampler(nn.Module):
# None means False, while in V1, None means True. This is
# why we use the condition
# `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
logger
.
info
(
"Using FlashInfer for top-p & top-k sampling."
)
logger
.
info_once
(
"Using FlashInfer for top-p & top-k sampling."
)
self
.
forward
=
self
.
forward_cuda
else
:
logger
.
warning
(
logger
.
warning
_once
(
"FlashInfer is available, but it is not enabled. "
"Falling back to the PyTorch-native implementation of "
"top-p & top-k sampling. For the best performance, "
"please set VLLM_USE_FLASHINFER_SAMPLER=1."
)
self
.
forward
=
self
.
forward_native
else
:
logger
.
warning
(
logger
.
warning
_once
(
"FlashInfer is not available. Falling back to the PyTorch-"
"native implementation of top-p & top-k sampling. For the "
"best performance, please install FlashInfer."
)
...
...
@@ -97,9 +98,9 @@ class TopKTopPSampler(nn.Module):
probs
=
logits
.
softmax
(
dim
=-
1
,
dtype
=
torch
.
float32
)
return
random_sample
(
probs
,
generators
)
if
generators
:
logger
.
warning
(
"FlashInfer 0.2.3+ does not support "
"per-request generators. Falling back to "
"PyTorch-native implementation."
)
logger
.
warning
_once
(
"FlashInfer 0.2.3+ does not support "
"per-request generators. Falling back to "
"PyTorch-native implementation."
)
return
self
.
forward_native
(
logits
,
generators
,
k
,
p
)
# flashinfer sampling functions expect contiguous logits.
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment