Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
319506a5
Commit
319506a5
authored
Mar 02, 2026
by
zhuwenwen
Browse files
Merge branch 'v0.15.1-dev' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.15.1-dev
parents
afe3ea1e
c334b741
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
134 additions
and
9 deletions
+134
-9
vllm/envs.py
vllm/envs.py
+14
-0
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/layernorm.py
+7
-3
vllm/v1/sample/metadata.py
vllm/v1/sample/metadata.py
+4
-0
vllm/v1/sample/ops/topk_topp_sampler.py
vllm/v1/sample/ops/topk_topp_sampler.py
+73
-0
vllm/v1/sample/sampler.py
vllm/v1/sample/sampler.py
+24
-6
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+12
-0
No files found.
vllm/envs.py
View file @
319506a5
...
...
@@ -301,6 +301,7 @@ if TYPE_CHECKING:
VLLM_REJECT_SAMPLE_OPT
:
bool
=
False
VLLM_USE_MOE_W16A16_TRITON
:
bool
=
False
VLLM_V1_FAST_TOKEN_ID_COPY
:
bool
=
False
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
:
bool
=
False
def
get_default_cache_root
():
...
...
@@ -1883,6 +1884,19 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_V1_FAST_TOKEN_ID_COPY"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_V1_FAST_TOKEN_ID_COPY"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set to 1/True, enable reduced top-k/top-p sampling fast path in the
# V1 PyTorch-native sampler path.
#
# Recommended when top_k is enabled together with top_p < 1.0 (nucleus
# sampling). Not recommended for top-k only (top_p == 1.0) due to
# potential behavior differences when the k-th logit is tied.
"VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)
),
}
# --8<-- [end:env-vars-definition]
...
...
vllm/model_executor/layers/layernorm.py
View file @
319506a5
...
...
@@ -458,6 +458,10 @@ class RMSNormGated(CustomOp):
- norm_before_gate=True: out = norm(x) * silu(z)
- norm_before_gate=False: out = norm(x * silu(z))
"""
orig_dtype
=
x
.
dtype
x
=
x
.
float
()
weight
=
self
.
weight
.
float
()
z
=
z
.
float
()
if
z
is
not
None
else
None
# Apply gating before normalization if needed
if
z
is
not
None
and
not
self
.
norm_before_gate
:
x
=
x
*
F
.
silu
(
z
)
...
...
@@ -467,7 +471,7 @@ class RMSNormGated(CustomOp):
# Standard RMS norm across the last dimension
variance
=
x
.
pow
(
2
).
mean
(
dim
=-
1
,
keepdim
=
True
)
x_normed
=
x
*
torch
.
rsqrt
(
variance
+
self
.
eps
)
out
=
x_normed
*
self
.
weight
out
=
x_normed
*
weight
else
:
# Group RMS norm
from
einops
import
rearrange
...
...
@@ -475,13 +479,13 @@ class RMSNormGated(CustomOp):
x_group
=
rearrange
(
x
,
"... (g d) -> ... g d"
,
d
=
self
.
group_size
)
variance
=
x_group
.
pow
(
2
).
mean
(
dim
=-
1
,
keepdim
=
True
)
x_normed
=
x_group
*
torch
.
rsqrt
(
variance
+
self
.
eps
)
out
=
rearrange
(
x_normed
,
"... g d -> ... (g d)"
)
*
self
.
weight
out
=
rearrange
(
x_normed
,
"... g d -> ... (g d)"
)
*
weight
# Apply gating after normalization if needed
if
z
is
not
None
and
self
.
norm_before_gate
:
out
=
out
*
F
.
silu
(
z
)
return
out
return
out
.
to
(
orig_dtype
)
def
forward_cuda
(
self
,
x
:
torch
.
Tensor
,
z
:
torch
.
Tensor
|
None
=
None
...
...
vllm/v1/sample/metadata.py
View file @
319506a5
...
...
@@ -40,5 +40,9 @@ class SamplingMetadata:
# Loaded logits processors
logitsprocs
:
LogitsProcessors
# Optional host-side summaries for top-k fast paths.
max_top_k
:
int
|
None
=
None
has_any_no_top_k
:
bool
=
False
# Speculative token ids
spec_token_ids
:
list
[
list
[
int
]]
|
None
=
None
vllm/v1/sample/ops/topk_topp_sampler.py
View file @
319506a5
...
...
@@ -95,12 +95,44 @@ class TopKTopPSampler(nn.Module):
generators
:
dict
[
int
,
torch
.
Generator
],
k
:
torch
.
Tensor
|
None
,
p
:
torch
.
Tensor
|
None
,
*
,
max_top_k
:
int
|
None
=
None
,
has_any_no_top_k
:
bool
=
False
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
|
None
]:
"""
PyTorch-native implementation of top-k and top-p sampling.
The logits tensor may be updated in-place.
"""
# Fast path: when top-k is enabled, avoid full-vocab sort/softmax by
# sampling from only the reduced candidate set.
if
(
self
.
logprobs_mode
not
in
(
"processed_logits"
,
"processed_logprobs"
)
and
envs
.
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
and
k
is
not
None
and
p
is
not
None
and
max_top_k
is
not
None
and
not
has_any_no_top_k
and
max_top_k
<=
4096
):
try
:
return
(
sample_top_k_top_p_reduced
(
logits
,
generators
,
k
,
p
,
max_top_k
=
max_top_k
,
),
None
,
)
except
Exception
:
# Fall back to the reference implementation for safety.
logger
.
debug_once
(
"Reduced top-k/top-p sampler failed; falling back to the "
"reference implementation."
)
logits
=
self
.
apply_top_k_top_p
(
logits
,
k
,
p
)
logits_to_return
=
None
if
self
.
logprobs_mode
==
"processed_logits"
:
...
...
@@ -332,6 +364,47 @@ def random_sample(
return
probs
.
div_
(
q
).
argmax
(
dim
=-
1
).
view
(
-
1
)
def sample_top_k_top_p_reduced(
    logits: torch.Tensor,
    generators: dict[int, torch.Generator],
    k: torch.Tensor,
    p: torch.Tensor | None,
    *,
    max_top_k: int,
) -> torch.Tensor:
    """Draw one token id per row using only the ``max_top_k`` best logits.

    Instead of masking the whole vocabulary, the ``max_top_k`` largest
    logits of every row are extracted with ``topk``; per-row top-k and
    (optionally) top-p filtering are applied on that reduced set, a position
    is sampled from it, and the position is mapped back to a vocabulary id.

    Args:
        logits: Logits indexed as (row, vocab) -- the code unsqueezes ``k``
            on dim 1 and gathers on dim 1, so a 2-D tensor is expected.
        generators: Passed through unchanged to ``random_sample``.
        k: Per-row top-k values; cast to ``torch.long`` before use.
        p: Per-row top-p values, or ``None`` to skip top-p filtering.
        max_top_k: Number of candidates extracted per row (keyword-only).
            NOTE(review): presumably the maximum of ``k`` over the batch --
            confirm against the caller.

    Returns:
        The sampled vocabulary indices, one per row.
    """
    num_vocab = logits.shape[-1]

    # A candidate count outside (0, vocab_size) defeats the purpose of the
    # reduced path, so mask the full logits and sample from them instead.
    if not 0 < max_top_k < num_vocab:
        full_probs = apply_top_k_top_p(logits, k, p).softmax(
            dim=-1, dtype=torch.float32
        )
        return random_sample(full_probs, generators)

    # topk yields (values, indices).
    cand_logits, cand_ids = logits.topk(max_top_k, dim=-1)

    # Per-row top-k: columns at position >= k[row] are dropped from the
    # candidate set by setting their logit to -inf.
    per_row_k = k.to(torch.long).unsqueeze(1)
    column = torch.arange(max_top_k, device=logits.device).unsqueeze(0)
    beyond_k = column >= per_row_k
    cand_logits = cand_logits.masked_fill(beyond_k, float("-inf"))

    # Probabilities over the reduced candidate set only.
    cand_probs = cand_logits.softmax(dim=-1, dtype=torch.float32)

    if p is not None:
        # Top-p in descending-logit order: keep a candidate while the
        # probability mass accumulated strictly before it is below p.
        mass_before = cand_probs.cumsum(dim=-1) - cand_probs
        cand_probs = cand_probs * (mass_before < p.unsqueeze(1))

    # Sample a position inside the reduced set, then translate it back to
    # the corresponding vocabulary id.
    choice = random_sample(cand_probs, generators)
    return cand_ids.gather(1, choice.unsqueeze(1)).squeeze(1)
def
flashinfer_sample
(
logits
:
torch
.
Tensor
,
k
:
torch
.
Tensor
|
None
,
...
...
vllm/v1/sample/sampler.py
View file @
319506a5
...
...
@@ -5,6 +5,7 @@
import
torch
import
torch.nn
as
nn
from
vllm
import
envs
from
vllm.config.model
import
LogprobsMode
from
vllm.utils.platform_utils
import
is_pin_memory_available
from
vllm.v1.outputs
import
LogprobsTensors
,
SamplerOutput
...
...
@@ -184,6 +185,23 @@ class Sampler(nn.Module):
logits
=
processor
.
apply
(
logits
)
# Apply top_k and/or top_p.
if
(
envs
.
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
and
sampling_metadata
.
top_k
is
not
None
and
sampling_metadata
.
top_p
is
not
None
and
sampling_metadata
.
max_top_k
is
not
None
and
not
sampling_metadata
.
has_any_no_top_k
and
self
.
topk_topp_sampler
.
forward
.
__name__
==
"forward_native"
):
random_sampled
,
processed_logprobs
=
self
.
topk_topp_sampler
(
logits
,
sampling_metadata
.
generators
,
sampling_metadata
.
top_k
,
sampling_metadata
.
top_p
,
max_top_k
=
sampling_metadata
.
max_top_k
,
has_any_no_top_k
=
sampling_metadata
.
has_any_no_top_k
,
)
else
:
random_sampled
,
processed_logprobs
=
self
.
topk_topp_sampler
(
logits
,
sampling_metadata
.
generators
,
...
...
vllm/v1/worker/gpu_input_batch.py
View file @
319506a5
...
...
@@ -812,6 +812,16 @@ class InputBatch:
def
_make_sampling_metadata
(
self
,
repeat_counts
:
Optional
[
torch
.
Tensor
]
=
None
)
->
SamplingMetadata
:
num_reqs
=
self
.
num_reqs
# Host-side summaries for reduced top-k/top-p sampling.
# Compute before copy_slice(top_k), which may rewrite top_k_cpu_tensor
# when repeat_counts is provided.
max_top_k
=
None
has_any_no_top_k
=
False
if
not
self
.
no_top_k
and
num_reqs
>
0
:
top_k_cpu
=
self
.
top_k_cpu
[:
num_reqs
]
max_top_k
=
int
(
top_k_cpu
.
max
())
has_any_no_top_k
=
bool
((
top_k_cpu
==
self
.
vocab_size
).
any
())
if
not
self
.
all_greedy
:
temperature
=
copy_slice
(
self
.
temperature_cpu_tensor
,
self
.
temperature
,
...
...
@@ -889,6 +899,8 @@ class InputBatch:
allowed_token_ids_mask
=
allowed_token_ids_mask
,
bad_words_token_ids
=
self
.
bad_words_token_ids
,
logitsprocs
=
self
.
logitsprocs
,
max_top_k
=
max_top_k
,
has_any_no_top_k
=
has_any_no_top_k
,
)
def
get_pooling_params
(
self
)
->
list
[
PoolingParams
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment