Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f137e58c
Commit
f137e58c
authored
Jul 31, 2025
by
zhuwenwen
Browse files
update List[int] and update num_rejected_tokens
parent
1b78ef29
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
4 deletions
+4
-4
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+3
-3
vllm/v1/attention/backends/utils.py
vllm/v1/attention/backends/utils.py
+1
-1
No files found.
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
f137e58c
...
...
@@ -659,7 +659,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
use_int8_w8a16
:
bool
,
use_int4_w4a16
:
bool
,
per_channel_quant
:
bool
,
block_shape
:
Optional
[
l
ist
[
int
]]
=
None
,
block_shape
:
Optional
[
L
ist
[
int
]]
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
)
->
None
:
assert
topk_weights
is
not
None
or
not
mul_routed_weight
assert
topk_weights
is
None
or
topk_weights
.
stride
(
1
)
==
1
...
...
@@ -1328,7 +1328,7 @@ def flashinfer_fused_moe_blockscale_fp8(
intermediate_size
:
int
,
expert_offset
:
int
,
local_num_experts
:
int
,
block_shape
:
l
ist
[
int
],
block_shape
:
L
ist
[
int
],
routed_scaling
:
float
=
1.0
)
->
torch
.
Tensor
:
from
vllm.utils.flashinfer
import
flashinfer_trtllm_fp8_block_scale_moe
assert
top_k
<=
global_num_experts
...
...
@@ -1381,7 +1381,7 @@ def flashinfer_fused_moe_blockscale_fp8_fake(
intermediate_size
:
int
,
expert_offset
:
int
,
local_num_experts
:
int
,
block_shape
:
l
ist
[
int
],
block_shape
:
L
ist
[
int
],
routed_scaling
:
float
=
1.0
)
->
torch
.
Tensor
:
return
torch
.
empty_like
(
x
)
...
...
vllm/v1/attention/backends/utils.py
View file @
f137e58c
...
...
@@ -55,7 +55,7 @@ class CommonAttentionMetadata:
"""Total number of tokens in batch"""
max_query_len
:
int
"""Longest query in batch"""
num_rejected_tokens
:
list
[
int
]
=
None
num_rejected_tokens
:
list
[
int
]
"""(batch_size,), record the rejected tokens number in cpu and gpu"""
block_table_tensor
:
torch
.
Tensor
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment