Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1217257c
Commit
1217257c
authored
Dec 25, 2025
by
zhuwenwen
Browse files
fix run error
parent
8301427e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
20 additions
and
20 deletions
+20
-20
vllm/attention/ops/flashmla.py
vllm/attention/ops/flashmla.py
+4
-2
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+0
-2
vllm/model_executor/layers/rotary_embedding/__init__.py
vllm/model_executor/layers/rotary_embedding/__init__.py
+10
-10
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+3
-4
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+1
-0
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+2
-2
No files found.
vllm/attention/ops/flashmla.py
View file @
1217257c
...
@@ -226,14 +226,16 @@ def flash_mla_with_kvcache(
...
@@ -226,14 +226,16 @@ def flash_mla_with_kvcache(
out
,
softmax_lse
=
flash_mla_cuda
.
fwd_kvcache_mla
(
out
,
softmax_lse
=
flash_mla_cuda
.
fwd_kvcache_mla
(
q
,
q
,
k_cache
,
k_cache
,
None
,
head_dim_v
,
head_dim_v
,
cache_seqlens
,
cache_seqlens
,
block_table
,
block_table
,
softmax_scale
,
softmax_scale
,
causal
,
causal
,
tile_scheduler_metadata
,
tile_scheduler_metadata
,
num_splits
)
num_splits
,
is_fp8_kvcache
,
indices
,
)
else
:
else
:
out
,
softmax_lse
=
torch
.
ops
.
_flashmla_C
.
fwd_kvcache_mla
(
out
,
softmax_lse
=
torch
.
ops
.
_flashmla_C
.
fwd_kvcache_mla
(
q
,
q
,
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
1217257c
...
@@ -2062,8 +2062,6 @@ class FusedMoE(CustomOp):
...
@@ -2062,8 +2062,6 @@ class FusedMoE(CustomOp):
router_logits
=
router_logits
,
router_logits
=
router_logits
,
use_nn_moe
=
self
.
use_nn_moe
,
use_nn_moe
=
self
.
use_nn_moe
,
use_fused_gate
=
self
.
use_fused_gate
,
use_fused_gate
=
self
.
use_fused_gate
,
use_nn_moe
=
self
.
use_nn_moe
,
use_fused_gate
=
self
.
use_fused_gate
,
i_q
=
i_q
,
i_q
=
i_q
,
i_s
=
i_s
,
i_s
=
i_s
,
)
)
...
...
vllm/model_executor/layers/rotary_embedding/__init__.py
View file @
1217257c
vllm/platforms/rocm.py
View file @
1217257c
...
@@ -228,11 +228,10 @@ class RocmPlatform(Platform):
...
@@ -228,11 +228,10 @@ class RocmPlatform(Platform):
logger
.
info_once
(
"Using Sparse MLA backend on V1 engine."
)
logger
.
info_once
(
"Using Sparse MLA backend on V1 engine."
)
return
AttentionBackendEnum
.
ROCM_AITER_MLA_SPARSE
.
get_path
()
return
AttentionBackendEnum
.
ROCM_AITER_MLA_SPARSE
.
get_path
()
if
use_mla
:
if
attn_selector_config
.
use_mla
:
# if use_sparse:
# if
attn_selector_config.
use_sparse:
# logger.info_once("Using Sparse MLA backend on V1 engine.")
# logger.info_once("Using Sparse MLA backend on V1 engine.")
# return ("vllm.v1.attention.backends.mla.flashmla_sparse."
# return AttentionBackendEnum.FLASHMLA_SPARSE.get_path()
# "FlashMLASparseBackend")
use_flashmla
=
selected_backend
==
AttentionBackendEnum
.
FLASHMLA
or
envs
.
VLLM_USE_FLASH_MLA
use_flashmla
=
selected_backend
==
AttentionBackendEnum
.
FLASHMLA
or
envs
.
VLLM_USE_FLASH_MLA
use_triton
=
selected_backend
==
AttentionBackendEnum
.
TRITON_MLA
or
(
use_triton
=
selected_backend
==
AttentionBackendEnum
.
TRITON_MLA
or
(
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
1217257c
...
@@ -56,6 +56,7 @@ from vllm.v1.attention.backends.utils import (
...
@@ -56,6 +56,7 @@ from vllm.v1.attention.backends.utils import (
get_kv_cache_layout
,
get_kv_cache_layout
,
)
)
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.v1.kv_cache_interface
import
AttentionSpec
import
vllm.envs
as
envs
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
...
vllm/v1/worker/gpu_worker.py
View file @
1217257c
...
@@ -81,8 +81,8 @@ class Worker(WorkerBase):
...
@@ -81,8 +81,8 @@ class Worker(WorkerBase):
)
)
# configure float32 matmul precision according to vLLM env.
# configure float32 matmul precision according to vLLM env.
precision
=
envs
.
VLLM_FLOAT32_MATMUL_PRECISION
#
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch
.
backends
.
cuda
.
matmul
.
fp32_precision
=
precision
#
torch.backends.cuda.matmul.fp32_precision = precision
if
self
.
model_config
.
trust_remote_code
:
if
self
.
model_config
.
trust_remote_code
:
# note: lazy import to avoid importing torch before initializing
# note: lazy import to avoid importing torch before initializing
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment