Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
651e756b
Commit
651e756b
authored
Nov 07, 2025
by
zhuwenwen
Browse files
the prefix cache interface implemented using fa on kme
parent
dc54fefe
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
1 addition
and
7 deletions
+1
-7
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-6
vllm/utils/__init__.py
vllm/utils/__init__.py
+0
-1
No files found.
vllm/platforms/rocm.py
View file @
651e756b
...
@@ -16,15 +16,12 @@ from vllm.utils import cuda_device_count_stateless
...
@@ -16,15 +16,12 @@ from vllm.utils import cuda_device_count_stateless
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
vllm.utils
import
is_kme
,
SUPPORT_TC
from
vllm.utils
import
SUPPORT_TC
if
not
SUPPORT_TC
:
if
not
SUPPORT_TC
:
os
.
environ
[
'VLLM_USE_V1'
]
=
'0'
os
.
environ
[
'VLLM_USE_V1'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_MLA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_MLA'
]
=
'0'
if
is_kme
:
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
VllmConfig
...
@@ -299,8 +296,6 @@ class RocmPlatform(Platform):
...
@@ -299,8 +296,6 @@ class RocmPlatform(Platform):
logger
.
info
(
"flash_attn is not supported on NAVI GPUs."
)
logger
.
info
(
"flash_attn is not supported on NAVI GPUs."
)
else
:
else
:
logger
.
info
(
"%s is not supported in AMD GPUs."
,
selected_backend
)
logger
.
info
(
"%s is not supported in AMD GPUs."
,
selected_backend
)
if
is_kme
:
os
.
environ
[
'VLLM_USE_TRITON_FLASH_ATTN'
]
=
'1'
logger
.
info
(
"Using ROCmFlashAttention backend."
)
logger
.
info
(
"Using ROCmFlashAttention backend."
)
return
"vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"
# noqa: E501
return
"vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"
# noqa: E501
...
...
vllm/utils/__init__.py
View file @
651e756b
...
@@ -85,7 +85,6 @@ POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
...
@@ -85,7 +85,6 @@ POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
=
5120
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
=
5120
GPU_ARCH
=
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
GPU_ARCH
=
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
is_kme
=
any
(
arch
in
GPU_ARCH
for
arch
in
[
"gfx928"
])
SUPPORT_TC
=
any
(
arch
in
GPU_ARCH
for
arch
in
[
"gfx928"
,
"gfx936"
])
SUPPORT_TC
=
any
(
arch
in
GPU_ARCH
for
arch
in
[
"gfx928"
,
"gfx936"
])
def
_generate_random_int8
(
def
_generate_random_int8
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment