Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
61445453
Unverified
Commit
61445453
authored
Jul 30, 2025
by
Michael Goin
Committed by
GitHub
Jul 30, 2025
Browse files
[UX] Rename CUTLASS_MLA_VLLM_V1 to CUTLASS_MLA (#21966)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent
ec02e536
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
8 additions
and
8 deletions
+8
-8
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+5
-5
vllm/platforms/interface.py
vllm/platforms/interface.py
+1
-1
vllm/v1/attention/backends/mla/cutlass_mla.py
vllm/v1/attention/backends/mla/cutlass_mla.py
+1
-1
No files found.
vllm/engine/arg_utils.py
View file @
61445453
...
@@ -1417,7 +1417,7 @@ class EngineArgs:
...
@@ -1417,7 +1417,7 @@ class EngineArgs:
"PALLAS_VLLM_V1"
,
"PALLAS_VLLM_V1"
,
"TRITON_ATTN_VLLM_V1"
,
"TRITON_ATTN_VLLM_V1"
,
"TRITON_MLA"
,
"TRITON_MLA"
,
"CUTLASS_MLA_VLLM_V1"
,
"CUTLASS_MLA"
,
"FLASHMLA"
,
"FLASHMLA"
,
"FLASHINFER"
,
"FLASHINFER"
,
"FLASHINFER_VLLM_V1"
,
"FLASHINFER_VLLM_V1"
,
...
...
vllm/platforms/cuda.py
View file @
61445453
...
@@ -162,7 +162,7 @@ class CudaPlatformBase(Platform):
...
@@ -162,7 +162,7 @@ class CudaPlatformBase(Platform):
if
cls
.
is_device_capability
(
100
):
if
cls
.
is_device_capability
(
100
):
# Blackwell => Force CutlassMLA.
# Blackwell => Force CutlassMLA.
use_cutlass_mla
=
True
use_cutlass_mla
=
True
envs
.
VLLM_ATTENTION_BACKEND
=
"CUTLASS_MLA_VLLM_V1"
envs
.
VLLM_ATTENTION_BACKEND
=
"CUTLASS_MLA"
else
:
else
:
# Not Blackwell
# Not Blackwell
use_flashmla
=
True
use_flashmla
=
True
...
@@ -170,7 +170,7 @@ class CudaPlatformBase(Platform):
...
@@ -170,7 +170,7 @@ class CudaPlatformBase(Platform):
# Forced case
# Forced case
use_flashmla
=
(
envs
.
VLLM_ATTENTION_BACKEND
==
"FLASHMLA"
)
use_flashmla
=
(
envs
.
VLLM_ATTENTION_BACKEND
==
"FLASHMLA"
)
use_cutlass_mla
=
(
use_cutlass_mla
=
(
envs
.
VLLM_ATTENTION_BACKEND
==
"CUTLASS_MLA_VLLM_V1"
)
envs
.
VLLM_ATTENTION_BACKEND
==
"CUTLASS_MLA"
)
from
vllm.attention.ops.flashmla
import
is_flashmla_supported
from
vllm.attention.ops.flashmla
import
is_flashmla_supported
if
use_flashmla
and
is_flashmla_supported
()[
0
]
\
if
use_flashmla
and
is_flashmla_supported
()[
0
]
\
...
@@ -182,7 +182,7 @@ class CudaPlatformBase(Platform):
...
@@ -182,7 +182,7 @@ class CudaPlatformBase(Platform):
if
use_cutlass_mla
and
cache_config
.
block_size
!=
128
:
if
use_cutlass_mla
and
cache_config
.
block_size
!=
128
:
cache_config
.
block_size
=
128
cache_config
.
block_size
=
128
logger
.
info
(
"Forcing kv cache block size to 128 for "
logger
.
info
(
"Forcing kv cache block size to 128 for "
"CUTLASS_MLA_VLLM_V1 backend."
)
"CUTLASS_MLA backend."
)
compilation_config
=
vllm_config
.
compilation_config
compilation_config
=
vllm_config
.
compilation_config
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
...
@@ -211,9 +211,9 @@ class CudaPlatformBase(Platform):
...
@@ -211,9 +211,9 @@ class CudaPlatformBase(Platform):
kv_cache_dtype
,
block_size
,
use_v1
,
kv_cache_dtype
,
block_size
,
use_v1
,
use_mla
)
->
str
:
use_mla
)
->
str
:
if
use_mla
:
if
use_mla
:
# TODO(lucas): refactor to be more concise
# TODO(lucas): refactor to be more concise
# we should probably consider factoring out V1 here
# we should probably consider factoring out V1 here
if
selected_backend
==
_Backend
.
CUTLASS_MLA_VLLM_V1
:
if
selected_backend
==
_Backend
.
CUTLASS_MLA
:
if
use_v1
:
if
use_v1
:
logger
.
info_once
(
"Using Cutlass MLA backend on V1 engine."
)
logger
.
info_once
(
"Using Cutlass MLA backend on V1 engine."
)
return
(
"vllm.v1.attention.backends.mla."
return
(
"vllm.v1.attention.backends.mla."
...
...
vllm/platforms/interface.py
View file @
61445453
...
@@ -53,7 +53,7 @@ class _Backend(enum.Enum):
...
@@ -53,7 +53,7 @@ class _Backend(enum.Enum):
TRITON_MLA_VLLM_V1
=
enum
.
auto
()
TRITON_MLA_VLLM_V1
=
enum
.
auto
()
FLASHMLA_VLLM_V1
=
enum
.
auto
()
FLASHMLA_VLLM_V1
=
enum
.
auto
()
FLASHMLA
=
enum
.
auto
()
# Supported by V1
FLASHMLA
=
enum
.
auto
()
# Supported by V1
CUTLASS_MLA_VLLM_V1
=
enum
.
auto
()
CUTLASS_MLA
=
enum
.
auto
()
PALLAS
=
enum
.
auto
()
PALLAS
=
enum
.
auto
()
PALLAS_VLLM_V1
=
enum
.
auto
()
PALLAS_VLLM_V1
=
enum
.
auto
()
IPEX
=
enum
.
auto
()
IPEX
=
enum
.
auto
()
...
...
vllm/v1/attention/backends/mla/cutlass_mla.py
View file @
61445453
...
@@ -21,7 +21,7 @@ class CutlassMLABackend(MLACommonBackend):
...
@@ -21,7 +21,7 @@ class CutlassMLABackend(MLACommonBackend):
@
staticmethod
@
staticmethod
def
get_name
()
->
str
:
def
get_name
()
->
str
:
return
"CUTLASS_MLA_VLLM_V1"
return
"CUTLASS_MLA"
@
staticmethod
@
staticmethod
def
get_impl_cls
()
->
type
[
"CutlassMLAImpl"
]:
def
get_impl_cls
()
->
type
[
"CutlassMLAImpl"
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment