Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf8a613a
Unverified
Commit
cf8a613a
authored
Apr 23, 2026
by
Xin Yang
Committed by
GitHub
Apr 23, 2026
Browse files
Support only half types for concat_mla_q kernel (#37892)
Signed-off-by:
Xin Yang
<
xyangx@amazon.com
>
parent
01acf96c
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
1 deletion
+4
-1
csrc/cache_kernels.cu
csrc/cache_kernels.cu
+4
-1
No files found.
csrc/cache_kernels.cu
View file @
cf8a613a
...
...
@@ -1490,6 +1490,9 @@ void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
TORCH_CHECK
(
ql_nope
.
stride
(
2
)
==
1
,
"ql_nope must have stride 1 in dim 2"
);
TORCH_CHECK
(
q_pe
.
stride
(
2
)
==
1
,
"q_pe must have stride 1 in dim 2"
);
TORCH_CHECK
(
q_out
.
stride
(
2
)
==
1
,
"q_out must have stride 1 in dim 2"
);
TORCH_CHECK
(
ql_nope
.
scalar_type
()
==
at
::
ScalarType
::
Half
||
ql_nope
.
scalar_type
()
==
at
::
ScalarType
::
BFloat16
,
"ql_nope must be float16 or bfloat16 dtype"
);
if
(
num_tokens
==
0
)
return
;
...
...
@@ -1501,7 +1504,7 @@ void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
ql_nope
));
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
VLLM_DISPATCH_
FLOATING
_TYPES
(
ql_nope
.
scalar_type
(),
"concat_mla_q"
,
[
&
]
{
VLLM_DISPATCH_
HALF
_TYPES
(
ql_nope
.
scalar_type
(),
"concat_mla_q"
,
[
&
]
{
vllm
::
ConcatMLAQKernel
<
scalar_t
,
512
><<<
grid_size
,
block_size
,
0
,
stream
>>>
(
q_out
.
data_ptr
<
scalar_t
>
(),
ql_nope
.
data_ptr
<
scalar_t
>
(),
q_pe
.
data_ptr
<
scalar_t
>
(),
num_tokens
,
num_heads
,
q_out
.
stride
(
0
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment