Unverified Commit 81db702e authored by Jiangyun Zhu, committed by GitHub
Browse files

[Attention] add `_cudagraph_support` for linear attention (#28934)


Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
parent 92effb07
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import ( from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder, AttentionMetadataBuilder,
CommonAttentionMetadata, CommonAttentionMetadata,
split_decodes_and_prefills, split_decodes_and_prefills,
...@@ -35,6 +36,8 @@ class LinearAttentionMetadata: ...@@ -35,6 +36,8 @@ class LinearAttentionMetadata:
class LinearAttentionMetadataBuilder(AttentionMetadataBuilder[LinearAttentionMetadata]): class LinearAttentionMetadataBuilder(AttentionMetadataBuilder[LinearAttentionMetadata]):
reorder_batch_threshold: int = 1 reorder_batch_threshold: int = 1
_cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
def __init__( def __init__(
self, self,
kv_cache_spec: AttentionSpec, kv_cache_spec: AttentionSpec,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment