Unverified commit b3710d2c authored by Ke Bao, committed by GitHub

Fix attention backend (#1448)

parent c6b6d2e7
@@ -86,6 +86,14 @@ class ModelRunner:
         self.is_multimodal_model = is_multimodal_model(
             self.model_config.hf_config.architectures
         )
+        if (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and not self.server_args.disable_mla
+        ):
+            logger.info("MLA optimization is turned on. Use triton backend.")
+            self.server_args.attention_backend = "triton"
         global_server_args_dict.update(
             {
                 "attention_backend": server_args.attention_backend,
...
@@ -173,10 +173,6 @@ class ServerArgs:
         self.sampling_backend = "pytorch"

         # Default kernel backends
-        if not self.disable_mla:
-            logger.info("MLA optimization is turned on. Use triton backend.")
-            self.attention_backend = "triton"
         if self.attention_backend is None:
             self.attention_backend = "flashinfer"
...
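
For reference, a minimal, self-contained sketch of the backend selection after this change: ServerArgs only falls back to the generic flashinfer default when no backend is specified, and the MLA-specific override to triton moves into ModelRunner, where the loaded model's attention architecture is actually known. The ServerArgs, ModelConfig, and ModelRunner classes below are simplified stand-ins that mirror names from the diff, not the actual sglang implementation, and set_default_backends is a hypothetical helper added for the example.

```python
# Simplified sketch of the post-commit backend selection; not the real sglang code.
import logging
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional

logger = logging.getLogger(__name__)


class AttentionArch(Enum):
    MHA = auto()
    MLA = auto()


@dataclass
class ServerArgs:
    attention_backend: Optional[str] = None
    disable_mla: bool = False

    def set_default_backends(self) -> None:
        # After this commit, ServerArgs no longer forces triton for MLA;
        # it only picks a generic default when nothing was specified.
        if self.attention_backend is None:
            self.attention_backend = "flashinfer"


@dataclass
class ModelConfig:
    attention_arch: AttentionArch = AttentionArch.MHA


class ModelRunner:
    def __init__(self, model_config: ModelConfig, server_args: ServerArgs):
        self.model_config = model_config
        self.server_args = server_args
        # The MLA check now happens here, once the loaded model's
        # attention architecture is known.
        if (
            self.model_config.attention_arch == AttentionArch.MLA
            and not self.server_args.disable_mla
        ):
            logger.info("MLA optimization is turned on. Use triton backend.")
            self.server_args.attention_backend = "triton"


if __name__ == "__main__":
    args = ServerArgs()
    args.set_default_backends()                     # -> "flashinfer" by default
    ModelRunner(ModelConfig(AttentionArch.MLA), args)
    print(args.attention_backend)                   # -> "triton" for an MLA model
```

The effect of moving the check is that the triton override applies only when the loaded model actually uses MLA, instead of being applied unconditionally at argument-parsing time before any model is known.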