Unverified commit b3710d2c authored by Ke Bao, committed by GitHub

Fix attention backend (#1448)

parent c6b6d2e7
@@ -86,6 +86,14 @@ class ModelRunner:
         self.is_multimodal_model = is_multimodal_model(
             self.model_config.hf_config.architectures
         )
+        if (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and not self.server_args.disable_mla
+        ):
+            logger.info("MLA optimization is turned on. Use triton backend.")
+            self.server_args.attention_backend = "triton"
         global_server_args_dict.update(
             {
                 "attention_backend": server_args.attention_backend,
...
@@ -173,10 +173,6 @@ class ServerArgs:
         self.sampling_backend = "pytorch"

         # Default kernel backends
-        if not self.disable_mla:
-            logger.info("MLA optimization is turned on. Use triton backend.")
-            self.attention_backend = "triton"
         if self.attention_backend is None:
             self.attention_backend = "flashinfer"
...
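
For reference, a minimal, self-contained sketch of the backend selection after this change: ServerArgs only falls back to the generic flashinfer default when no backend is specified, and the MLA-specific override to triton moves into ModelRunner, where the loaded model's attention architecture is actually known. The ServerArgs, ModelConfig, and ModelRunner classes below are simplified stand-ins that mirror names from the diff, not the actual sglang implementation, and set_default_backends is a hypothetical helper added for the example.

```python
# Simplified sketch of the post-commit backend selection; not the real sglang code.
import logging
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional

logger = logging.getLogger(__name__)


class AttentionArch(Enum):
    MHA = auto()
    MLA = auto()


@dataclass
class ServerArgs:
    attention_backend: Optional[str] = None
    disable_mla: bool = False

    def set_default_backends(self) -> None:
        # After this commit, ServerArgs no longer forces triton for MLA;
        # it only picks a generic default when nothing was specified.
        if self.attention_backend is None:
            self.attention_backend = "flashinfer"


@dataclass
class ModelConfig:
    attention_arch: AttentionArch = AttentionArch.MHA


class ModelRunner:
    def __init__(self, model_config: ModelConfig, server_args: ServerArgs):
        self.model_config = model_config
        self.server_args = server_args
        # The MLA check now happens here, once the loaded model's
        # attention architecture is known.
        if (
            self.model_config.attention_arch == AttentionArch.MLA
            and not self.server_args.disable_mla
        ):
            logger.info("MLA optimization is turned on. Use triton backend.")
            self.server_args.attention_backend = "triton"


if __name__ == "__main__":
    args = ServerArgs()
    args.set_default_backends()                     # -> "flashinfer" by default
    ModelRunner(ModelConfig(AttentionArch.MLA), args)
    print(args.attention_backend)                   # -> "triton" for an MLA model
```

The effect of moving the check is that the triton override applies only when the loaded model actually uses MLA, instead of being applied unconditionally at argument-parsing time before any model is known.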