[NVIDIA] Disable chunked prefix cache when DP and Blackwell are used (#9861)

90dfe3de · Kaixi Hou · GitHub · 9a719b7a · 90dfe3de
Unverified Commit 90dfe3de authored Sep 05, 2025 by Kaixi Hou Committed by GitHub Sep 06, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 0 deletions

python/sglang/srt/model_executor/model_runner.py python/sglang/srt/model_executor/model_runner.py +11 -0

No files found.
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -525,6 +525,17 @@ class ModelRunner:

        if not self.use_mla_backend:
            server_args.disable_chunked_prefix_cache = True
+        # TODO(kaixih@nvidia): remove this once we have a better solution for DP attention.
+        #  For more details, see: https://github.com/sgl-project/sglang/issues/8616
+        elif (
+            self.dp_size > 1
+            and is_sm100_supported()
+            and server_args.attention_backend != "triton"
+        ):
+            logger.info(
+                "Disable chunked prefix cache when dp size > 1 and attention backend is not triton."
+            )
+            server_args.disable_chunked_prefix_cache = True

        if not server_args.disable_chunked_prefix_cache:
            logger.info("Chunked prefix cache is turned on.")