"docs/source/vscode:/vscode.git/clone" did not exist on "967ecb8064aa4058820fae430a824e164f086204"
Unverified commit 90dfe3de, authored by Kaixi Hou, committed by GitHub
Browse files

[NVIDIA] Disable chunked prefix cache when DP and Blackwell are used (#9861)

parent 9a719b7a
......@@ -525,6 +525,17 @@ class ModelRunner:
if not self.use_mla_backend:
server_args.disable_chunked_prefix_cache = True
# TODO(kaixih@nvidia): remove this once we have a better solution for DP attention.
# For more details, see: https://github.com/sgl-project/sglang/issues/8616
elif (
self.dp_size > 1
and is_sm100_supported()
and server_args.attention_backend != "triton"
):
logger.info(
"Disable chunked prefix cache when dp size > 1 and attention backend is not triton."
)
server_args.disable_chunked_prefix_cache = True
if not server_args.disable_chunked_prefix_cache:
logger.info("Chunked prefix cache is turned on.")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment