"examples/vscode:/vscode.git/clone" did not exist on "2b04ec2ff7270d2044410378b04d85a194fa3d4a"
Unverified commit 90dfe3de, authored by Kaixi Hou, committed by GitHub
Browse files

[NVIDIA] disable chunked prefix cache when dp and blackwell is used (#9861)

parent 9a719b7a
...@@ -525,6 +525,17 @@ class ModelRunner:
if not self.use_mla_backend: if not self.use_mla_backend:
server_args.disable_chunked_prefix_cache = True server_args.disable_chunked_prefix_cache = True
# TODO(kaixih@nvidia): remove this once we have a better solution for DP attention.
# For more details, see: https://github.com/sgl-project/sglang/issues/8616
elif (
self.dp_size > 1
and is_sm100_supported()
and server_args.attention_backend != "triton"
):
logger.info(
"Disable chunked prefix cache when dp size > 1 and attention backend is not triton."
)
server_args.disable_chunked_prefix_cache = True
if not server_args.disable_chunked_prefix_cache: if not server_args.disable_chunked_prefix_cache:
logger.info("Chunked prefix cache is turned on.") logger.info("Chunked prefix cache is turned on.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment