You need to sign in or sign up before continuing.
Unverified commit f39037ff authored by Zhiqiang Xie, committed by GitHub
Browse files

HiCache Fix (#8288)


Co-authored-by: pansicheng <sicheng.pan.chn@gmail.com>
parent ce86e201
...@@ -358,6 +358,7 @@ class HiCacheController: ...@@ -358,6 +358,7 @@ class HiCacheController:
if host_indices is None: if host_indices is None:
return None return None
self.mem_pool_host.protect_write(host_indices) self.mem_pool_host.protect_write(host_indices)
torch.cuda.current_stream().synchronize()
self.write_queue.put( self.write_queue.put(
CacheOperation(host_indices, device_indices, node_id, priority) CacheOperation(host_indices, device_indices, node_id, priority)
) )
......
...@@ -378,6 +378,7 @@ class ModelRunner: ...@@ -378,6 +378,7 @@ class ModelRunner:
is_hopper_with_cuda_12_3() is_hopper_with_cuda_12_3()
and is_no_spec_infer_or_topk_one(server_args) and is_no_spec_infer_or_topk_one(server_args)
and is_fa3_default_architecture(self.model_config.hf_config) and is_fa3_default_architecture(self.model_config.hf_config)
and (not server_args.enable_hierarchical_cache)
): ):
server_args.attention_backend = "fa3" server_args.attention_backend = "fa3"
elif _is_hip: elif _is_hip:
...@@ -390,7 +391,9 @@ class ModelRunner: ...@@ -390,7 +391,9 @@ class ModelRunner:
) )
else: else:
# MLA architecture # MLA architecture
if is_hopper_with_cuda_12_3(): if is_hopper_with_cuda_12_3() and (
not server_args.enable_hierarchical_cache
):
server_args.attention_backend = "fa3" server_args.attention_backend = "fa3"
elif is_sm100_supported(): elif is_sm100_supported():
server_args.attention_backend = "flashinfer" server_args.attention_backend = "flashinfer"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.