You need to sign in or sign up before continuing.
Unverified commit f39037ff authored by Zhiqiang Xie, committed by GitHub
Browse files

HiCache Fix (#8288)


Co-authored-by: pansicheng <sicheng.pan.chn@gmail.com>
parent ce86e201
...@@ -358,6 +358,7 @@ class HiCacheController: ...@@ -358,6 +358,7 @@ class HiCacheController:
if host_indices is None: if host_indices is None:
return None return None
self.mem_pool_host.protect_write(host_indices) self.mem_pool_host.protect_write(host_indices)
torch.cuda.current_stream().synchronize()
self.write_queue.put( self.write_queue.put(
CacheOperation(host_indices, device_indices, node_id, priority) CacheOperation(host_indices, device_indices, node_id, priority)
) )
......
...@@ -378,6 +378,7 @@ class ModelRunner: ...@@ -378,6 +378,7 @@ class ModelRunner:
is_hopper_with_cuda_12_3() is_hopper_with_cuda_12_3()
and is_no_spec_infer_or_topk_one(server_args) and is_no_spec_infer_or_topk_one(server_args)
and is_fa3_default_architecture(self.model_config.hf_config) and is_fa3_default_architecture(self.model_config.hf_config)
and (not server_args.enable_hierarchical_cache)
): ):
server_args.attention_backend = "fa3" server_args.attention_backend = "fa3"
elif _is_hip: elif _is_hip:
...@@ -390,7 +391,9 @@ class ModelRunner: ...@@ -390,7 +391,9 @@ class ModelRunner:
) )
else: else:
# MLA architecture # MLA architecture
if is_hopper_with_cuda_12_3(): if is_hopper_with_cuda_12_3() and (
not server_args.enable_hierarchical_cache
):
server_args.attention_backend = "fa3" server_args.attention_backend = "fa3"
elif is_sm100_supported(): elif is_sm100_supported():
server_args.attention_backend = "flashinfer" server_args.attention_backend = "flashinfer"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.