Unverified commit ad26f298, authored by Chi-Chih Chang and committed by GitHub
Browse files

fix double sparsity initialization (#6905)

parent 8d114f25
......@@ -341,6 +341,14 @@ class ModelRunner:
if server_args.enable_lora:
self.init_lora_manager()
# Init Double Sparsity
if server_args.enable_double_sparsity:
if server_args.ds_heavy_channel_type is None:
raise ValueError(
"Please specify the heavy channel type for double sparsity optimization."
)
self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
# Init memory pool and attention backends
self.init_memory_pool(
min_per_gpu_memory,
......@@ -506,11 +514,6 @@ class ModelRunner:
)
server_args.attention_backend = "triton"
server_args.disable_cuda_graph = True
if server_args.ds_heavy_channel_type is None:
raise ValueError(
"Please specify the heavy channel type for double sparsity optimization."
)
self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
if self.is_multimodal:
if not self.is_multimodal_chunked_prefill_supported:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment