Commit 1693e754 authored by yiqa

Use grouped GEMM to complete the high-throughput mode adaptation

parent ce363e89
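
The commit message refers to grouped GEMM for the high-throughput path. As a rough illustration of the idea only (this is not the code changed in this commit, and all names below are hypothetical), a grouped GEMM replaces one small GEMM launch per MoE expert with a single pass over expert-contiguous token groups; a minimal PyTorch sketch:

```python
import torch


def naive_moe_gemm(x, expert_ids, expert_weights):
    """One GEMM per expert: simple, but launches many small matmuls."""
    out = torch.empty(x.shape[0], expert_weights.shape[-1], dtype=x.dtype)
    for e in range(expert_weights.shape[0]):
        mask = expert_ids == e
        if mask.any():
            out[mask] = x[mask] @ expert_weights[e]
    return out


def grouped_moe_gemm(x, expert_ids, expert_weights):
    """Grouped-GEMM idea: sort tokens by expert so each expert's rows are
    contiguous, then run the per-expert matmuls over contiguous slices
    (a real grouped kernel fuses these slices into a single launch)."""
    order = torch.argsort(expert_ids)
    x_sorted = x[order]
    counts = torch.bincount(expert_ids, minlength=expert_weights.shape[0])
    out_sorted = torch.empty(x.shape[0], expert_weights.shape[-1], dtype=x.dtype)
    start = 0
    for e, n in enumerate(counts.tolist()):
        if n:
            out_sorted[start:start + n] = x_sorted[start:start + n] @ expert_weights[e]
        start += n
    # Scatter results back to the original token order.
    out = torch.empty_like(out_sorted)
    out[order] = out_sorted
    return out


# Toy check that the two paths agree (shapes are arbitrary placeholders).
tokens, hidden, inter, n_experts = 8, 16, 32, 4
x = torch.randn(tokens, hidden)
w = torch.randn(n_experts, hidden, inter)
ids = torch.randint(0, n_experts, (tokens,))
assert torch.allclose(naive_moe_gemm(x, ids, w), grouped_moe_gemm(x, ids, w), atol=1e-5)
```
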
@@ -42,7 +42,7 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
         sparsity_ignore_list: list[str],
         kv_cache_scheme: Optional[dict[str, Any]] = None,
         config: Optional[dict[str, Any]] = None,
-        packed_modules_mapping: Optional[dict[str, list[str]]] = None,
+        packed_modules_mapping: Optional[dict[str, list[str]]] = None,
     ):
         super().__init__(
             target_scheme_map,
@@ -52,10 +52,10 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
             sparsity_ignore_list,
             kv_cache_scheme,
             config,
-            packed_modules_mapping,
+            packed_modules_mapping,
         )

     @classmethod
     def override_quantization_method(
             cls, hf_quant_cfg, user_quant) -> Optional[str]:
@@ -73,7 +73,7 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
         prefix: str,
     ) -> Optional["QuantizeMethodBase"]:
         from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE  # Avoid circular import
-        # from sglang.srt.layers.radix_attention import RadixAttention
+        from sglang.srt.layers.radix_attention import RadixAttention
         # Check if the layer is skipped for quantization.
         if should_ignore_layer(prefix,
                                ignore=self.ignore,
@@ -85,8 +85,8 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
                 return UnquantizedEmbeddingMethod()  # UnquantizedLinearMethod()
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
-        # if isinstance(layer, RadixAttention):
-        #     return CompressedTensorsKVCacheMethod(self)
+        if isinstance(layer, RadixAttention):
+            return CompressedTensorsKVCacheMethod(self)
         if isinstance(layer, FusedMoE):
             return CompressedTensorsMarlinMoEMethod.get_moe_method(self, layer)
         return None