Commit 1693e754 authored by yiqa

Adapt the high-throughput mode using grouped GEMM

parent ce363e89
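
The commit message refers to grouped GEMM for the high-throughput MoE path. As a point of reference only, the plain-PyTorch loop below shows the semantics that a fused grouped-GEMM launch replaces: tokens are sorted by expert, and each expert multiplies only its contiguous slice. This is an illustrative sketch, not this repository's kernel; `grouped_gemm_reference` and its argument names are assumptions.

import torch

def grouped_gemm_reference(
    x: torch.Tensor,           # [num_tokens, hidden], tokens pre-sorted by expert
    group_sizes: torch.Tensor, # [num_experts], token count routed to each expert
    w: torch.Tensor,           # [num_experts, hidden, inter], per-expert weights
) -> torch.Tensor:
    # A fused grouped-GEMM kernel performs all of these slice-matmuls in a
    # single launch; this loop only demonstrates the expected output.
    out = x.new_empty(x.shape[0], w.shape[-1])
    start = 0
    for e, n in enumerate(group_sizes.tolist()):
        if n:  # skip experts that received no tokens
            out[start:start + n] = x[start:start + n] @ w[e]
        start += n
    return out

# Example: 4 tokens routed to 2 experts (3 to expert 0, 1 to expert 1).
x = torch.randn(4, 8)
w = torch.randn(2, 8, 16)
y = grouped_gemm_reference(x, torch.tensor([3, 1]), w)
print(y.shape)  # torch.Size([4, 16])
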
@@ -73,7 +73,7 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
         prefix: str,
     ) -> Optional["QuantizeMethodBase"]:
         from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE  # Avoid circular import
-        # from sglang.srt.layers.radix_attention import RadixAttention
+        from sglang.srt.layers.radix_attention import RadixAttention
         # Check if the layer is skipped for quantization.
         if should_ignore_layer(prefix,
                                ignore=self.ignore,
@@ -85,8 +85,8 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
                 return UnquantizedEmbeddingMethod()  # UnquantizedLinearMethod()
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
-        # if isinstance(layer, RadixAttention):
-        #     return CompressedTensorsKVCacheMethod(self)
+        if isinstance(layer, RadixAttention):
+            return CompressedTensorsKVCacheMethod(self)
         if isinstance(layer, FusedMoE):
             return CompressedTensorsMarlinMoEMethod.get_moe_method(self, layer)
         return None
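
The second hunk re-enables the attention branch of the dispatch, so RadixAttention layers now resolve to CompressedTensorsKVCacheMethod instead of falling through to None. A self-contained sketch of that isinstance-based dispatch order follows; the stub classes and `resolve_quant_method` are stand-ins for the real sglang types, not part of this repository.

# Stub types standing in for sglang's RadixAttention / FusedMoE; the actual
# dispatch lives in SlimQuantCompressedTensorsMarlinConfig.get_quant_method.
class RadixAttention: ...
class FusedMoE: ...

def resolve_quant_method(layer):
    # After this commit the attention check runs before the MoE check,
    # so KV-cache quantization applies to every RadixAttention layer.
    if isinstance(layer, RadixAttention):
        return "CompressedTensorsKVCacheMethod"
    if isinstance(layer, FusedMoE):
        return "CompressedTensorsMarlinMoEMethod"
    return None

assert resolve_quant_method(RadixAttention()) == "CompressedTensorsKVCacheMethod"
assert resolve_quant_method(object()) is None  # unmatched layers stay unquantized
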