"tests/L0/vscode:/vscode.git/clone" did not exist on "cc5f83b5cda1fd17bf828097e993e47d63a55a4b"
Commit 6c18f54c authored by helloyongyang's avatar helloyongyang
Browse files

Fix a bug and add a quantization kernel

parent 3aa95081
...@@ -4,7 +4,7 @@ except ImportError: ...@@ -4,7 +4,7 @@ except ImportError:
flash_attn_varlen_func = None flash_attn_varlen_func = None
def flash_attn2(q, k, v, cu_seqlens_q=None, cu_seqlens_kv=None, max_seqlen_q=None, max_seqlen_kv=None): def flash_attn2(q, k, v, cu_seqlens_q=None, cu_seqlens_kv=None, max_seqlen_q=None, max_seqlen_kv=None, model_cls=None):
x = flash_attn_varlen_func( x = flash_attn_varlen_func(
q, q,
k, k,
......
...@@ -4,7 +4,7 @@ except ImportError: ...@@ -4,7 +4,7 @@ except ImportError:
flash_attn_varlen_func_v3 = None flash_attn_varlen_func_v3 = None
def flash_attn3(q, k, v, cu_seqlens_q=None, cu_seqlens_kv=None, max_seqlen_q=None, max_seqlen_kv=None): def flash_attn3(q, k, v, cu_seqlens_q=None, cu_seqlens_kv=None, max_seqlen_q=None, max_seqlen_kv=None, model_cls=None):
x = flash_attn_varlen_func_v3( x = flash_attn_varlen_func_v3(
q, q,
k, k,
......
...@@ -371,6 +371,29 @@ class MMWeightWfp8channelAfp8channeldynamicVllmActSgl(MMWeightQuantTemplate): ...@@ -371,6 +371,29 @@ class MMWeightWfp8channelAfp8channeldynamicVllmActSgl(MMWeightQuantTemplate):
return output_tensor return output_tensor
@MM_WEIGHT_REGISTER("W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl-ActVllm")
class MMWeightWfp8channelAfp8channeldynamicSglActVllm(MMWeightQuantTemplate):
    """
    Name: W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl-ActVllm
    Quant MM:
        Weight: fp8 perchannel sym
        Act: fp8 perchannel dynamic sym
        Kernel: quant-mm using Sgl-kernel, act dynamic quant using vllm
    """

    def __init__(self, weight_name, bias_name):
        # Wire up the template hooks: fp8 per-channel weight loading,
        # vllm-backed dynamic per-channel activation quantization.
        super().__init__(weight_name, bias_name)
        self.weight_need_transpose = True
        self.load_func = self.load_fp8_perchannel_sym
        self.act_quant_func = self.act_quant_fp8_perchannel_sym_vllm

    def apply(self, input_tensor):
        # Dynamically quantize the activation, then run the fp8 scaled GEMM
        # with a bf16 output (bias fused into the kernel call).
        quantized_act, act_scale = self.act_quant_func(input_tensor)
        return sgl_kernel.fp8_scaled_mm(
            quantized_act,
            self.weight,
            act_scale,
            self.weight_scale,
            torch.bfloat16,
            bias=self.bias,
        )
@MM_WEIGHT_REGISTER("W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl") @MM_WEIGHT_REGISTER("W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl")
class MMWeightWfp8channelAfp8channeldynamicSgl(MMWeightQuantTemplate): class MMWeightWfp8channelAfp8channeldynamicSgl(MMWeightQuantTemplate):
""" """
...@@ -395,7 +418,7 @@ class MMWeightWfp8channelAfp8channeldynamicSgl(MMWeightQuantTemplate): ...@@ -395,7 +418,7 @@ class MMWeightWfp8channelAfp8channeldynamicSgl(MMWeightQuantTemplate):
@MM_WEIGHT_REGISTER("W-int8-channel-sym-A-int8-channel-sym-dynamic-Sgl-ActVllm") @MM_WEIGHT_REGISTER("W-int8-channel-sym-A-int8-channel-sym-dynamic-Sgl-ActVllm")
class MMWeightWint8channelAint8channeldynamicActVllm(MMWeightQuantTemplate): class MMWeightWint8channelAint8channeldynamicSglActVllm(MMWeightQuantTemplate):
""" """
Name: W-int8-channel-sym-A-int8-channel-sym-dynamic-Sgl-ActVllm Name: W-int8-channel-sym-A-int8-channel-sym-dynamic-Sgl-ActVllm
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment