update 5090 int8 config (#449)

e39d8438 · gushiqiao · GitHub · fe9aa39a · e39d8438 · e39d8438
Unverified commit e39d8438, authored Nov 07, 2025 by gushiqiao; committed by GitHub on Nov 07, 2025.
7 changed files
--- a/configs/seko_talk/5090/seko_talk_5090_bf16.json
+++ b/configs/seko_talk/5090/seko_talk_5090_bf16.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 5,
+    "video_duration": 360,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "adaptive",

--- a/configs/seko_talk/5090/seko_talk_5090_int8.json
+++ b/configs/seko_talk/5090/seko_talk_5090_int8.json
+{
+    "infer_steps": 4,
+    "target_fps": 16,
+    "video_duration": 360,
+    "audio_sr": 16000,
+    "target_video_length": 81,
+    "resize_mode": "adaptive",
+    "self_attn_1_type": "sage_attn3",
+    "cross_attn_1_type": "sage_attn3",
+    "cross_attn_2_type": "sage_attn3",
+    "sample_guide_scale": 1,
+    "sample_shift": 5,
+    "enable_cfg": false,
+    "use_31_block": false,
+    "cpu_offload": true,
+    "offload_granularity": "block",
+    "offload_ratio": 1,
+    "t5_cpu_offload": false,
+    "clip_cpu_offload": false,
+    "audio_encoder_cpu_offload": false,
+    "audio_adapter_cpu_offload": false,
+    "vae_cpu_offload": false,
+    "dit_quantized": true,
+    "dit_quant_scheme": "int8-q8f",
+    "adapter_quantized": true,
+    "adapter_quant_scheme": "int8-q8f",
+    "t5_quantized": true,
+    "t5_quant_scheme": "int8-q8f"
+}
--- a/configs/seko_talk/5090/seko_talk_5090_int8_8gpu.json
+++ b/configs/seko_talk/5090/seko_talk_5090_int8_8gpu.json
+
+{
+    "infer_steps": 4,
+    "target_fps": 16,
+    "video_duration": 360,
+    "audio_sr": 16000,
+    "target_video_length": 81,
+    "resize_mode": "adaptive",
+    "self_attn_1_type": "sage_attn3",
+    "cross_attn_1_type": "sage_attn3",
+    "cross_attn_2_type": "sage_attn3",
+    "sample_guide_scale": 1,
+    "sample_shift": 5,
+    "enable_cfg": false,
+    "use_31_block": false,
+    "cpu_offload": true,
+    "offload_granularity": "block",
+    "offload_ratio": 1,
+    "t5_cpu_offload": false,
+    "clip_cpu_offload": false,
+    "audio_encoder_cpu_offload": false,
+    "audio_adapter_cpu_offload": false,
+    "vae_cpu_offload": false,
+    "dit_quantized": true,
+    "dit_quant_scheme": "int8-q8f",
+    "adapter_quantized": true,
+    "adapter_quant_scheme": "int8-q8f",
+    "t5_quantized": true,
+    "t5_quant_scheme": "int8-q8f",
+    "parallel": {
+        "seq_p_size": 8,
+        "seq_p_attn_type": "ulysses"
+    }
+}
--- a/lightx2v/common/ops/mm/mm_weight.py
+++ b/lightx2v/common/ops/mm/mm_weight.py
@@ -35,9 +35,14 @@ except ImportError:
    sgl_kernel = None

 try:
-    import q8_kernels.functional as Q8F
+    from q8_kernels.functional.linear import q8_linear
 except ImportError:
-    Q8F = None
+    q8_linear = None
+
+try:
+    from q8_kernels.functional.linear import fp8_linear
+except ImportError:
+    fp8_linear = None

 try:
    import deep_gemm
@@ -820,7 +825,7 @@ class MMWeightWfp8channelAfp8channeldynamicQ8F(MMWeightQuantTemplate):

    def apply(self, input_tensor):
        input_tensor_quant, input_tensor_scale = self.act_quant_func(input_tensor)
-        output_tensor = Q8F.linear.fp8_linear(
+        output_tensor = fp8_linear(
            input_tensor_quant,
            self.weight,
            self.bias.float() if self.bias is not None else None,
@@ -850,7 +855,7 @@ class MMWeightWint8channelAint8channeldynamicQ8F(MMWeightQuantTemplate):

    def apply(self, input_tensor):
        input_tensor_quant, input_tensor_scale = self.act_quant_func(input_tensor)
-        output_tensor = Q8F.linear.q8_linear(
+        output_tensor = q8_linear(
            input_tensor_quant,
            self.weight,
            self.bias.float() if self.bias is not None else None,

--- a/lightx2v/models/input_encoders/hf/q_linear.py
+++ b/lightx2v/models/input_encoders/hf/q_linear.py
@@ -17,9 +17,14 @@ except ModuleNotFoundError:
    quant_int8_per_token_matmul, quantize_activation_per_token_absmax = None, None

 try:
-    import q8_kernels.functional as Q8F
+    from q8_kernels.functional.linear import q8_linear
 except ImportError:
-    Q8F = None
+    q8_linear = None
+
+try:
+    from q8_kernels.functional.linear import fp8_linear
+except ImportError:
+    fp8_linear = None


 class VllmQuantLinearInt8(nn.Module):
@@ -236,7 +241,7 @@ class Q8FQuantLinearInt8(nn.Module):

    def forward(self, x):
        input_tensor_quant, input_tensor_scale = self.act_quant_func(x)
-        output_tensor = Q8F.linear.q8_linear(
+        output_tensor = q8_linear(
            input_tensor_quant,
            self.weight,
            self.bias if self.bias is not None else None,
@@ -282,7 +287,7 @@ class Q8FQuantLinearFp8(nn.Module):

    def forward(self, x):
        input_tensor_quant, input_tensor_scale = self.act_quant_func(x)
-        output_tensor = Q8F.linear.fp8_linear(
+        output_tensor = fp8_linear(
            input_tensor_quant,
            self.weight,
            self.bias if self.bias is not None else None,

--- a/scripts/seko_talk/run_seko_talk_21_5090_int8.sh
+++ b/scripts/seko_talk/run_seko_talk_21_5090_int8.sh
+#!/bin/bash
+
+lightx2v_path=/path/to/Lightx2v
+model_path=/path/to/SekoTalk-Distill-int8
+
+
+export CUDA_VISIBLE_DEVICES=0
+
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+export SENSITIVE_LAYER_DTYPE=None
+
+python -m lightx2v.infer \
+--model_cls seko_talk \
+--task s2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/seko_talk/5090/seko_talk_5090_int8.json \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
+--negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
+--save_result_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_21_5090_int8_dist.sh
+++ b/scripts/seko_talk/run_seko_talk_21_5090_int8_dist.sh
+#!/bin/bash
+
+lightx2v_path=/path/to/Lightx2v
+model_path=/path/to/SekoTalk-Distill-int8
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+export SENSITIVE_LAYER_DTYPE=None
+
+torchrun --nproc-per-node 8 -m lightx2v.infer \
+--model_cls seko_talk \
+--task s2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/seko_talk/5090/seko_talk_5090_int8_8gpu.json \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
+--negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
+--save_result_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4