Commit a1ebc651 authored by xuwx1's avatar xuwx1
Browse files

update lightx2v

parent 5a4db490
Pipeline #3149 canceled with stages
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 50,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"CONDITION_IMAGE_SIZE": 1048576,
"USE_IMAGE_ID_IN_PROMPT": false
}
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 40,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true
}
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 40,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true,
"dit_quantized": true,
"dit_quantized_ckpt": "/path/to/qwen_2509_fp8.safetensors",
"dit_quant_scheme": "fp8-sgl"
}
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 8,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"CONDITION_IMAGE_SIZE": 1048576,
"USE_IMAGE_ID_IN_PROMPT": false,
"lora_configs": [
{
"path": "/path/to/Qwen-Image-Edit-Lightning-4steps-V1.0.safetensors",
"strength": 1.0
}
]
}
{
"batchsize": 1,
"_comment": "格式: '宽高比': [width, height]",
"aspect_ratios": {
"1:1": [
1328,
1328
],
"16:9": [
1664,
928
],
"9:16": [
928,
1664
],
"4:3": [
1472,
1140
],
"3:4": [
142,
184
]
},
"aspect_ratio": "16:9",
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 50,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 34,
"_auto_resize": false,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn3",
"cross_attn_1_type": "sage_attn3",
"cross_attn_2_type": "sage_attn3",
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 1,
"t5_cpu_offload": true,
"clip_cpu_offload": false,
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"vae_cpu_offload": false
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn3",
"cross_attn_1_type": "sage_attn3",
"cross_attn_2_type": "sage_attn3",
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 1,
"t5_cpu_offload": false,
"clip_cpu_offload": false,
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"vae_cpu_offload": false,
"dit_quantized": true,
"dit_quant_scheme": "int8-q8f",
"adapter_quantized": true,
"adapter_quant_scheme": "int8-q8f",
"t5_quantized": true,
"t5_quant_scheme": "int8-q8f"
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn3",
"cross_attn_1_type": "sage_attn3",
"cross_attn_2_type": "sage_attn3",
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 1,
"t5_cpu_offload": false,
"clip_cpu_offload": false,
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"vae_cpu_offload": false,
"dit_quantized": true,
"dit_quant_scheme": "int8-q8f",
"adapter_quantized": true,
"adapter_quant_scheme": "int8-q8f",
"t5_quantized": true,
"t5_quant_scheme": "int8-q8f",
"parallel": {
"seq_p_size": 8,
"seq_p_attn_type": "ulysses-4090"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false,
"dit_quantized": true,
"dit_quant_scheme": "int8-vllm",
"adapter_quantized": true,
"adapter_quant_scheme": "int8-vllm",
"t5_quantized": true,
"t5_quant_scheme": "int8-vllm"
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false,
"dit_quantized": true,
"dit_quant_scheme": "int8-vllm",
"adapter_quantized": true,
"adapter_quant_scheme": "int8-vllm",
"t5_quantized": true,
"t5_quant_scheme": "int8-vllm",
"parallel": {
"seq_p_size": 2,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false,
"dit_quantized": true,
"dit_quant_scheme": "int8-vllm",
"adapter_quantized": true,
"adapter_quant_scheme": "int8-vllm",
"t5_quantized": true,
"t5_quant_scheme": "int8-vllm",
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false,
"dit_quantized": true,
"dit_quant_scheme": "int8-vllm",
"adapter_quantized": true,
"adapter_quant_scheme": "int8-vllm",
"t5_quantized": true,
"t5_quant_scheme": "int8-vllm",
"parallel": {
"seq_p_size": 8,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 0.8,
"t5_cpu_offload": false,
"clip_cpu_offload": false,
"vae_cpu_offload": false,
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true,
"parallel": {
"seq_p_size": 2,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true,
"parallel": {
"seq_p_size": 2,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true,
"parallel": {
"seq_p_size": 8,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 360,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"use_31_block": false,
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"cpu_offload": false,
"t5_cpu_offload": true,
"clip_cpu_offload": true,
"vae_cpu_offload": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": true,
"parallel": {
"seq_p_size": 8,
"seq_p_attn_type": "ulysses"
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment