Commit e0d23621 authored by gushiqiao

Update benchmark

parent 8f0d4f4d
@@ -13,5 +13,6 @@
"cpu_offload": false,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl-ActVllm"
-}
+},
+"use_tiling_vae": true
}
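The `use_tiling_vae` flag added above enables tiled VAE decoding, which bounds peak VAE memory by decoding the latent in overlapping spatial tiles and blending the overlaps. Below is a minimal sketch of the idea (illustrative only; `vae_decode`, the tile sizes, and the 8x spatial upscale factor are assumptions, not LightX2V's actual API):

```python
import torch

def tiled_vae_decode(vae_decode, latents, tile=32, overlap=4, scale=8):
    """Decode latents in overlapping spatial tiles to bound VAE memory.

    vae_decode: callable mapping a latent tile (B, C, T, h, w) -> pixels at
    `scale`x spatial resolution. Overlapping regions are simply averaged here;
    real implementations typically feather the seams instead.
    """
    B, C, T, H, W = latents.shape
    out = torch.zeros(B, 3, T, H * scale, W * scale, device=latents.device)
    weight = torch.zeros_like(out)
    step = tile - overlap
    for y in range(0, H, step):
        for x in range(0, W, step):
            y1, x1 = min(y + tile, H), min(x + tile, W)
            pixels = vae_decode(latents[:, :, :, y:y1, x:x1])
            out[:, :, :, y * scale:y1 * scale, x * scale:x1 * scale] += pixels
            weight[:, :, :, y * scale:y1 * scale, x * scale:x1 * scale] += 1
    return out / weight.clamp(min=1)  # average the overlapped regions
```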
{
"infer_steps": 40,
"infer_steps": 4,
"target_video_length": 81,
"target_height": 480, // 720
"target_width": 832, // 1280
@@ -9,9 +9,11 @@
"seed": 42, //1234
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"enable_cfg": false,
"cpu_offload": false,
"denoising_step_list": [1000, 750, 500, 250],
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl-ActVllm"
-}
+},
+"use_tiling_vae": true
}
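For orientation: `denoising_step_list: [1000, 750, 500, 250]` is what the 4-step distilled model consumes. The sampler visits only those four timesteps, and with `enable_cfg: false` each step is a single forward pass with no unconditional branch. A minimal sketch of such a loop (illustrative; `model` and `scheduler.step` are assumed interfaces, not LightX2V's):

```python
import torch

@torch.no_grad()
def distilled_sample(model, scheduler, latents, cond,
                     step_list=(1000, 750, 500, 250)):
    """Minimal 4-step distilled sampling loop.

    One forward pass per timestep in `step_list`; with enable_cfg=False there
    is no second, unconditional pass, so each step costs a single DiT call.
    """
    for t in step_list:
        timestep = torch.full((latents.shape[0],), t, device=latents.device)
        noise_pred = model(latents, timestep, cond)       # single pass, no CFG
        latents = scheduler.step(noise_pred, t, latents)  # advance the latent
    return latents
```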
@@ -24,5 +24,6 @@
// [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
// ],
"use_ret_steps": false,
"teacache_thresh": 0.2
"teacache_thresh": 0.2,
"use_tiling_vae": true
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480, // 720
"target_width": 832, // 1280
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 0.8, //1
"t5_cpu_offload": true,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
},
"use_tiling_vae": true
}
{
"infer_steps": 4,
"target_video_length": 81,
"target_height": 480, // 720
"target_width": 832, // 1280
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 0.8, //1
"t5_cpu_offload": true,
"denoising_step_list": [1000, 750, 500, 250],
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
},
"use_tiling_vae": true
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480, // 720
"target_width": 832, // 1280
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 0.8, //1
"t5_cpu_offload": true,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
},
"use_tiling_vae": true
}
{
"infer_steps": 4,
"target_video_length": 81,
"target_height": 480, // 720
"target_width": 832, // 1280
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio": 0.8, //1
"t5_cpu_offload": true,
"denoising_step_list": [1000, 750, 500, 250],
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
},
"use_tiling_vae": true
}
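A note on the `mm_type` strings used throughout these configs: the naming appears to encode the matmul quantization scheme and kernel backend, read as weights (`W-fp8-channel-sym`: per-channel symmetric FP8) times activations (`A-fp8-channel-sym-dynamic`: per-channel symmetric FP8, quantized dynamically at runtime), with the suffix selecting kernels (`Sgl-ActVllm` plausibly sgl-kernel with vLLM activation quantization; `Q8F` plausibly the q8-kernels package listed in the RTX 4090 environment below). This reading is inferred from the names and the environment tables, not stated anywhere in the diff.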
-# Benchmark
+# 🚀 Benchmark
> This document showcases the performance test results of LightX2V across different hardware environments, including detailed comparison data for H200 and RTX 4090 platforms.
---
-## H200 (~140GB VRAM)
+## 🖥️ H200 Environment (~140GB VRAM)
+### 📋 Software Environment Configuration
-**Software Environment:**
-- **Python**: 3.11
-- **PyTorch**: 2.7.1+cu128
-- **SageAttention**: 2.2.0
-- **vLLM**: 0.9.2
-- **sgl-kernel**: 0.1.8
+| Component | Version |
+|:----------|:--------|
+| **Python** | 3.11 |
+| **PyTorch** | 2.7.1+cu128 |
+| **SageAttention** | 2.2.0 |
+| **vLLM** | 0.9.2 |
+| **sgl-kernel** | 0.1.8 |
-### 480P 5s Video
+---
+### 🎬 480P 5s Video Test
**Test Configuration:**
- **Model**: [Wan2.1-I2V-14B-480P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v)
-- **Parameters**: infer_steps=40, seed=42, enable_cfg=True
+- **Parameters**: `infer_steps=40`, `seed=42`, `enable_cfg=True`
-#### Performance Comparison
+#### 📊 Performance Comparison Table
| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
|:-------------|:-----------------:|:--------------:|:-------:|:------------:|
@@ -29,13 +36,15 @@
| **LightX2V_3-Distill** | 14 | 35 | **🏆 20.85x** | <video src="https://github.com/user-attachments/assets/b4dc403c-919d-4ba1-b29f-ef53640c0334" width="200px"></video> |
| **LightX2V_4** | 107 | 35 | **3.41x** | <video src="https://github.com/user-attachments/assets/49cd2760-4be2-432c-bf4e-01af9a1303dd" width="200px"></video> |
-### 720P 5s Video
+---
+### 🎬 720P 5s Video Test
**Test Configuration:**
- **Model**: [Wan2.1-I2V-14B-720P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v)
-- **Parameters**: infer_steps=40, seed=1234, enable_cfg=True
+- **Parameters**: `infer_steps=40`, `seed=1234`, `enable_cfg=True`
-#### Performance Comparison
+#### 📊 Performance Comparison Table
| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
|:-------------|:-----------------:|:--------------:|:-------:|:------------:|
@@ -49,27 +58,92 @@
---
-## RTX 4090 (~24GB VRAM)
+## 🖥️ RTX 4090 Environment (~24GB VRAM)
+### 📋 Software Environment Configuration
+| Component | Version |
+|:----------|:--------|
+| **Python** | 3.9.16 |
+| **PyTorch** | 2.5.1+cu124 |
+| **SageAttention** | 2.1.0 |
+| **vLLM** | 0.6.6 |
+| **sgl-kernel** | 0.0.5 |
+| **q8-kernels** | 0.0.0 |
+---
+### 🎬 480P 5s Video Test
+**Test Configuration:**
+- **Model**: [Wan2.1-I2V-14B-480P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v)
+- **Parameters**: `infer_steps=40`, `seed=42`, `enable_cfg=True`
+#### 📊 Performance Comparison Table
+| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
+|:-------------|:-----------------:|:--------------:|:-------:|:------------:|
+| **Wan2GP(profile=3)** | 779 | 20 | **1.0x** | <video src="" width="200px"></video> |
+| **LightX2V_5** | 738 | 16 | **1.05x** | <video src="" width="200px"></video> |
+| **LightX2V_5-Distill** | 68 | 16 | **11.45x** | <video src="" width="200px"></video> |
+| **LightX2V_6** | 630 | 12 | **1.24x** | <video src="" width="200px"></video> |
+| **LightX2V_6-Distill** | 63 | 12 | **🏆 12.36x** | <video src="" width="200px"></video> |
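The speedup column is the baseline Wan2GP(profile=3) time divided by each row's time; for example, 779 s / 63 s ≈ 12.36x for LightX2V_6-Distill. The 720P table below reports `--` because the baseline hits OOM, leaving no reference time.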
+---
+### 🎬 720P 5s Video Test
+**Test Configuration:**
+- **Model**: [Wan2.1-I2V-14B-720P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v)
+- **Parameters**: `infer_steps=40`, `seed=1234`, `enable_cfg=True`
+#### 📊 Performance Comparison Table
+| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
+|:-------------|:-----------------:|:--------------:|:-------:|:------------:|
+| **Wan2GP(profile=3)** | -- | OOM | -- | <video src="" width="200px"></video> |
+| **LightX2V_5** | 2473 | 23 | -- | <video src="" width="200px"></video> |
+| **LightX2V_5-Distill** | 183 | 23 | -- | <video src="" width="200px"></video> |
+| **LightX2V_6** | 2169 | 18 | -- | <video src="" width="200px"></video> |
+| **LightX2V_6-Distill** | 171 | 18 | -- | <video src="" width="200px"></video> |
+---
+## 📖 Configuration Descriptions
+### 🖥️ H200 Environment Configuration Descriptions
-### 480P 5s Video
+| Configuration | Technical Features |
+|:--------------|:------------------|
+| **Wan2.1 Official** | Original implementation from the [Wan2.1 official repository](https://github.com/Wan-Video/Wan2.1) |
+| **FastVideo** | Based on the [FastVideo official repository](https://github.com/hao-ai-lab/FastVideo), using the SageAttention2 backend |
+| **LightX2V_1** | Replaces native attention with SageAttention2 and runs the DiT in BF16 with FP32 for a few sensitive layers, improving efficiency while preserving precision |
+| **LightX2V_2** | Uniform BF16 computation, further reducing memory usage and compute overhead while maintaining generation quality |
+| **LightX2V_3** | Introduces FP8 quantization to cut compute precision requirements, combined with tiled VAE to reduce peak memory |
+| **LightX2V_3-Distill** | LightX2V_3 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality |
+| **LightX2V_4** | LightX2V_3 plus TeaCache (`teacache_thresh=0.2`) cache reuse, accelerating by skipping redundant computation |
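The TeaCache row boils down to a threshold test: a DiT forward pass is skipped, and the previous step's cached residual reused, while the accumulated change in the model input stays below `teacache_thresh`. A minimal sketch of that gating logic (illustrative; the real TeaCache first rescales the raw distance with fitted polynomial coefficients like the ones commented out in the teacache config above):

```python
import torch

class TeaCacheGate:
    """Decides whether to skip a DiT forward pass, TeaCache-style.

    Accumulates the relative change of the model input between steps and
    skips while the running total stays under `thresh`. Sketch only.
    """

    def __init__(self, thresh: float = 0.2):
        self.thresh = thresh
        self.prev_inp = None         # input seen at the last *computed* step
        self.cached_residual = None  # residual produced at that step
        self.accum = 0.0

    def should_skip(self, inp: torch.Tensor) -> bool:
        if self.prev_inp is None:
            return False  # nothing cached yet: must compute
        rel_change = ((inp - self.prev_inp).abs().mean()
                      / self.prev_inp.abs().mean()).item()
        self.accum += rel_change
        if self.accum < self.thresh:
            return True   # inputs barely moved: reuse cached residual
        self.accum = 0.0  # drift exceeded threshold: recompute this step
        return False

    def store(self, inp: torch.Tensor, residual: torch.Tensor) -> None:
        self.prev_inp, self.cached_residual = inp, residual
```

A denoising loop would call `should_skip` at each step, apply `cached_residual` when it returns True, and call `store` after every computed step.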
+### 🖥️ RTX 4090 Environment Configuration Descriptions
+| Configuration | Technical Features |
+|:--------------|:------------------|
+| **Wan2GP(profile=3)** | Based on the [Wan2GP repository](https://github.com/deepbeepmeep/Wan2GP) with MMGP optimization. profile=3 targets RTX 3090/4090 machines with at least 32GB RAM and 24GB VRAM, trading VRAM for limited system RAM. Uses quantized models: [480P model](https://huggingface.co/DeepBeepMeep/Wan2.1/blob/main/wan2.1_image2video_480p_14B_quanto_mbf16_int8.safetensors) and [720P model](https://huggingface.co/DeepBeepMeep/Wan2.1/blob/main/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors) |
+| **LightX2V_5** | Replaces native attention with SageAttention2 and runs the DiT in FP8 with FP32 for a few sensitive layers; enables CPU offload, asynchronously offloading DiT data to the CPU at block granularity to save VRAM |
+| **LightX2V_5-Distill** | LightX2V_5 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality |
+| **LightX2V_6** | LightX2V_3 plus CPU offload: sensitive layers run in FP32 and DiT data is asynchronously offloaded to the CPU at block granularity to save VRAM |
+| **LightX2V_6-Distill** | LightX2V_6 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality |
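The block-level offload mentioned for LightX2V_5/6 keeps only the active DiT block on the GPU and prefetches the next one on a side CUDA stream while the current block computes. A rough sketch of the pattern (illustrative; LightX2V's offload manager, pinned-memory handling, and `offload_ratio` logic are not modeled here):

```python
import torch

def run_blocks_with_offload(blocks, x, device="cuda"):
    """Run CPU-resident transformer blocks with block-level offload.

    While block i computes on the default stream, block i+1 is copied
    host-to-device on a side stream, overlapping transfer with compute.
    Note: truly async H2D copies require pinned CPU memory.
    """
    copy_stream = torch.cuda.Stream()
    with torch.cuda.stream(copy_stream):
        blocks[0] = blocks[0].to(device, non_blocking=True)  # stage first block
    for i in range(len(blocks)):
        torch.cuda.current_stream().wait_stream(copy_stream)  # block i ready
        if i + 1 < len(blocks):
            with torch.cuda.stream(copy_stream):              # prefetch next
                blocks[i + 1] = blocks[i + 1].to(device, non_blocking=True)
        x = blocks[i](x)                 # compute overlaps with the prefetch
        blocks[i] = blocks[i].to("cpu")  # evict finished block to save VRAM
    return x
```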
---
-*Coming soon...*
+## 📁 Configuration Files Reference
-### 720P 5s Video
+Benchmark-related configuration files and execution scripts are available at:
-*Coming soon...*
+| Type | Link | Description |
+|:-----|:-----|:------------|
+| **Configuration Files** | [configs/bench](https://github.com/ModelTC/LightX2V/tree/main/configs/bench) | Contains JSON files for the various optimization configurations |
+| **Execution Scripts** | [scripts/bench](https://github.com/ModelTC/LightX2V/tree/main/scripts/bench) | Contains the benchmark execution scripts |
---
-## Configuration Descriptions
-- **Wan2.1 Official**: Based on the [Wan2.1 official repository](https://github.com/Wan-Video/Wan2.1)
-- **FastVideo**: Based on the [FastVideo official repository](https://github.com/hao-ai-lab/FastVideo), using the SageAttention backend
-- **LightX2V_1**: Replaces native attention with SageAttention2 and runs the DiT in BF16 with FP32 for a few sensitive layers, improving efficiency while preserving precision
-- **LightX2V_2**: Uniform BF16 computation, further reducing memory usage and compute overhead while maintaining generation quality
-- **LightX2V_3**: Introduces FP8 quantization to cut compute precision requirements, combined with tiled VAE to reduce peak memory
-- **LightX2V_3-Distill**: LightX2V_3 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality
-- **LightX2V_4**: LightX2V_3 plus TeaCache (`teacache_thresh=0.2`) cache reuse, accelerating by skipping redundant computation
-- **Configuration Files Reference**: Benchmark-related configuration files and execution scripts are available at:
-  - [Configuration Files](https://github.com/ModelTC/LightX2V/tree/main/configs/bench) - Contains JSON files with various optimization configurations
-  - [Execution Scripts](https://github.com/ModelTC/LightX2V/tree/main/scripts/bench) - Contains benchmark execution scripts
> 💡 **Tip**: It is recommended to choose the appropriate optimization solution based on your hardware configuration to achieve the best performance.
-# Benchmark
+# 🚀 Benchmark
> This document presents the performance test results of LightX2V across different hardware environments, including detailed comparison data for the H200 and RTX 4090 platforms.
---
-## H200 (~140GB VRAM)
+## 🖥️ H200 Environment (~140GB VRAM)
+### 📋 Software Environment Configuration
-**Software Environment:**
-- **Python**: 3.11
-- **PyTorch**: 2.7.1+cu128
-- **SageAttention**: 2.2.0
-- **vLLM**: 0.9.2
-- **sgl-kernel**: 0.1.8
+| Component | Version |
+|:-----|:-----|
+| **Python** | 3.11 |
+| **PyTorch** | 2.7.1+cu128 |
+| **SageAttention** | 2.2.0 |
+| **vLLM** | 0.9.2 |
+| **sgl-kernel** | 0.1.8 |
-### 480P 5s Video
+---
+### 🎬 480P 5s Video Test
**Test Configuration:**
- **Model**: [Wan2.1-I2V-14B-480P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v)
-- **Parameters**: infer_steps=40, seed=42, enable_cfg=True
+- **Parameters**: `infer_steps=40`, `seed=42`, `enable_cfg=True`
-#### Performance Comparison
+#### 📊 Performance Comparison
| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
|:-----|:----------:|:---------------:|:------:|:--------:|
@@ -29,14 +36,15 @@
| **LightX2V_3-Distill** | 14 | 35 | **🏆 20.85x** | <video src="https://github.com/user-attachments/assets/b4dc403c-919d-4ba1-b29f-ef53640c0334" width="200px"></video> |
| **LightX2V_4** | 107 | 35 | **3.41x** | <video src="https://github.com/user-attachments/assets/49cd2760-4be2-432c-bf4e-01af9a1303dd" width="200px"></video> |
-### 720P 5s Video
+---
+### 🎬 720P 5s Video Test
**Test Configuration:**
- **Model**: [Wan2.1-I2V-14B-720P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v)
-- **Parameters**: infer_steps=40, seed=1234, enable_cfg=True
-#### Performance Comparison
+- **Parameters**: `infer_steps=40`, `seed=1234`, `enable_cfg=True`
+#### 📊 Performance Comparison Table
| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
|:-----|:----------:|:---------------:|:------:|:--------:|
@@ -50,27 +58,92 @@
---
-## RTX 4090 (~24GB VRAM)
+## 🖥️ RTX 4090 Environment (~24GB VRAM)
+### 📋 Software Environment Configuration
+| Component | Version |
+|:-----|:-----|
+| **Python** | 3.9.16 |
+| **PyTorch** | 2.5.1+cu124 |
+| **SageAttention** | 2.1.0 |
+| **vLLM** | 0.6.6 |
+| **sgl-kernel** | 0.0.5 |
+| **q8-kernels** | 0.0.0 |
+---
+### 🎬 480P 5s Video Test
+**Test Configuration:**
+- **Model**: [Wan2.1-I2V-14B-480P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v)
+- **Parameters**: `infer_steps=40`, `seed=42`, `enable_cfg=True`
+#### 📊 Performance Comparison Table
+| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
+|:-----|:----------:|:---------------:|:------:|:--------:|
+| **Wan2GP(profile=3)** | 779 | 20 | **1.0x** | <video src="" width="200px"></video> |
+| **LightX2V_5** | 738 | 16 | **1.05x** | <video src="" width="200px"></video> |
+| **LightX2V_5-Distill** | 68 | 16 | **11.45x** | <video src="" width="200px"></video> |
+| **LightX2V_6** | 630 | 12 | **1.24x** | <video src="" width="200px"></video> |
+| **LightX2V_6-Distill** | 63 | 12 | **🏆 12.36x** | <video src="" width="200px"></video> |
-### 480P 5s Video
+---
+### 🎬 720P 5s Video Test
+**Test Configuration:**
+- **Model**: [Wan2.1-I2V-14B-720P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v)
+- **Parameters**: `infer_steps=40`, `seed=1234`, `enable_cfg=True`
+#### 📊 Performance Comparison Table
+| Configuration | Inference Time (s) | GPU Memory (GB) | Speedup | Video Effect |
+|:-----|:----------:|:---------------:|:------:|:--------:|
+| **Wan2GP(profile=3)** | -- | OOM | -- | <video src="" width="200px"></video> |
+| **LightX2V_5** | 2473 | 23 | -- | <video src="" width="200px"></video> |
+| **LightX2V_5-Distill** | 183 | 23 | -- | <video src="" width="200px"></video> |
+| **LightX2V_6** | 2169 | 18 | -- | <video src="" width="200px"></video> |
+| **LightX2V_6-Distill** | 171 | 18 | -- | <video src="" width="200px"></video> |
---
+## 📖 Configuration Descriptions
+### 🖥️ H200 Environment Configuration Descriptions
+| Configuration | Technical Features |
+|:-----|:---------|
+| **Wan2.1 Official** | Original implementation from the [Wan2.1 official repository](https://github.com/Wan-Video/Wan2.1) |
+| **FastVideo** | Based on the [FastVideo official repository](https://github.com/hao-ai-lab/FastVideo), using the SageAttention2 backend |
+| **LightX2V_1** | Replaces native attention with SageAttention2 and runs the DiT in BF16 with FP32 for a few sensitive layers, improving efficiency while preserving precision |
+| **LightX2V_2** | Uniform BF16 computation, further reducing VRAM usage and compute overhead while maintaining generation quality |
+| **LightX2V_3** | Introduces FP8 quantization to cut compute precision requirements, combined with tiled VAE to optimize VRAM usage |
+| **LightX2V_3-Distill** | LightX2V_3 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality |
+| **LightX2V_4** | LightX2V_3 plus TeaCache (`teacache_thresh=0.2`) cache reuse, accelerating by intelligently skipping redundant computation |
+### 🖥️ RTX 4090 Environment Configuration Descriptions
+| Configuration | Technical Features |
+|:-----|:---------|
+| **Wan2GP(profile=3)** | Based on the [Wan2GP repository](https://github.com/deepbeepmeep/Wan2GP) with MMGP optimization. profile=3 targets RTX 3090/4090 machines with at least 32GB RAM and 24GB VRAM, trading VRAM for limited system RAM. Uses quantized models: [480P model](https://huggingface.co/DeepBeepMeep/Wan2.1/blob/main/wan2.1_image2video_480p_14B_quanto_mbf16_int8.safetensors) and [720P model](https://huggingface.co/DeepBeepMeep/Wan2.1/blob/main/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors) |
+| **LightX2V_5** | Replaces native attention with SageAttention2 and runs the DiT in FP8 with FP32 for a few sensitive layers; enables CPU offload, asynchronously offloading DiT inference data to the CPU at block granularity to save VRAM |
+| **LightX2V_5-Distill** | LightX2V_5 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality |
+| **LightX2V_6** | LightX2V_3 plus CPU offload: sensitive layers run in FP32 and DiT inference data is asynchronously offloaded to the CPU at block granularity to save VRAM |
+| **LightX2V_6-Distill** | LightX2V_6 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality |
---
-*Coming soon...*
+## 📁 Configuration Files Reference
-### 720P 5s Video
+Benchmark-related configuration files and execution scripts are available at:
-*Coming soon...*
+| Type | Link | Description |
+|:-----|:-----|:-----|
+| **Configuration Files** | [configs/bench](https://github.com/ModelTC/LightX2V/tree/main/configs/bench) | Contains JSON files for the various optimization configurations |
+| **Execution Scripts** | [scripts/bench](https://github.com/ModelTC/LightX2V/tree/main/scripts/bench) | Contains the benchmark execution scripts |
---
-## Table Descriptions
-- **Wan2.1 Official**: Based on the [Wan2.1 official repository](https://github.com/Wan-Video/Wan2.1)
-- **FastVideo**: Based on the [FastVideo official repository](https://github.com/hao-ai-lab/FastVideo), using the SageAttention backend
-- **LightX2V_1**: Replaces native attention with SageAttention2 and runs the DiT in BF16 with FP32 for a few sensitive layers, improving efficiency while preserving precision
-- **LightX2V_2**: Uniform BF16 computation, further reducing VRAM usage and compute overhead while maintaining generation quality
-- **LightX2V_3**: Introduces FP8 quantization to cut compute precision requirements, combined with tiled VAE to optimize VRAM usage
-- **LightX2V_3-Distill**: LightX2V_3 plus the 4-step distillation model (`infer_steps=4`, `enable_cfg=False`), further reducing inference steps while maintaining generation quality
-- **LightX2V_4**: LightX2V_3 plus TeaCache (`teacache_thresh=0.2`) cache reuse, accelerating by intelligently skipping redundant computation
-- **Configuration Files Reference**: Benchmark-related configuration files and execution scripts are available at:
-  - [Configuration Files](https://github.com/ModelTC/LightX2V/tree/main/configs/bench) - Contains JSON files for the various optimization configurations
-  - [Execution Scripts](https://github.com/ModelTC/LightX2V/tree/main/scripts/bench) - Contains the benchmark execution scripts
> 💡 **Tip**: Choose the optimization scheme that matches your hardware configuration for the best performance.
#!/bin/bash
# set paths first
lightx2v_path=/path/to/lightx2v
model_path=/path/to/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v
# model_path=/path/to/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v
# sanity checks
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.1 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/bench/lightx2v_5.json \
--prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
--negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
--save_video_path ${lightx2v_path}/save_results/lightx2v_5.mp4
#!/bin/bash
# set paths first
lightx2v_path=/path/to/lightx2v
model_path=/path/to/lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v
# model_path=/path/to/lightx2v/Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v
# sanity checks
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.1 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/bench/lightx2v_5_distill.json \
--prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
--negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
--save_video_path ${lightx2v_path}/save_results/lightx2v_5_distill.mp4
#!/bin/bash
# set paths first
lightx2v_path=/path/to/lightx2v
model_path=/path/to/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v
# model_path=/path/to/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v
# sanity checks
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
export DTYPE=BF16
python -m lightx2v.infer \
--model_cls wan2.1 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/bench/lightx2v_6.json \
--prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
--negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
--save_video_path ${lightx2v_path}/save_results/lightx2v_6.mp4
#!/bin/bash
# set paths first
lightx2v_path=/path/to/lightx2v
model_path=/path/to/lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v
# model_path=/path/to/lightx2v/Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v
# sanity checks
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
export DTYPE=BF16
python -m lightx2v.infer \
--model_cls wan2.1 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/bench/lightx2v_6_distill.json \
--prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
--negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
--save_video_path ${lightx2v_path}/save_results/lightx2v_6_distill.mp4