Commit e08c4f90 authored by sandy, committed by GitHub

Merge branch 'main' into audio_r2v

parents 12bfd120 6d07a72e
......@@ -12,11 +12,11 @@
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "phase",
"t5_cpu_offload": true,
"t5_offload_granularity": "block",
"dit_quantized_ckpt": "/path/to/dit_int8",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Q8F",
"weight_auto_quant": false
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Q8F"
},
"use_tiling_vae": true
}
......@@ -12,13 +12,13 @@
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": true,
"t5_cpu_offload": true,
"offload_granularity": "phase",
"dit_quantized_ckpt": "/path/to/dit_int8",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Q8F",
"weight_auto_quant": false
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Q8F"
},
"tiny_vae": true,
"tiny_vae_path": "/mnt/afs_2/gushiqiao/x2v_models/taew2_1.pth",
"tiny_vae_path": "/x2v_models/taew2_1.pth",
"t5_offload_granularity": "block"
}
......@@ -6,9 +6,8 @@
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 0,
"dit_quantized_ckpt": "/mtc/gushiqiao/llmc_workspace/x2v_models/hunyuan/hunyuan_i2v_int8.pth",
"dit_quantized_ckpt": "/path/to/int8/model",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
"weight_auto_quant": true
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm"
}
}
{
"infer_steps": 20,
"target_video_length": 33,
"i2v_resolution": "720p",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 0,
"dit_quantized_ckpt": "/mtc/gushiqiao/llmc_workspace/x2v_models/hunyuan/hunyuan_i2v_int8.pth",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
"weight_auto_quant": false
}
}
......@@ -13,7 +13,6 @@
"cpu_offload": false,
"dit_quantized_ckpt": "/path/to/int8/model",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
"weight_auto_quant": false
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm"
}
}
......@@ -25,7 +25,7 @@ sys.path.append(os.path.abspath("../.."))
# -- Project information -----------------------------------------------------
project = "Lightx2v"
copyright = "2024, Lightx2v Team"
copyright = "2025, Lightx2v Team"
author = "the Lightx2v Team"
# -- General configuration ---------------------------------------------------
......
# ComfyUI Deployment
## ComfyUI-Lightx2vWrapper
The official ComfyUI integration nodes for LightX2V are now available in a dedicated repository, providing a complete modular configuration system and optimization features.
### Project Repository
- GitHub: [https://github.com/ModelTC/ComfyUI-Lightx2vWrapper](https://github.com/ModelTC/ComfyUI-Lightx2vWrapper)
### Key Features
- Modular Configuration System: Separate nodes for each aspect of video generation
- Support for both Text-to-Video (T2V) and Image-to-Video (I2V) generation modes
- Advanced Optimizations:
- TeaCache acceleration (up to 3x speedup)
- Quantization support (int8, fp8)
- Memory optimization with CPU offloading
- Lightweight VAE options
- LoRA Support: Chain multiple LoRA models for customization
- Multiple Model Support: wan2.1, hunyuan architectures
### Installation and Usage
Please visit the GitHub repository above for detailed installation instructions, usage tutorials, and example workflows.
# Gradio Deployment
## 📖 Overview
Lightx2v is a lightweight video inference and generation engine that provides a web interface based on Gradio, supporting both Image-to-Video and Text-to-Video generation modes.
This project contains two main demo files:
- `gradio_demo.py` - English interface version
- `gradio_demo_zh.py` - Chinese interface version
## 🚀 Quick Start
### System Requirements
- Python 3.10+ (recommended)
- CUDA 12.4+ (recommended)
- At least 8GB GPU VRAM
- At least 16GB system memory (preferably at least 32GB)
- At least 128GB of SSD storage (**💾 Strongly recommended: store model files on an SSD; this significantly improves model loading speed and inference performance during "lazy loading" startup**)
### Install Dependencies
```bash
# Install basic dependencies
pip install -r requirements.txt
pip install gradio
```
#### Recommended Optimization Libraries
- [Flash attention](https://github.com/Dao-AILab/flash-attention)
- [Sage attention](https://github.com/thu-ml/SageAttention)
- [vllm-kernel](https://github.com/vllm-project/vllm)
- [sglang-kernel](https://github.com/sgl-project/sglang/tree/main/sgl-kernel)
- [q8-kernel](https://github.com/KONAKONA666/q8_kernels) (only supports ADA architecture GPUs)
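If you are unsure which of these optional libraries are present in your environment, a quick import probe can tell you. This is a minimal sketch; the module names are assumptions based on each project's published packages and may differ for custom builds.
```python
# Probe which optional acceleration libraries are importable in this environment.
# Module names are assumptions based on each project's published wheels.
import importlib.util

optional_kernels = {
    "Flash attention": "flash_attn",
    "Sage attention": "sageattention",
    "vllm-kernel": "vllm",
    "sglang-kernel": "sgl_kernel",
    "q8-kernel": "q8_kernels",
}

for name, module in optional_kernels.items():
    found = importlib.util.find_spec(module) is not None
    print(f"{name:>15}: {'installed' if found else 'not installed'}")
```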
### 🤖 Supported Models
#### 🎬 Image-to-Video Models
| Model Name | Resolution | Parameters | Features | Recommended Use |
|------------|------------|------------|----------|-----------------|
| ✅ [Wan2.1-I2V-14B-480P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v) | 480p | 14B | Standard version | Balance speed and quality |
| ✅ [Wan2.1-I2V-14B-720P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v) | 720p | 14B | HD version | Pursue high-quality output |
| ✅ [Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v) | 480p | 14B | Distilled optimized version | Faster inference speed |
| ✅ [Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v) | 720p | 14B | HD distilled version | High quality + fast inference |
#### 📝 Text-to-Video Models
| Model Name | Parameters | Features | Recommended Use |
|------------|------------|----------|-----------------|
| ✅ [Wan2.1-T2V-1.3B-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-T2V-1.3B-Lightx2v) | 1.3B | Lightweight | Fast prototyping and testing |
| ✅ [Wan2.1-T2V-14B-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-T2V-14B-Lightx2v) | 14B | Standard version | Balance speed and quality |
| ✅ [Wan2.1-T2V-14B-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-T2V-14B-StepDistill-CfgDistill-Lightx2v) | 14B | Distilled optimized version | High quality + fast inference |
**💡 Model Selection Recommendations**:
- **First-time use**: Recommend choosing distilled versions
- **Pursuing quality**: Choose 720p resolution or 14B parameter models
- **Pursuing speed**: Choose 480p resolution or 1.3B parameter models
- **Resource-constrained**: Prioritize distilled versions and lower resolutions
### Startup Methods
#### Method 1: Using Startup Script (Recommended)
```bash
# 1. Edit the startup script to configure relevant paths
cd app/
vim run_gradio.sh
# Configuration items that need to be modified:
# - lightx2v_path: Lightx2v project root directory path
# - i2v_model_path: Image-to-video model path
# - t2v_model_path: Text-to-video model path
# 💾 Important note: Recommend pointing model paths to SSD storage locations
# Example: /mnt/ssd/models/ or /data/ssd/models/
# 2. Run the startup script
bash run_gradio.sh
# 3. Or start with parameters (recommended)
bash run_gradio.sh --task i2v --lang en --model_size 14b --port 8032
# bash run_gradio.sh --task i2v --lang en --model_size 14b --port 8032
# bash run_gradio.sh --task i2v --lang en --model_size 1.3b --port 8032
```
#### Method 2: Direct Command Line Startup
**Image-to-Video Mode:**
```bash
python gradio_demo.py \
--model_path /path/to/Wan2.1-I2V-14B-720P-Lightx2v \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
```
**Text-to-Video Mode:**
```bash
python gradio_demo.py \
--model_path /path/to/Wan2.1-T2V-1.3B \
--model_size 1.3b \
--task t2v \
--server_name 0.0.0.0 \
--server_port 7862
```
**Chinese Interface Version:**
```bash
python gradio_demo_zh.py \
--model_path /path/to/model \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
```
## 📋 Command Line Parameters
| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `--model_path` | str | ✅ | - | Model folder path |
| `--model_cls` | str | ❌ | wan2.1 | Model class (currently only supports wan2.1) |
| `--model_size` | str | ✅ | - | Model size: `14b(t2v or i2v)` or `1.3b(t2v)` |
| `--task` | str | ✅ | - | Task type: `i2v` (image-to-video) or `t2v` (text-to-video) |
| `--server_port` | int | ❌ | 7862 | Server port |
| `--server_name` | str | ❌ | 0.0.0.0 | Server IP address |
## 🎯 Features
### Basic Settings
#### Input Parameters
- **Prompt**: Describe the expected video content
- **Negative Prompt**: Specify elements you don't want to appear
- **Resolution**: Supports multiple preset resolutions (480p/540p/720p)
- **Random Seed**: Controls the randomness of generation results
- **Inference Steps**: Affects the balance between generation quality and speed
#### Video Parameters
- **FPS**: Frames per second
- **Total Frames**: Video length
- **CFG Scale Factor**: Controls prompt influence strength (1-10)
- **Distribution Shift**: Controls generation style deviation degree (0-10)
### Advanced Optimization Options
#### GPU Memory Optimization
- **Chunked Rotary Position Embedding**: Saves GPU memory
- **Rotary Embedding Chunk Size**: Controls chunk granularity
- **Clean CUDA Cache**: Promptly frees GPU memory
#### Asynchronous Offloading
- **CPU Offloading**: Transfers partial computation to CPU
- **Lazy Loading**: Loads model components on-demand, significantly reduces system memory consumption
- **Offload Granularity Control**: Fine-grained control of offloading strategies
#### Low-Precision Quantization
- **Attention Operators**: Flash Attention, Sage Attention, etc.
- **Quantization Operators**: vLLM, SGL, Q8F, etc.
- **Precision Modes**: FP8, INT8, BF16, etc.
#### VAE Optimization
- **Lightweight VAE**: Accelerates decoding process
- **VAE Tiling Inference**: Reduces memory usage
#### Feature Caching
- **Tea Cache**: Caches intermediate features to accelerate generation
- **Cache Threshold**: Controls cache trigger conditions
- **Key Step Caching**: Writes cache only at key steps
## 🔧 Auto-Configuration Feature
After enabling "Auto-configure Inference Options", the system will automatically optimize parameters based on your hardware configuration:
### GPU Memory Rules
- **80GB+**: Default configuration, no optimization needed
- **48GB**: Enable CPU offloading, offload ratio 50%
- **40GB**: Enable CPU offloading, offload ratio 80%
- **32GB**: Enable CPU offloading, offload ratio 100%
- **24GB**: Enable BF16 precision, VAE tiling
- **16GB**: Enable chunked offloading, rotary embedding chunking
- **12GB**: Enable cache cleaning, lightweight VAE
- **8GB**: Enable quantization, lazy loading
### CPU Memory Rules
- **128GB+**: Default configuration
- **64GB**: Enable DIT quantization
- **32GB**: Enable lazy loading
- **16GB**: Enable full model quantization
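The GPU memory rules above amount to a simple threshold table. The sketch below illustrates the idea only; the option names are illustrative (several appear in the low-resource guide) and are not guaranteed to match the demo's internal configuration keys.
```python
# Illustrative sketch of VRAM-based auto-configuration following the GPU memory
# rules listed above. Option names are illustrative, not the demo's actual keys.
def auto_configure(gpu_mem_gb: float) -> dict:
    opts: dict = {}
    if gpu_mem_gb >= 80:
        return opts                                   # default configuration
    if gpu_mem_gb >= 48:
        opts.update(cpu_offload=True, offload_ratio=0.5)
    elif gpu_mem_gb >= 40:
        opts.update(cpu_offload=True, offload_ratio=0.8)
    elif gpu_mem_gb >= 32:
        opts.update(cpu_offload=True, offload_ratio=1.0)
    elif gpu_mem_gb >= 24:
        opts.update(dtype="BF16", use_tiling_vae=True)
    elif gpu_mem_gb >= 16:
        opts.update(offload_granularity="block", rotary_chunk=True)
    elif gpu_mem_gb >= 12:
        opts.update(clean_cuda_cache=True, use_tiny_vae=True)
    else:                                             # around 8 GB
        opts.update(dit_quant_scheme="int8", lazy_load=True)
    return opts

print(auto_configure(24))   # {'dtype': 'BF16', 'use_tiling_vae': True}
```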
## ⚠️ Important Notes
### 🚀 Low-Resource Device Optimization Recommendations
**💡 For devices with insufficient VRAM or performance constraints**:
- **🎯 Model Selection**: Prioritize using distilled version models (StepDistill-CfgDistill)
- **⚡ Inference Steps**: Recommend setting to 4 steps
- **🔧 CFG Settings**: Recommend disabling CFG option to improve generation speed
- **🔄 Auto-Configuration**: Enable "Auto-configure Inference Options"
## 📁 File Structure
```
lightx2v/app/
├── gradio_demo.py # English interface demo
├── gradio_demo_zh.py # Chinese interface demo
├── run_gradio.sh # Startup script
├── README.md # Documentation
├── saved_videos/ # Generated video save directory
└── inference_logs.log # Inference logs
```
## 🎨 Interface Description
### Basic Settings Tab
- **Input Parameters**: Prompts, resolution, and other basic settings
- **Video Parameters**: FPS, frame count, CFG, and other video generation parameters
- **Output Settings**: Video save path configuration
### Advanced Options Tab
- **GPU Memory Optimization**: Memory management related options
- **Asynchronous Offloading**: CPU offloading and lazy loading
- **Low-Precision Quantization**: Various quantization optimization options
- **VAE Optimization**: Variational Autoencoder optimization
- **Feature Caching**: Cache strategy configuration
## 🔍 Troubleshooting
### Common Issues
**💡 Tip**: Generally, after enabling "Auto-configure Inference Options", the system will automatically optimize parameter settings based on your hardware configuration, and performance issues usually won't occur. If you encounter problems, please refer to the following solutions:
1. **CUDA Memory Insufficient**
- Enable CPU offloading
- Reduce resolution
- Enable quantization options
2. **System Memory Insufficient**
- Enable CPU offloading
- Enable lazy loading option
- Enable quantization options
3. **Slow Generation Speed**
- Reduce inference steps
- Enable auto-configuration
- Use lightweight models
- Enable Tea Cache
- Use quantization operators
- 💾 **Check if models are stored on SSD**
4. **Slow Model Loading**
- 💾 **Migrate models to SSD storage**
- Enable lazy loading option
- Check disk I/O performance
- Consider using NVMe SSD
5. **Poor Video Quality**
- Increase inference steps
- Increase CFG scale factor
- Use 14B models
- Optimize prompts
### Log Viewing
```bash
# View inference logs
tail -f inference_logs.log
# View GPU usage
nvidia-smi
# View system resources
htop
```
**Note**: Please comply with relevant laws and regulations when using videos generated by this tool, and do not use them for illegal purposes.
# Local Windows Deployment Guide
This document provides detailed instructions for deploying LightX2V locally on Windows environments.
## System Requirements
Before getting started, please ensure your system meets the following requirements:
- **Operating System**: Windows 10/11
- **Graphics Card**: NVIDIA GPU (with CUDA support)
- **VRAM**: At least 8GB VRAM
- **Memory**: At least 16GB RAM
- **Storage**: 20GB+ available disk space
- **Environment Manager**: Anaconda or Miniconda installed
- **Network Tools**: Git (for cloning repositories)
## Deployment Steps
### Step 1: Check CUDA Version
First, verify your GPU driver and CUDA version by running the following command in Command Prompt:
```bash
nvidia-smi
```
Note the **CUDA Version** displayed in the output, as you'll need to match this version during subsequent installations.
### Step 2: Create Python Environment
Create an isolated conda environment; Python 3.12 is recommended:
```bash
# Create new environment (using Python 3.12 as example)
conda create -n lightx2v python=3.12 -y
# Activate environment
conda activate lightx2v
```
> 💡 **Tip**: Python 3.10 or higher is recommended for optimal compatibility.
### Step 3: Install PyTorch Framework
#### Method 1: Download Official Wheel Packages (Recommended)
1. Visit the [PyTorch Official Wheel Download Page](https://download.pytorch.org/whl/torch/)
2. Select the appropriate wheel package, ensuring you match:
- **Python Version**: Must match your environment (cp312 means Python 3.12)
- **CUDA Version**: Must match your GPU driver
- **Platform**: Choose Windows version (win_amd64)
**Example for Python 3.12 + PyTorch 2.6 + CUDA 12.4:**
```
torch-2.6.0+cu124-cp312-cp312-win_amd64.whl
```
After downloading, install the packages:
```bash
# Install PyTorch (replace with actual file path)
pip install torch-2.6.0+cu124-cp312-cp312-win_amd64.whl
# Install accompanying vision and audio packages
pip install torchvision==0.21.0 torchaudio==2.6.0
```
#### Method 2: Direct pip Installation
If you prefer direct installation, use the following command:
```bash
# Example: CUDA 12.4 version
pip install torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
```
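Whichever installation method you use, a short check confirms that the CUDA build of PyTorch is active before moving on:
```python
# Verify that the CUDA-enabled PyTorch build is installed and can see the GPU.
import torch

print(torch.__version__)              # expect something like 2.6.0+cu124
print(torch.cuda.is_available())      # should print True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```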
### Step 4: Install Windows Version vLLM
Download the corresponding wheel package from the [vllm-windows releases page](https://github.com/SystemPanic/vllm-windows/releases).
**Version Matching Requirements:**
- Python version must match (e.g., cp312)
- PyTorch version must match
- CUDA version must match
**Recommended v0.9.1 Installation:**
```bash
pip install vllm-0.9.1+cu124-cp312-cp312-win_amd64.whl
```
> ⚠️ **Note**: Please select the appropriate wheel package filename based on your specific environment.
### Step 5: Install Attention Mechanism Operators
You can choose to install either Flash Attention 2 or SageAttention 2. **SageAttention 2 is strongly recommended**.
#### Option A: Flash Attention 2
```bash
pip install flash-attn==2.7.2.post1
```
#### Option B: SageAttention 2 (Recommended)
**Download Sources:**
- [Windows Version 1](https://github.com/woct0rdho/SageAttention/releases)
- [Windows Version 2](https://github.com/sdbds/SageAttention-for-windows/releases)
**Version Selection Guidelines:**
- Python version must match
- PyTorch version must match
- **CUDA version can be flexible** (SageAttention does not yet rely on CUDA APIs that change incompatibly across versions)
**Recommended Installation Version:**
```bash
pip install sageattention-2.1.1+cu126torch2.6.0-cp312-cp312-win_amd64.whl
```
**Verify SageAttention Installation:**
After installation, we recommend running a quick verification to ensure SageAttention functions properly.
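A minimal smoke test along the following lines can be used; the tensor shapes are arbitrary, and the exact `sageattn` signature may differ slightly between releases:
```python
# Minimal SageAttention smoke test (sketch). Shapes are arbitrary:
# (batch, heads, seq_len, head_dim) in fp16 on the GPU.
import torch
from sageattention import sageattn

q = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")
k = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")
v = torch.randn(1, 8, 1024, 64, dtype=torch.float16, device="cuda")

out = sageattn(q, k, v, tensor_layout="HND", is_causal=False)
print(out.shape)  # expected: torch.Size([1, 8, 1024, 64])
```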
> 📝 **Testing**: You can also run the [official test script](https://github.com/woct0rdho/SageAttention/blob/main/tests/test_sageattn.py) for more detailed functionality verification.
### Step 6: Get LightX2V Project Code
Clone the LightX2V project from GitHub and install Windows-specific dependencies:
```bash
# Clone project code
git clone https://github.com/ModelTC/LightX2V.git
# Enter project directory
cd LightX2V
# Install Windows-specific dependencies
pip install -r requirements_win.txt
```
> 🔍 **Note**: We use `requirements_win.txt` instead of the standard `requirements.txt` because Windows environments may require specific package versions or additional dependencies.
## Troubleshooting
### 1. CUDA Version Mismatch
**Symptoms**: CUDA-related errors occur
**Solutions**:
- Verify GPU driver supports required CUDA version
- Re-download matching wheel packages
- Use `nvidia-smi` to check maximum supported CUDA version
### 2. Dependency Conflicts
**Symptoms**: Package version conflicts or import errors
**Solutions**:
- Remove existing environment: `conda env remove -n lightx2v`
- Recreate environment and install dependencies strictly by version requirements
- Use virtual environments to isolate dependencies for different projects
### 3. Wheel Package Download Issues
**Symptoms**: Slow download speeds or connection failures
**Solutions**:
- Use download tools or browser for direct downloads
- Use a regional mirror source
- Check network connections and firewall settings
## Next Steps
After completing the environment setup, you can:
- 📚 Check the [Quick Start Guide](../getting_started/quickstart.md) (skip environment installation steps)
- 🌐 Use the [Gradio Web Interface](./deploy_gradio.md) for visual operations (skip environment installation steps)
## Version Compatibility Reference
| Component | Recommended Version |
|-----------|-------------------|
| Python | 3.12 |
| PyTorch | 2.6.0+cu124 |
| vLLM | 0.9.1+cu124 |
| SageAttention | 2.1.1+cu126torch2.6.0 |
| CUDA | 12.4+ |
---
💡 **Pro Tip**: If you encounter other issues, we recommend first checking whether all component versions match properly, as most problems stem from version incompatibilities.
# Lightx2v Low-Resource Deployment Guide
## 📋 Overview
This guide is specifically designed for hardware resource-constrained environments, particularly configurations with **8GB VRAM + 16/32GB RAM**, providing detailed instructions on how to successfully run Lightx2v 14B models for 480p and 720p video generation.
Lightx2v is a powerful video generation model, but it requires careful optimization to run smoothly in resource-constrained environments. This guide provides a complete solution from hardware selection to software configuration, ensuring you can achieve the best video generation experience under limited hardware conditions.
## 🎯 Target Hardware Configuration
### Recommended Hardware Specifications
**GPU Requirements**:
- **VRAM**: 8GB (RTX 3060/3070/4060/4060Ti, etc.)
- **Architecture**: NVIDIA graphics cards with CUDA support
**System Memory**:
- **Minimum**: 16GB DDR4
- **Recommended**: 32GB DDR4/DDR5
- **Memory Speed**: 3200MHz or higher recommended
**Storage Requirements**:
- **Type**: NVMe SSD strongly recommended
- **Capacity**: At least 50GB available space
- **Speed**: Read speed of 3000MB/s or higher recommended
**CPU Requirements**:
- **Cores**: 8 cores or more recommended
- **Frequency**: 3.0GHz or higher recommended
- **Architecture**: Support for AVX2 instruction set
## ⚙️ Core Optimization Strategies
### 1. Environment Optimization
Before running Lightx2v, it's recommended to set the following environment variables to optimize performance:
```bash
# CUDA memory allocation optimization
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Enable CUDA Graph mode to improve inference performance
export ENABLE_GRAPH_MODE=true
# Use BF16 precision for inference to reduce VRAM usage (default FP32 precision)
export DTYPE=BF16
```
**Optimization Details**:
- `expandable_segments:True`: Allows dynamic expansion of CUDA memory segments, reducing memory fragmentation
- `ENABLE_GRAPH_MODE=true`: Enables CUDA Graph to reduce kernel launch overhead
- `DTYPE=BF16`: Uses BF16 precision to reduce VRAM usage while maintaining quality
### 2. Quantization Strategy
Quantization is a key optimization technique in low-resource environments, reducing memory usage by lowering model precision.
#### Quantization Scheme Comparison
**FP8 Quantization** (Recommended for RTX 40 series):
```python
# Suitable for GPUs supporting FP8, providing better precision
dit_quant_scheme = "fp8" # DIT model quantization
t5_quant_scheme = "fp8" # T5 text encoder quantization
clip_quant_scheme = "fp8" # CLIP visual encoder quantization
```
**INT8 Quantization** (Universal solution):
```python
# Suitable for all GPUs, minimal memory usage
dit_quant_scheme = "int8" # 8-bit integer quantization
t5_quant_scheme = "int8" # Text encoder quantization
clip_quant_scheme = "int8" # Visual encoder quantization
```
### 3. Efficient Operator Selection Guide
Choosing the right operators can significantly improve inference speed and reduce memory usage.
#### Attention Operator Selection
**Recommended Priority**:
1. **[Sage Attention](https://github.com/thu-ml/SageAttention)** (Highest priority)
2. **[Flash Attention](https://github.com/Dao-AILab/flash-attention)** (Universal solution)
#### Matrix Multiplication Operator Selection
**ADA Architecture GPUs** (RTX 40 series):
Recommended priority:
1. **[q8-kernel](https://github.com/KONAKONA666/q8_kernels)** (Highest performance, ADA architecture only)
2. **[sglang-kernel](https://github.com/sgl-project/sglang/tree/main/sgl-kernel)** (Balanced solution)
3. **[vllm-kernel](https://github.com/vllm-project/vllm)** (Universal solution)
**Other Architecture GPUs**:
1. **[sglang-kernel](https://github.com/sgl-project/sglang/tree/main/sgl-kernel)** (Recommended)
2. **[vllm-kernel](https://github.com/vllm-project/vllm)** (Alternative)
### 4. Parameter Offloading Strategy
Parameter offloading technology allows models to dynamically schedule parameters between CPU and disk, breaking through VRAM limitations.
#### Three-Level Offloading Architecture
```python
# Disk-CPU-GPU three-level offloading configuration
cpu_offload = True                  # Enable CPU offloading
t5_cpu_offload = True               # Enable T5 encoder CPU offloading
offload_granularity = "phase"       # DIT model fine-grained offloading
t5_offload_granularity = "block"    # T5 encoder fine-grained offloading
lazy_load = True                    # Enable lazy loading mechanism
num_disk_workers = 2                # Disk I/O worker threads
```
#### Offloading Strategy Details
**Lazy Loading Mechanism**:
- Model parameters are loaded from disk to CPU on demand
- Reduces runtime memory usage
- Supports large models running with limited memory
**Disk Storage Optimization**:
- Use high-speed SSD to store model parameters
- Store model files grouped by blocks
- Refer to conversion script [documentation](https://github.com/ModelTC/lightx2v/tree/main/tools/convert/readme.md), specify `--save_by_block` parameter during conversion
### 5. VRAM Optimization Techniques
VRAM optimization strategies for 720p video generation.
#### CUDA Memory Management
```python
# CUDA memory cleanup configuration
clean_cuda_cache = True # Timely cleanup of GPU cache
rotary_chunk = True # Rotary position encoding chunked computation
rotary_chunk_size = 100 # Chunk size, adjustable based on VRAM
```
#### Chunked Computation Strategy
**Rotary Position Encoding Chunking**:
- Process long sequences in small chunks
- Reduce peak VRAM usage
- Maintain computational precision
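As an illustration of the chunking idea above (not LightX2V's actual kernel), rotary embedding can be applied slice by slice so that only one chunk's intermediate tensors are resident at a time:
```python
# Illustrative chunked rotary embedding: process the sequence in slices of
# `chunk_size` positions to cap peak memory. Not LightX2V's actual implementation.
import torch

def apply_rotary_chunked(x, cos, sin, chunk_size=100):
    # x: (seq_len, heads, head_dim); cos/sin: (seq_len, 1, head_dim), interleaved layout
    out = torch.empty_like(x)
    for start in range(0, x.shape[0], chunk_size):
        end = min(start + chunk_size, x.shape[0])
        xc = x[start:end]
        x1, x2 = xc[..., ::2], xc[..., 1::2]
        rotated = torch.stack((-x2, x1), dim=-1).flatten(-2)
        out[start:end] = xc * cos[start:end] + rotated * sin[start:end]
    return out
```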
### 6. VAE Optimization
VAE (Variational Autoencoder) is a key component in video generation, and optimizing VAE can significantly improve performance.
#### VAE Chunked Inference
```python
# VAE optimization configuration
use_tiling_vae = True # Enable VAE chunked inference
```
#### [Lightweight VAE](https://github.com/madebyollin/taehv/blob/main/taew2_1.pth)
```python
# VAE optimization configuration
use_tiny_vae = True # Use lightweight VAE
```
**VAE Optimization Effects**:
- Standard VAE: Baseline performance, 100% quality retention
- Standard VAE chunked: Reduces VRAM usage, increases inference time, 100% quality retention
- Lightweight VAE: Extremely low VRAM usage, video quality loss
### 7. Model Selection Strategy
Choosing the right model version is crucial for low-resource environments.
#### Recommended Model Comparison
**Distilled Models** (Strongly recommended):
- **[Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v)**
- **[Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v)**
#### Performance Optimization Suggestions
When using the above distilled models, you can further optimize performance:
- Disable CFG: `"enable_cfg": false`
- Reduce inference steps: `"infer_steps": 4`
- Reference configuration files: [config](https://github.com/ModelTC/LightX2V/tree/main/configs/distill)
## 🚀 Complete Configuration Examples
### Pre-configured Templates
- **[14B Model 480p Video Generation Configuration](https://github.com/ModelTC/lightx2v/tree/main/configs/offload/disk/wan_i2v_phase_lazy_load_480p.json)**
- **[14B Model 720p Video Generation Configuration](https://github.com/ModelTC/lightx2v/tree/main/configs/offload/disk/wan_i2v_phase_lazy_load_720p.json)**
- **[1.3B Model 720p Video Generation Configuration](https://github.com/ModelTC/LightX2V/tree/main/configs/offload/block/wan_t2v_1_3b.json)**
- The inference bottleneck for 1.3B models is the T5 encoder, so the configuration file specifically optimizes for T5
**[Launch Script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_lazy_load.sh)**
## 📚 Reference Resources
- [Parameter Offloading Mechanism Documentation](../method_tutorials/offload.md) - In-depth understanding of offloading technology principles
- [Quantization Technology Guide](../method_tutorials/quantization.md) - Detailed explanation of quantization technology
- [Gradio Deployment Guide](deploy_gradio.md) - Detailed Gradio deployment instructions
## ⚠️ Important Notes
1. **Hardware Requirements**: Ensure your hardware meets minimum configuration requirements
2. **Driver Version**: Recommend using the latest NVIDIA drivers (535+)
3. **CUDA Version**: Ensure CUDA version is compatible with PyTorch (recommend CUDA 11.8+)
4. **Storage Space**: Reserve sufficient disk space for model caching (at least 50GB)
5. **Network Environment**: Stable network connection required for initial model download
6. **Environment Variables**: Be sure to set the recommended environment variables to optimize performance
**Technical Support**: If you encounter issues, please submit an Issue to the project repository.
# LoRA Model Deployment and Related Tools
LoRA (Low-Rank Adaptation) is an efficient model fine-tuning technique that significantly reduces the number of trainable parameters through low-rank matrix decomposition. LightX2V fully supports LoRA technology, including LoRA inference, LoRA extraction, and LoRA merging functions.
## 🎯 LoRA Technical Features
- **Efficient Fine-tuning**: Dramatically reduces training parameters through low-rank adaptation
- **Flexible Deployment**: Supports dynamic loading and removal of LoRA weights
- **Multiple Formats**: Supports various LoRA weight formats and naming conventions
- **Comprehensive Tools**: Provides complete LoRA extraction and merging toolchain
## 📜 LoRA Inference Deployment
### Configuration File Method
Specify LoRA path in configuration file:
```json
{
"lora_configs": [
{
"path": "/path/to/your/lora.safetensors",
"strength": 1.0
}
]
}
```
**Configuration Parameter Description:**
- `lora_configs`: List of LoRA entries; multiple LoRAs can be loaded simultaneously
- `path`: Path to a LoRA weight file
- `strength`: LoRA strength coefficient (alpha), controls the LoRA's influence on the original model
### Command Line Method
Specify LoRA path directly in command line (supports loading single LoRA only):
```bash
python -m lightx2v.infer \
--model_cls wan2.1 \
--task t2v \
--model_path /path/to/model \
--config_json /path/to/config.json \
--lora_path /path/to/your/lora.safetensors \
--lora_strength 0.8 \
--prompt "Your prompt here"
```
### Multiple LoRAs Configuration
To use multiple LoRAs with different strengths, specify them in the config JSON file:
```json
{
"lora_configs": [
{
"path": "/path/to/first_lora.safetensors",
"strength": 0.8
},
{
"path": "/path/to/second_lora.safetensors",
"strength": 0.5
}
]
}
```
### Supported LoRA Formats
LightX2V supports multiple LoRA weight naming conventions:
| Format Type | Weight Naming | Description |
|-------------|---------------|-------------|
| **Standard LoRA** | `lora_A.weight`, `lora_B.weight` | Standard LoRA matrix decomposition format |
| **Down/Up Format** | `lora_down.weight`, `lora_up.weight` | Another common naming convention |
| **Diff Format** | `diff` | `weight` difference values |
| **Bias Diff** | `diff_b` | `bias` weight difference values |
| **Modulation Diff** | `diff_m` | `modulation` weight difference values |
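Regardless of the naming convention, the standard low-rank formats reduce to the same update: the effective weight is the base weight plus a scaled product of the two LoRA matrices (`lora_down`/`lora_up` correspond to `lora_A`/`lora_B`). A minimal sketch, assuming the standard `lora_A`/`lora_B` convention:
```python
# Apply a standard-format LoRA delta to a single base weight (sketch).
# W' = W + strength * (B @ A), where A: (rank, in) and B: (out, rank).
import torch

def apply_lora(weight, lora_A, lora_B, strength=1.0):
    # weight: (out, in); lora_A: (rank, in); lora_B: (out, rank)
    return weight + strength * (lora_B @ lora_A)

W = torch.randn(4096, 4096)
A = 0.01 * torch.randn(32, 4096)   # lora_A.weight / lora_down.weight (rank 32)
B = 0.01 * torch.randn(4096, 32)   # lora_B.weight / lora_up.weight
print(apply_lora(W, A, B, strength=0.8).shape)
```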
### Inference Script Examples
**Step Distillation LoRA Inference:**
```bash
# T2V LoRA Inference
bash scripts/wan/run_wan_t2v_distill_4step_cfg_lora.sh
# I2V LoRA Inference
bash scripts/wan/run_wan_i2v_distill_4step_cfg_lora.sh
```
**Audio-Driven LoRA Inference:**
```bash
bash scripts/wan/run_wan_i2v_audio.sh
```
### Using LoRA in API Service
Specify the LoRA through the [config file](wan_t2v_distill_4step_cfg_lora.json) and modify the startup command in [scripts/server/start_server.sh](https://github.com/ModelTC/lightx2v/blob/main/scripts/server/start_server.sh):
```bash
python -m lightx2v.api_server \
--model_cls wan2.1_distill \
--task t2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/distill/wan_t2v_distill_4step_cfg_lora.json \
--port 8000 \
--nproc_per_node 1
```
## 🔧 LoRA Extraction Tool
Use `tools/extract/lora_extractor.py` to extract LoRA weights from the difference between two models.
### Basic Usage
```bash
python tools/extract/lora_extractor.py \
--source-model /path/to/base/model \
--target-model /path/to/finetuned/model \
--output /path/to/extracted/lora.safetensors \
--rank 32
```
### Parameter Description
| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `--source-model` | str | ✅ | - | Base model path |
| `--target-model` | str | ✅ | - | Fine-tuned model path |
| `--output` | str | ✅ | - | Output LoRA file path |
| `--source-type` | str | ❌ | `safetensors` | Base model format (`safetensors`/`pytorch`) |
| `--target-type` | str | ❌ | `safetensors` | Fine-tuned model format (`safetensors`/`pytorch`) |
| `--output-format` | str | ❌ | `safetensors` | Output format (`safetensors`/`pytorch`) |
| `--rank` | int | ❌ | `32` | LoRA rank value |
| `--output-dtype` | str | ❌ | `bf16` | Output data type |
| `--diff-only` | bool | ❌ | `False` | Save weight differences only, without LoRA decomposition |
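Conceptually, extraction computes the weight difference between the fine-tuned and base models and factorizes it into two low-rank matrices via truncated SVD. A minimal sketch of that core step (not the tool's actual code):
```python
# Core idea behind LoRA extraction (sketch): truncated SVD of the weight diff.
import torch

def extract_lora(base_w, tuned_w, rank=32):
    diff = (tuned_w - base_w).float()             # delta W, shape (out, in)
    U, S, Vh = torch.linalg.svd(diff, full_matrices=False)
    U, S, Vh = U[:, :rank], S[:rank], Vh[:rank, :]
    lora_B = U * S.sqrt()                         # (out, rank)
    lora_A = S.sqrt().unsqueeze(1) * Vh           # (rank, in)
    return lora_A, lora_B                         # diff ≈ lora_B @ lora_A

base = torch.randn(1024, 1024)
tuned = base + 0.01 * torch.randn(1024, 1024)
A, B = extract_lora(base, tuned, rank=32)
print((B @ A - (tuned - base)).abs().mean())      # approximation error
```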
### Advanced Usage Examples
**Extract High-Rank LoRA:**
```bash
python tools/extract/lora_extractor.py \
--source-model /path/to/base/model \
--target-model /path/to/finetuned/model \
--output /path/to/high_rank_lora.safetensors \
--rank 64 \
--output-dtype fp16
```
**Save Weight Differences Only:**
```bash
python tools/extract/lora_extractor.py \
--source-model /path/to/base/model \
--target-model /path/to/finetuned/model \
--output /path/to/weight_diff.safetensors \
--diff-only
```
## 🔀 LoRA Merging Tool
Use `tools/extract/lora_merger.py` to merge LoRA weights into the base model for subsequent quantization and other operations.
### Basic Usage
```bash
python tools/extract/lora_merger.py \
--source-model /path/to/base/model \
--lora-model /path/to/lora.safetensors \
--output /path/to/merged/model.safetensors \
--alpha 1.0
```
### Parameter Description
| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `--source-model` | str | ✅ | - | Base model path |
| `--lora-model` | str | ✅ | - | LoRA weights path |
| `--output` | str | ✅ | - | Output merged model path |
| `--source-type` | str | ❌ | `safetensors` | Base model format |
| `--lora-type` | str | ❌ | `safetensors` | LoRA weights format |
| `--output-format` | str | ❌ | `safetensors` | Output format |
| `--alpha` | float | ❌ | `1.0` | LoRA merge strength |
| `--output-dtype` | str | ❌ | `bf16` | Output data type |
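Merging walks the base state dict and folds each matched LoRA pair back into its target weight at strength `alpha`. A minimal sketch under the standard `lora_A`/`lora_B` naming; real checkpoints may use different key prefixes:
```python
# Merge LoRA pairs into a base state dict at strength `alpha` (sketch).
# Key naming here is illustrative and may differ from real checkpoints.
import torch

def merge_lora_state_dict(base_sd, lora_sd, alpha=1.0):
    merged = dict(base_sd)
    for key, A in lora_sd.items():
        if key.endswith("lora_A.weight"):
            target = key.replace(".lora_A.weight", ".weight")
            B = lora_sd[key.replace("lora_A", "lora_B")]
            merged[target] = merged[target] + alpha * (B @ A)
    return merged

# Toy usage with a single linear layer named "proj":
base = {"proj.weight": torch.randn(64, 64)}
lora = {"proj.lora_A.weight": torch.randn(8, 64),
        "proj.lora_B.weight": torch.randn(64, 8)}
print(merge_lora_state_dict(base, lora, alpha=0.7)["proj.weight"].shape)
```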
### Advanced Usage Examples
**Partial Strength Merging:**
```bash
python tools/extract/lora_merger.py \
--source-model /path/to/base/model \
--lora-model /path/to/lora.safetensors \
--output /path/to/merged_model.safetensors \
--alpha 0.7 \
--output-dtype fp32
```
**Multi-Format Support:**
```bash
python tools/extract/lora_merger.py \
--source-model /path/to/base/model.pt \
--source-type pytorch \
--lora-model /path/to/lora.safetensors \
--lora-type safetensors \
--output /path/to/merged_model.safetensors \
--output-format safetensors \
--alpha 1.0
```
# Model Structure Introduction
## 📖 Overview
This document introduces the model directory structure of the Lightx2v project, helping users correctly organize model files for a convenient user experience. Through proper directory organization, users can enjoy the convenience of "one-click startup" without manually configuring complex path parameters.
## 🗂️ Model Directory Structure
### Lightx2v Official Model List
View all available models: [Lightx2v Official Model Repository](https://huggingface.co/lightx2v)
### Standard Directory Structure
Using `Wan2.1-I2V-14B-480P-Lightx2v` as an example:
```
Model Root Directory/
├── Wan2.1-I2V-14B-480P-Lightx2v/
│ ├── config.json # Model configuration file
│ ├── Wan2.1_VAE.pth # VAE variational autoencoder
│ ├── models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth # CLIP visual encoder (FP16)
│ ├── models_t5_umt5-xxl-enc-bf16.pth # T5 text encoder (BF16)
│ ├── taew2_1.pth # Lightweight VAE (optional)
│ ├── fp8/ # FP8 quantized version (DIT/T5/CLIP)
│ ├── int8/ # INT8 quantized version (DIT/T5/CLIP)
│ ├── original/ # Original precision version (DIT)
│ ├── xlm-roberta-large/
│ └── google/
```
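Before launching, a quick script can confirm that a model directory roughly matches this layout. This is a sketch; the file names follow the `Wan2.1-I2V-14B-480P-Lightx2v` example above and should be adjusted for other models.
```python
# Check that a model directory roughly follows the recommended layout (sketch).
from pathlib import Path

EXPECTED_FILES = [
    "config.json",
    "Wan2.1_VAE.pth",
    "models_t5_umt5-xxl-enc-bf16.pth",
    "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
]
OPTIONAL_DIRS = ["fp8", "int8", "original"]

def check_model_dir(root):
    root = Path(root)
    for name in EXPECTED_FILES:
        print(f"{name}: {'ok' if (root / name).is_file() else 'MISSING'}")
    for d in OPTIONAL_DIRS:
        print(f"{d}/: {'present' if (root / d).is_dir() else 'absent (optional)'}")

check_model_dir("/path/to/Wan2.1-I2V-14B-480P-Lightx2v")
```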
### 💾 Storage Recommendations
**Strongly recommend storing model files on SSD solid-state drives** to significantly improve model loading speed and inference performance.
**Recommended storage paths**:
```bash
/mnt/ssd/models/ # Independent SSD mount point
/data/ssd/models/ # Data SSD directory
/opt/models/ # System optimization directory
```
### Quantized Version Directories
Each model contains multiple quantized versions for different hardware configurations:
```
Model Directory/
├── fp8/ # FP8 quantized version (H100/A100 high-end GPUs)
├── int8/ # INT8 quantized version (general GPUs)
└── original/ # Original precision version (DIT)
```
**💡 Using Full Precision Models**: To use full precision models, simply copy the official weight files to the `original/` directory.
## 🚀 Usage Methods
### Gradio Interface Startup
When using the Gradio interface, simply specify the model root directory path:
```bash
# Image to Video (I2V)
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-I2V-14B-480P-Lightx2v \
--model_size 14b \
--task i2v
# Text to Video (T2V)
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-T2V-14B-Lightx2v \
--model_size 14b \
--task t2v
```
### Configuration File Startup
When starting with configuration files, such as [configuration file](https://github.com/ModelTC/LightX2V/tree/main/configs/offload/disk/wan_i2v_phase_lazy_load_480p.json), the following path configurations can be omitted:
- `dit_quantized_ckpt`: No need to specify, code will automatically search in the model directory
- `tiny_vae_path`: No need to specify, code will automatically search in the model directory
- `clip_quantized_ckpt`: No need to specify, code will automatically search in the model directory
- `t5_quantized_ckpt`: No need to specify, code will automatically search in the model directory
**💡 Simplified Configuration**: After organizing model files according to the recommended directory structure, most path configurations can be omitted as the code will handle them automatically.
### Manual Download
1. Visit the [Hugging Face Model Page](https://huggingface.co/lightx2v)
2. Select the required model version
3. Download all files to the corresponding directory
**💡 Download Recommendations**: It is recommended to use SSD storage and ensure stable network connection. For large files, you can use `git lfs` or download tools such as `aria2c`.
## 💡 Best Practices
- **Use SSD Storage**: Significantly improve model loading speed and inference performance
- **Unified Directory Structure**: Facilitate management and switching between different model versions
- **Reserve Sufficient Space**: Ensure adequate storage space (recommended at least 200GB)
- **Regular Cleanup**: Delete unnecessary model versions to save space
- **Network Optimization**: Use stable network connections and download tools
## 🚨 Common Issues
### Q: Model files are too large and download is slow?
A: Use a regional mirror source, a download tool such as `aria2c`, or a cloud storage service
### Q: Model path not found when starting?
A: Check if the model has been downloaded correctly and verify the path configuration
### Q: How to switch between different model versions?
A: Modify the model path parameter in the startup command, supports running multiple model instances simultaneously
### Q: Model loading is very slow?
A: Ensure models are stored on SSD, enable lazy loading, and use quantized version models
### Q: How to set paths in configuration files?
A: After organizing according to the recommended directory structure, most path configurations can be omitted as the code will handle them automatically
## 📚 Related Links
- [Lightx2v Official Model Repository](https://huggingface.co/lightx2v)
- [Gradio Deployment Guide](./deploy_gradio.md)
---
Through proper model file organization, users can enjoy the convenience of "one-click startup" without manually configuring complex path parameters. It is recommended to organize model files according to the structure recommended in this document and fully utilize the advantages of SSD storage.
# Benchmark
For a better view of the generated videos and detailed performance comparisons, see this [🔗 page](https://github.com/ModelTC/LightX2V/blob/main/docs/EN/source/getting_started/benchmark_source.md), which provides a richer presentation and the corresponding documentation.
# Benchmark
---
## H200 (~140GB VRAM)
**Software Environment:**
- Python 3.11
- PyTorch 2.7.1+cu128
- SageAttention 2.2.0
- vLLM 0.9.2
- sgl-kernel 0.1.8
### 480P 5s Video
**Test Configuration:**
- **Model**: [Wan2.1-I2V-14B-480P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-Lightx2v)
- **Parameters**: infer_steps=40, seed=42, enable_cfg=True
#### Performance Comparison
| Configuration | Model Load Time(s) | Inference Time(s) | GPU Memory(GB) | Speedup | Video Effect |
|:-------------|:------------------:|:-----------------:|:--------------:|:-------:|:------------:|
| Wan2.1 Official(baseline) | 68.26 | 366.04 | 71 | 1.0x | <video src="PATH_TO_BASELINE_480P_VIDEO" width="200px"></video> |
| **LightX2V_1** | 37.28 | 249.54 | 53 | **1.47x** | <video src="PATH_TO_LIGHTX2V_1_480P_VIDEO" width="200px"></video> |
| **LightX2V_2** | 37.24 | 216.16 | 50 | **1.69x** | <video src="PATH_TO_LIGHTX2V_2_480P_VIDEO" width="200px"></video> |
| **LightX2V_3** | 23.62 | 190.73 | 35 | **1.92x** | <video src="PATH_TO_LIGHTX2V_3_480P_VIDEO" width="200px"></video> |
| **LightX2V_4** | 23.62 | 107.19 | 35 | **3.41x** | <video src="PATH_TO_LIGHTX2V_4_480P_VIDEO" width="200px"></video> |
| **LightX2V_4-Distill** | 23.62 | 107.19 | 35 | **3.41x** | <video src="PATH_TO_LIGHTX2V_4_DISTILL_480P_VIDEO" width="200px"></video> |
### 720P 5s Video
**Test Configuration:**
- **Model**: [Wan2.1-I2V-14B-720P-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-Lightx2v)
- **Parameters**: infer_steps=40, seed=42, enable_cfg=True
*Coming soon...*
---
## RTX 4090 (~24GB VRAM)
### 480P 5s Video
*Coming soon...*
### 720P 5s Video
*Coming soon...*
---
## Table Descriptions
- **Wan2.1 Official(baseline)**: Baseline implementation based on [Wan2.1 official repository](https://github.com/Wan-Video/Wan2.1)
- **LightX2V_1**: Uses SageAttention2 to replace native attention mechanism with DIT BF16+FP32 mixed precision (sensitive layers), improving computational efficiency while maintaining precision
- **LightX2V_2**: Unified BF16 precision computation to further reduce memory usage and computational overhead while maintaining generation quality
- **LightX2V_3**: Quantization optimization introducing FP8 quantization technology to significantly reduce computational precision requirements, combined with Tiling VAE technology to optimize memory usage
- **LightX2V_4**: Ultimate optimization adding TeaCache (teacache_thresh=0.2) caching reuse technology on top of LightX2V_3 to achieve maximum acceleration by intelligently skipping redundant computations
- **LightX2V_4-Distill**: Building on LightX2V_4 with 4-step distilled model ([Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v))
......@@ -18,9 +18,8 @@ git clone https://github.com/ModelTC/lightx2v.git lightx2v && cd lightx2v
conda create -n lightx2v python=3.11 && conda activate lightx2v
pip install -r requirements.txt
# Install again separately to bypass the version conflict check
# The Hunyuan model needs to run under this version of transformers. If you do not need to run the Hunyuan model, you can ignore this step.
pip install transformers==4.45.2
# pip install transformers==4.45.2
# install flash-attention 2
git clone https://github.com/Dao-AILab/flash-attention.git --recursive
......@@ -34,7 +33,7 @@ cd flash-attention/hopper && python setup.py install
```shell
# Modify the path in the script
bash scripts/run_wan_t2v.sh
bash scripts/wan/run_wan_t2v.sh
```
In addition to the existing input arguments in the script, there are also some necessary parameters in the `${lightx2v_path}/configs/wan_t2v.json` file specified by `--config_json`. You can modify them as needed.
In addition to the existing input arguments in the script, there are also some necessary parameters in the `wan_t2v.json` file specified by `--config_json`. You can modify them as needed.
......@@ -2,17 +2,32 @@ Welcome to Lightx2v!
==================
.. figure:: ../../../assets/img_lightx2v.png
:width: 100%
:width: 80%
:align: center
:alt: Lightx2v
:class: no-scaled-link
.. raw:: html
<p style="text-align:center">
<strong>A Light Video Generation Inference Framework
</strong>
<div align="center" style="font-family: charter;">
<a href="https://opensource.org/licenses/Apache-2.0"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License"></a>
<a href="https://deepwiki.com/ModelTC/lightx2v"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
<a href="https://lightx2v-en.readthedocs.io/en/latest"><img src="https://img.shields.io/badge/docs-English-99cc2" alt="Doc"></a>
<a href="https://lightx2v-zhcn.readthedocs.io/zh-cn/latest"><img src="https://img.shields.io/badge/文档-中文-99cc2" alt="Doc"></a>
<a href="https://hub.docker.com/r/lightx2v/lightx2v/tags"><img src="https://badgen.net/badge/icon/docker?icon=docker&label" alt="Docker"></a>
</div>
<div align="center" style="font-family: charter;">
<strong>LightX2V: Light Video Generation Inference Framework</strong>
</div>
LightX2V is a lightweight video generation inference framework designed to provide an inference tool that leverages multiple advanced video generation inference techniques. As a unified inference platform, this framework supports various generation tasks such as text-to-video (T2V) and image-to-video (I2V) across different models. X2V means transforming different input modalities (such as text or images) to video output.
GitHub: https://github.com/ModelTC/lightx2v
HuggingFace: https://huggingface.co/lightx2v
Documentation
-------------
......@@ -22,6 +37,7 @@ Documentation
:caption: Quick Start
Quick Start <getting_started/quickstart.md>
Benchmark <getting_started/benchmark.md>
.. toctree::
:maxdepth: 1
......@@ -30,8 +46,9 @@ Documentation
Model Quantization <method_tutorials/quantization.md>
Feature Caching <method_tutorials/cache.md>
Attention Module <method_tutorials/attention.md>
Offloading <method_tutorials/offload.md>
Offload <method_tutorials/offload.md>
Parallel Inference <method_tutorials/parallel.md>
Changing Resolution Inference <method_tutorials/changing_resolution.md>
Step Distill <method_tutorials/step_distill.md>
Autoregressive Distill <method_tutorials/autoregressive_distill.md>
......@@ -39,6 +56,7 @@ Documentation
:maxdepth: 1
:caption: Deployment Guides
Model Structure <deploy_guides/model_structure.md>
Low Latency Deployment <deploy_guides/for_low_latency.md>
Low Resource Deployment <deploy_guides/for_low_resource.md>
Lora Deployment <deploy_guides/lora_deploy.md>
......@@ -46,10 +64,3 @@ Documentation
Gradio Deployment <deploy_guides/deploy_gradio.md>
ComfyUI Deployment <deploy_guides/deploy_comfyui.md>
Local Windows Deployment <deploy_guides/deploy_local_windows.md>
.. Indices and tables
.. ==================
.. * :ref:`genindex`
.. * :ref:`modindex`
# Attention Mechanisms
The DiT model in `LightX2V` currently uses three types of attention mechanisms. Each type of attention can be configured with a specific backend library.
---
## Attention Usage Locations
1. **Self-Attention on the image**
- Configuration key: `self_attn_1_type`
2. **Cross-Attention between image and prompt text**
- Configuration key: `cross_attn_1_type`
3. **Cross-Attention between image and reference image (in I2V mode)**
- Configuration key: `cross_attn_2_type`
---
## Supported Attention Backends
| Name | Type Identifier | GitHub Link |
|--------------------|-------------------|-------------|
| Flash Attention 2 | `flash_attn2` | [flash-attention v2](https://github.com/Dao-AILab/flash-attention) |
| Flash Attention 3 | `flash_attn3` | [flash-attention v3](https://github.com/Dao-AILab/flash-attention) |
| Sage Attention 2 | `sage_attn2` | [SageAttention](https://github.com/thu-ml/SageAttention) |
| Radial Attention | `radial_attn` | [Radial Attention](https://github.com/mit-han-lab/radial-attention) |
| Sparge Attention | `sparge_ckpt` | [Sparge Attention](https://github.com/thu-ml/SpargeAttn) |
---
## Configuration Examples
The configuration files for attention mechanisms are located [here](https://github.com/ModelTC/lightx2v/tree/main/configs/attentions).
By specifying `--config_json` to point at a specific config file, you can test different attention mechanisms. For example, for `radial_attn` the configuration is as follows:
```json
{
    ...
}
```
To use other attention backends, simply replace the values with the corresponding type identifiers from the table above.
> Tip: Due to the limitations of its sparse algorithm, `radial_attn` can only be used for self-attention.
---
For Sparge Attention (see the `wan_t2v_sparge.json` configuration file), a post-trained weight path (`sparge_ckpt`) must be provided:
```json
{
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3"
"sparge": true,
"sparge_ckpt": "/path/to/sparge_wan2.1_t2v_1.3B.pt"
}
```
---
For further customization of attention mechanism behavior, please refer to the official documentation or implementation code of each attention library.
# Autoregressive Distillation
Autoregressive distillation is a technical exploration in LightX2V. By training distilled models, it reduces inference steps from the original 40-50 steps to **8 steps**, achieving inference acceleration while enabling infinite-length video generation through KV Cache technology.
> ⚠️ Warning: Currently, autoregressive distillation has mediocre effects and the acceleration improvement has not met expectations, but it can serve as a long-term research project. Currently, LightX2V only supports autoregressive models for T2V.
## 🔍 Technical Principle
Autoregressive distillation is implemented through [CausVid](https://github.com/tianweiy/CausVid) technology. CausVid performs step distillation and CFG distillation on 1.3B autoregressive models. LightX2V extends it with a series of enhancements:
1. **Larger Models**: Supports autoregressive distillation training for 14B models;
2. **More Complete Data Processing Pipeline**: Generates a training dataset of 50,000 prompt-video pairs;
For detailed implementation, refer to [CausVid-Plus](https://github.com/GoatWu/CausVid-Plus).
## 🛠️ Configuration Files
### Configuration File
Configuration options are provided in the [configs/causvid/](https://github.com/ModelTC/lightx2v/tree/main/configs/causvid) directory:
| Configuration File | Model Address |
|-------------------|---------------|
| [wan_t2v_causvid.json](https://github.com/ModelTC/lightx2v/blob/main/configs/causvid/wan_t2v_causvid.json) | https://huggingface.co/lightx2v/Wan2.1-T2V-14B-CausVid |
### Key Configuration Parameters
```json
{
"enable_cfg": false, // Disable CFG for speed improvement
"num_fragments": 3, // Number of video segments generated at once, 5s each
"num_frames": 21, // Frames per video segment, modify with caution!
"num_frame_per_block": 3, // Frames per autoregressive block, modify with caution!
"num_blocks": 7, // Autoregressive blocks per video segment, modify with caution!
"frame_seq_length": 1560, // Encoding length per frame, modify with caution!
"denoising_step_list": [ // Denoising timestep list
999, 934, 862, 756, 603, 410, 250, 140, 74
]
}
```
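These frame-related values are coupled: each segment appears to be generated as `num_blocks` autoregressive blocks of `num_frame_per_block` frames, which the defaults above satisfy. A small sanity-check sketch based on the values shown (the exact constraint enforced by the code may be stricter):
```python
# Sanity check (sketch) for the coupled frame parameters above.
num_frames = 21            # frames per video segment
num_frame_per_block = 3    # frames per autoregressive block
num_blocks = 7             # autoregressive blocks per segment

assert num_frames == num_blocks * num_frame_per_block, \
    "num_frames should equal num_blocks * num_frame_per_block"
print("frame/block configuration is consistent")
```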
## 📜 Usage
### Model Preparation
Place the downloaded model (`causal_model.pt` or `causal_model.safetensors`) in the `causvid_models/` folder under the Wan model root directory:
- For T2V: `Wan2.1-T2V-14B/causvid_models/`
### Inference Script
```bash
bash scripts/wan/run_wan_t2v_causvid.sh
```