logger.info(f"Found Hugging Face model files in: {path}")
return path
raise FileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
logger.info(f"Found PyTorch model checkpoint: {path}")
return path
raise FileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
# Device doesn't support fp8, return None (use default precision)
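# A hedged sketch (illustrative, not necessarily this file's helper) of how such a
# capability check is commonly written: native FP8 kernels generally require an NVIDIA
# GPU with compute capability >= 8.9 (Ada) or 9.0 (Hopper); otherwise return None so the
# caller falls back to the default precision, as the comment above describes.
import torch

def detect_fp8_quant_scheme():
    if torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9):
        return "fp8"
    return None  # device doesn't support fp8, use default precision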
info="💡 **Automatically optimize GPU settings to match the current resolution. After changing the resolution, please re-check this option to prevent potential performance degradation or runtime errors.**",
info="Enable classifier-free guidance to control prompt strength",
)
cfg_scale=gr.Slider(
label="CFG Scale Factor",
minimum=1,
maximum=10,
step=1,
value=5,
info="Controls the influence strength of the prompt. Higher values give more influence to the prompt.",
visible=False,  # Hidden, not exposed to frontend
)
with gr.Row():
sample_shift=gr.Slider(
label="Distribution Shift",
value=5,
...
...
@@ -1004,7 +1226,56 @@ def main():
step=1,
info="Controls the degree of distribution shift for samples. Larger values indicate more significant shifts.",
)
cfg_scale=gr.Slider(
label="CFG Scale Factor",
minimum=1,
maximum=10,
step=1,
value=default_cfg_scale,
info="Controls the influence strength of the prompt. Higher values give more influence to the prompt. When value is 1, CFG is automatically disabled.",
)
# Update enable_cfg based on cfg_scale
def update_enable_cfg(cfg_scale_val):
    """Automatically set enable_cfg based on cfg_scale value"""
    if cfg_scale_val == 1:
        return gr.update(value=False)
    else:
        return gr.update(value=True)
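# A hedged sketch of the wiring (illustrative; `enable_cfg` is assumed to be the checkbox
# created with the "Enable classifier-free guidance" info above): keep the hidden checkbox
# in sync whenever the slider value changes.
cfg_scale.change(
    fn=update_enable_cfg,
    inputs=[cfg_scale],
    outputs=[enable_cfg],
)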
# Dynamically update CFG scale factor and enable_cfg when model path changes
info="When enabled, processes rotary position embeddings in chunks to save GPU memory.",
)
rotary_chunk_size=gr.Slider(
label="Rotary Embedding Chunk Size",
value=100,
minimum=100,
maximum=10000,
step=100,
info="Controls the chunk size for applying rotary embeddings. Larger values may improve performance but increase memory usage. Only effective if 'rotary_chunk' is checked.",
)
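# A hedged sketch of what chunked rotary embedding application means (illustrative, not
# this file's kernel): rotate `chunk_size` tokens at a time so the complex-valued
# intermediates never cover the full sequence at once.
import torch

def apply_rotary_chunked(x: torch.Tensor, freqs: torch.Tensor, chunk_size: int = 100) -> torch.Tensor:
    # x: (seq_len, heads, head_dim) real tensor; freqs: (seq_len, head_dim // 2) complex tensor
    out = torch.empty_like(x)
    for start in range(0, x.shape[0], chunk_size):
        end = min(start + chunk_size, x.shape[0])
        chunk = torch.view_as_complex(x[start:end].float().reshape(end - start, x.shape[1], -1, 2))
        out[start:end] = torch.view_as_real(chunk * freqs[start:end, None, :]).flatten(2).type_as(x)
    return out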
unload_modules=gr.Checkbox(
label="Unload Modules",
value=False,
info="Unload modules (T5, CLIP, DIT, etc.) after inference to reduce GPU/CPU memory usage",
)
clean_cuda_cache=gr.Checkbox(
label="Clean CUDA Memory Cache",
value=False,
info="When enabled, frees up GPU memory promptly but slows down inference.",
)
gr.Markdown("### Asynchronous Offloading")
with gr.Row():
cpu_offload=gr.Checkbox(
label="CPU Offloading",
value=False,
info="Offload parts of the model computation from GPU to CPU to reduce GPU memory usage",
)
lazy_load=gr.Checkbox(
label="Enable Lazy Loading",
value=False,
info="Lazy load model components during inference. Requires CPU loading and DIT quantization.",
)
offload_granularity=gr.Dropdown(
label="Dit Offload Granularity",
choices=["block","phase"],
value="phase",
info="Sets Dit model offloading granularity: blocks or computational phases",
)
offload_ratio=gr.Slider(
label="Offload ratio for Dit model",
minimum=0.0,
maximum=1.0,
step=0.1,
value=1.0,
info="Controls how much of the Dit model is offloaded to the CPU",
)
t5_cpu_offload=gr.Checkbox(
label="T5 CPU Offloading",
value=False,
info="Offload the T5 Encoder model to CPU to reduce GPU memory usage",
)
t5_offload_granularity=gr.Dropdown(
label="T5 Encoder Offload Granularity",
choices=["model","block"],
value="model",
info="Controls the granularity when offloading the T5 Encoder model to CPU",
)
gr.Markdown("### Low-Precision Quantization")
with gr.Row():
torch_compile=gr.Checkbox(
label="Torch Compile",
value=False,
info="Use torch.compile to accelerate the inference process",
)
attention_type=gr.Dropdown(
label="Attention Operator",
choices=[op[1] for op in attn_op_choices],
value=attn_op_choices[0][1],
info="Use appropriate attention operators to accelerate inference",
)
quant_op=gr.Dropdown(
label="Quantization Matmul Operator",
choices=[op[1] for op in quant_op_choices],
value=quant_op_choices[0][1],
info="Select the quantization matrix multiplication operator to accelerate inference",
info="Quantization precision for the T5 Encoder model",
)
clip_quant_scheme=gr.Dropdown(
label="Clip Encoder",
choices=quant_options["clip_choices"],
value=quant_options["clip_default"],
info="Quantization precision for the Clip Encoder",
)
precision_mode=gr.Dropdown(
label="Precision Mode for Sensitive Layers",
choices=["fp32","bf16"],
value="fp32",
info="Select the numerical precision for critical model components like normalization and embedding layers. FP32 offers higher accuracy, while BF16 improves performance on compatible hardware.",
show_label=False,
)
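# A hedged sketch of what this option typically controls (illustrative, not the project's
# exact code): cast the bulk of the model to bf16 while keeping numerically sensitive
# modules such as normalization and embedding layers in fp32.
import torch
import torch.nn as nn

def cast_keep_sensitive_fp32(model: nn.Module, dtype: torch.dtype = torch.bfloat16) -> nn.Module:
    model.to(dtype)
    for module in model.modules():
        if isinstance(module, (nn.LayerNorm, nn.Embedding)):
            module.to(torch.float32)  # sensitive layers stay in full precision
    return model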
gr.Markdown("### Variational Autoencoder (VAE)")
with gr.Row():
use_tae=gr.Checkbox(
label="Use Tiny VAE",
value=False,
info="Use a lightweight VAE model to accelerate the decoding process",
)
use_tiling_vae=gr.Checkbox(
label="VAE Tiling Inference",
value=False,
info="Use VAE tiling inference to reduce GPU memory usage",
)
gr.Markdown("### Feature Caching")
with gr.Row():
enable_teacache=gr.Checkbox(
label="Tea Cache",
value=False,
info="Cache features during inference to reduce the number of inference steps",
)
teacache_thresh=gr.Slider(
label="Tea Cache Threshold",
value=0.26,
minimum=0,
maximum=1,
info="Higher acceleration may result in lower quality —— Setting to 0.1 provides ~2.0x acceleration, setting to 0.2 provides ~3.0x acceleration",
)
use_ret_steps=gr.Checkbox(
label="Cache Only Key Steps",
value=False,
info="When checked, cache is written only at key steps where the scheduler returns results; when unchecked, cache is written at all steps to ensure the highest quality",
help="Model class to use (wan2.1: standard model, wan2.1_distill: distilled model for faster inference)",
)
parser.add_argument("--model_size",type=str,required=True,choices=["14b","1.3b"],help="Model type to use")
parser.add_argument("--task",type=str,required=True,choices=["i2v","t2v"],help="Specify the task type. 'i2v' for image-to-video translation, 't2v' for text-to-video generation.")
logger.info(f"Found Hugging Face model files in: {path}")
returnpath
raiseFileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
logger.info(f"Found PyTorch model checkpoint: {path}")
returnpath
raiseFileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
- ✅ [q8-kernel](https://github.com/KONAKONA666/q8_kernels) (only supports Ada architecture GPUs)
Install each operator as needed by following the tutorial on its project homepage.
### 🤖 Supported Models
### 📥 Model Download
#### 🎬 Image-to-Video Models
Refer to the [Model Structure Documentation](../getting_started/model_structure.md) to download complete models (including quantized and non-quantized versions) or download only quantized/non-quantized versions.
| Model Name | Resolution | Parameters | Features | Recommended Use |
|------------|------------|------------|----------|-----------------|
| ✅ [Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v](https://huggingface.co/lightx2v/Wan2.1-I2V-14B-720P-StepDistill-CfgDistill-Lightx2v) | 720p | 14B | HD distilled version | High quality + fast inference |
#### 📝 Text-to-Video Models
| Model Name | Parameters | Features | Recommended Use |
|------------|------------|----------|-----------------|
- **Resource-constrained**: Prioritize distilled versions and lower resolutions
- **Real-time applications**: Strongly recommend using distilled models (`wan2.1_distill`)
**🎯 Model Category Description**:
- **`wan2.1`**: Standard model; provides the best video generation quality and suits scenarios with extremely high quality requirements
- **`wan2.1_distill`**: Distilled model; knowledge distillation significantly speeds up inference while maintaining good quality, making it suitable for most application scenarios
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download complete models (including quantized and non-quantized versions) or download only quantized/non-quantized versions.
**Download Options**:
#### wan2.1 Model Directory Structure

```
models/
├── wan2.1_i2v_720p_lightx2v_4step.safetensors # Original precision
├── t5/clip/xlm-roberta-large/google # text and image encoder
├── vae/lightvae/lighttae # vae
└── config.json # Model configuration file
```
- **Complete Model**: When downloading the complete model (with both quantized and non-quantized versions), you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` Web frontend.
#### wan2.2 Model Directory Structure
- **Non-quantized Version Only**: If you download only the non-quantized version, the quantization precision for `DIT/T5/CLIP` in the `Gradio` Web frontend can only be set to bf16/fp16. To use quantized models, manually download the quantized weights into the `i2v_model_path` or `t2v_model_path` directory used when starting Gradio.
```
models/
├── wan2.2_i2v_A14b_high_noise_lightx2v_4step_1030.safetensors # high noise original precision
├── wan2.2_i2v_A14b_high_noise_fp8_e4m3_lightx2v_4step_1030.safetensors # high noise FP8 quantization
├── wan2.2_i2v_A14b_high_noise_int8_lightx2v_4step_1030.safetensors # high noise INT8 quantization
├── wan2.2_i2v_A14b_high_noise_int8_lightx2v_4step_1030_split # high noise INT8 quantization block storage directory
├── wan2.2_i2v_A14b_low_noise_lightx2v_4step.safetensors # low noise original precision
├── t5/clip/xlm-roberta-large/google # text and image encoder
├── vae/lightvae/lighttae # vae
└── config.json # Model configuration file
```
- **Quantized Version Only**: If you download only the quantized version, the quantization precision for `DIT/T5/CLIP` in the `Gradio` Web frontend can only be set to fp8 or int8 (depending on the weights you downloaded). To use non-quantized models, manually download the non-quantized weights into the `i2v_model_path` or `t2v_model_path` directory used when starting Gradio.
**📝 Download Instructions**:
- **Note**: Whether you download the complete model or only part of it, the `i2v_model_path` and `t2v_model_path` parameters should point to the first-level directory, e.g. `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8`.
- Model weights can be downloaded from HuggingFace:
  - Text and Image Encoders can be downloaded from [Encoders](https://huggingface.co/lightx2v/Encoderss)
  - VAE can be downloaded from [Autoencoders](https://huggingface.co/lightx2v/Autoencoders)
- `xxx_split` directories (e.g., `wan2.1_i2v_720p_scaled_fp8_e4m3_lightx2v_4step_split`) store the weights as multiple per-block safetensors files and are intended for devices with limited memory (for example, 16GB or less); download them according to your hardware. A minimal loading sketch is shown below.
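For illustration, a split directory can be consumed by loading each per-block shard and merging the results into one state dict. This is a minimal sketch assuming the `safetensors` package, not the project's actual loader:

```python
import glob
import os

from safetensors.torch import load_file

def load_split_weights(split_dir: str) -> dict:
    """Merge all per-block .safetensors shards in an xxx_split directory."""
    state_dict = {}
    for shard in sorted(glob.glob(os.path.join(split_dir, "*.safetensors"))):
        state_dict.update(load_file(shard))  # each shard holds a subset of the keys
    return state_dict
```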
| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `--model_path` | str | ✅ | - | Model root directory path (directory containing all model files) |
| `--server_port` | int | ❌ | 7862 | Server port |
| `--server_name` | str | ❌ | 0.0.0.0 | Server IP address |
| `--output_dir` | str | ❌ | ./outputs | Output video save directory |
**💡 Note**: Model type (wan2.1/wan2.2), task type (i2v/t2v), and specific model file selection are all configured in the Web interface.
## 🎯 Features
### Basic Settings
### Model Configuration
- **Model Type**: Supports the wan2.1 and wan2.2 model architectures
- **Task Type**: Supports Image-to-Video (i2v) and Text-to-Video (t2v) generation modes
- **Model Selection**: The frontend automatically identifies and filters available model files and detects quantization precision automatically
- **Encoder Configuration**: Supports selection of the T5 text encoder, CLIP image encoder, and VAE decoder
- **Operator Selection**: Supports multiple attention and quantized matmul operators; the system automatically sorts them by installation status
### Input Parameters
- **Prompt**: Describe the expected video content
- **Negative Prompt**: Specify elements you don't want to appear
- **Input Image**: Upload input image required in i2v mode
- **Key Step Caching**: Writes cache only at key steps
## 🔧 Auto-Configuration Feature

The system automatically configures optimal inference options based on your hardware configuration (GPU VRAM and CPU memory) without manual adjustment. The best configuration is applied automatically on startup, including:
- **GPU Memory Optimization**: Automatically enables CPU offloading, VAE tiling inference, etc. based on VRAM size
- **CPU Memory Optimization**: Automatically enables lazy loading, module unloading, etc. based on system memory
- **Operator Selection**: Automatically selects the best installed operators (sorted by priority)
- **Quantization Configuration**: Automatically detects and applies quantization precision based on model file names (see the sketch below)
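As an illustration of the filename-based detection step, a minimal sketch (function name and fallback value are assumptions, not the project's actual logic) might look like this:

```python
def detect_quant_scheme(weight_filename: str) -> str:
    """Infer quantization precision from tags embedded in the weight file name."""
    name = weight_filename.lower()
    if "fp8" in name:
        return "fp8"
    if "int8" in name:
        return "int8"
    return "bf16"  # no quantization tag found: treat as original-precision weights
```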
After enabling "Auto-configure Inference Options", the system will automatically optimize parameters based on your hardware configuration:
### GPU Memory Rules
- **80GB+**: Default configuration, no optimization needed
- **48GB**: Enable CPU offloading, offload ratio 50%
- **40GB**: Enable CPU offloading, offload ratio 80%
- **32GB**: Enable CPU offloading, offload ratio 100% (see the sketch below)
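As a sketch of how these thresholds might be applied (names are illustrative, and the behavior between the listed tiers is an assumption), the detected VRAM size can be mapped to offload settings like this:

```python
import torch

def auto_offload_settings() -> dict:
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    if vram_gb >= 80:
        return {"cpu_offload": False, "offload_ratio": 0.0}  # default configuration
    if vram_gb >= 48:
        return {"cpu_offload": True, "offload_ratio": 0.5}   # offload 50%
    if vram_gb >= 40:
        return {"cpu_offload": True, "offload_ratio": 0.8}   # offload 80%
    return {"cpu_offload": True, "offload_ratio": 1.0}       # 32GB and below: offload 100%
```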
**💡 Tip**: Generally, after enabling "Auto-configure Inference Options", the system will automatically optimize parameter settings based on your hardware configuration, and performance issues usually won't occur. If you encounter problems, please refer to the following solutions:
@@ -178,7 +178,7 @@ class WanModel(CompiledMethodsMixin):
if os.path.exists(non_block_file):
safetensors_files = [non_block_file]
else:
raiseValueError(f"Non-block file not found in {safetensors_path}")
raiseValueError(f"Non-block file not found in {safetensors_path}. Please check the model path. Lazy load mode only supports loading chunked model weights.")
weight_dict = {}
for file_path in safetensors_files:
...
...
@@ -221,7 +221,7 @@ class WanModel(CompiledMethodsMixin):
if os.path.exists(non_block_file):
safetensors_files = [non_block_file]
else:
raiseValueError(f"Non-block file not found in {safetensors_path}, Please check the lazy load model path")
raiseValueError(f"Non-block file not found in {safetensors_path}. Please check the model path. Lazy load mode only supports loading chunked model weights.")