Commit a8fdaaee authored by gushiqiao's avatar gushiqiao Committed by GitHub

Update docs and gradio

Update docs and gradio
parents 39a4849a ef43445c
......@@ -11,6 +11,7 @@ from loguru import logger
import importlib.util
import psutil
import random
import glob
logger.add(
"inference_logs.log",
......@@ -24,6 +25,40 @@ logger.add(
MAX_NUMPY_SEED = 2**32 - 1
def find_hf_model_path(model_path, subdir=["original", "fp8", "int8"]):
paths_to_check = [model_path]
if isinstance(subdir, list):
for sub in subdir:
paths_to_check.append(os.path.join(model_path, sub))
else:
paths_to_check.append(os.path.join(model_path, subdir))
for path in paths_to_check:
safetensors_pattern = os.path.join(path, "*.safetensors")
safetensors_files = glob.glob(safetensors_pattern)
if safetensors_files:
logger.info(f"Found Hugging Face model files in: {path}")
return path
raise FileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def find_torch_model_path(model_path, filename=None, subdir=["original", "fp8", "int8"]):
paths_to_check = [
os.path.join(model_path, filename),
]
if isinstance(subdir, list):
for sub in subdir:
paths_to_check.append(os.path.join(model_path, sub, filename))
else:
paths_to_check.append(os.path.join(model_path, subdir, filename))
logger.debug(f"Checking candidate checkpoint paths: {paths_to_check}")
for path in paths_to_check:
if os.path.exists(path):
logger.info(f"Found PyTorch model checkpoint: {path}")
return path
raise FileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def generate_random_seed():
return random.randint(0, MAX_NUMPY_SEED)
......@@ -152,6 +187,50 @@ def is_ada_architecture_gpu():
return False
def get_quantization_options(model_path):
"""Get quantization options dynamically based on model_path"""
import os
# Check subdirectories
subdirs = ["original", "fp8", "int8"]
has_subdirs = {subdir: os.path.exists(os.path.join(model_path, subdir)) for subdir in subdirs}
# Check original files in root directory
t5_bf16_exists = os.path.exists(os.path.join(model_path, "models_t5_umt5-xxl-enc-bf16.pth"))
clip_fp16_exists = os.path.exists(os.path.join(model_path, "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"))
# Generate options
def get_choices(has_subdirs, original_type, fp8_type, int8_type, fallback_type, has_original_file=False):
choices = []
if has_subdirs["original"]:
choices.append(original_type)
if has_subdirs["fp8"]:
choices.append(fp8_type)
if has_subdirs["int8"]:
choices.append(int8_type)
# If no subdirectories but original file exists, add original type
if not choices and has_original_file:
choices.append(original_type)
# If no options at all, use default value
if not choices:
choices = [fallback_type]
return choices, choices[0]
# DIT options
dit_choices, dit_default = get_choices(has_subdirs, "bf16", "fp8", "int8", "bf16")
# T5 options - check if original file exists
t5_choices, t5_default = get_choices(has_subdirs, "bf16", "fp8", "int8", "bf16", t5_bf16_exists)
# CLIP options - check if original file exists
clip_choices, clip_default = get_choices(has_subdirs, "fp16", "fp8", "int8", "fp16", clip_fp16_exists)
return {"dit_choices": dit_choices, "dit_default": dit_default, "t5_choices": t5_choices, "t5_default": t5_default, "clip_choices": clip_choices, "clip_default": clip_default}
global_runner = None
current_config = None
cur_dit_quant_scheme = None
......@@ -222,6 +301,8 @@ def run_inference(
if os.path.exists(os.path.join(model_path, "config.json")):
with open(os.path.join(model_path, "config.json"), "r") as f:
model_config = json.load(f)
else:
model_config = {}
if task == "t2v":
if model_size == "1.3b":
......@@ -305,17 +386,23 @@ def run_inference(
is_dit_quant = dit_quant_scheme != "bf16"
is_t5_quant = t5_quant_scheme != "bf16"
if is_t5_quant:
t5_path = os.path.join(model_path, t5_quant_scheme)
t5_quant_ckpt = os.path.join(t5_path, f"models_t5_umt5-xxl-enc-{t5_quant_scheme}.pth")
t5_model_name = f"models_t5_umt5-xxl-enc-{t5_quant_scheme}.pth"
t5_quant_ckpt = find_torch_model_path(model_path, t5_model_name, t5_quant_scheme)
t5_original_ckpt = None
else:
t5_quant_ckpt = None
t5_model_name = "models_t5_umt5-xxl-enc-bf16.pth"
t5_original_ckpt = find_torch_model_path(model_path, t5_model_name, "original")
is_clip_quant = clip_quant_scheme != "fp16"
if is_clip_quant:
clip_path = os.path.join(model_path, clip_quant_scheme)
clip_quant_ckpt = os.path.join(clip_path, f"clip-{clip_quant_scheme}.pth")
clip_model_name = f"clip-{clip_quant_scheme}.pth"
clip_quant_ckpt = find_torch_model_path(model_path, clip_model_name, clip_quant_scheme)
clip_original_ckpt = None
else:
clip_quant_ckpt = None
clip_model_name = "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"
clip_original_ckpt = find_torch_model_path(model_path, clip_model_name, "original")
needs_reinit = (
lazy_load
......@@ -356,7 +443,7 @@ def run_inference(
t5_quant_scheme = f"{t5_quant_scheme}-q8f"
clip_quant_scheme = f"{clip_quant_scheme}-q8f"
dit_quantized_ckpt = os.path.join(model_path, dit_quant_scheme)
dit_quantized_ckpt = find_hf_model_path(model_path, dit_quant_scheme)
if os.path.exists(os.path.join(dit_quantized_ckpt, "config.json")):
with open(os.path.join(dit_quantized_ckpt, "config.json"), "r") as f:
quant_model_config = json.load(f)
......@@ -394,15 +481,18 @@ def run_inference(
"teacache_thresh": teacache_thresh,
"t5_cpu_offload": t5_cpu_offload,
"unload_modules": unload_modules,
"t5_original_ckpt": t5_original_ckpt,
"t5_quantized": is_t5_quant,
"t5_quantized_ckpt": t5_quant_ckpt,
"t5_quant_scheme": t5_quant_scheme,
"clip_original_ckpt": clip_original_ckpt,
"clip_quantized": is_clip_quant,
"clip_quantized_ckpt": clip_quant_ckpt,
"clip_quant_scheme": clip_quant_scheme,
"vae_path": find_torch_model_path(model_path, "Wan2.1_VAE.pth"),
"use_tiling_vae": use_tiling_vae,
"use_tiny_vae": use_tiny_vae,
"tiny_vae_path": (os.path.join(model_path, "taew2_1.pth") if use_tiny_vae else None),
"tiny_vae_path": (find_torch_model_path(model_path, "taew2_1.pth") if use_tiny_vae else None),
"lazy_load": lazy_load,
"do_mm_calib": False,
"parallel_attn_type": None,
......@@ -743,9 +833,6 @@ def auto_configure(enable_auto_config, resolution):
def main():
def toggle_image_input(task):
return gr.update(visible=(task == "i2v"))
with gr.Blocks(
title="Lightx2v (Lightweight Video Inference and Generation Engine)",
css="""
......@@ -814,7 +901,7 @@ def main():
lines=3,
placeholder="What you don't want to appear in the video...",
max_lines=5,
value="镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
value="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards",
)
with gr.Column():
resolution = gr.Dropdown(
......@@ -1045,22 +1132,25 @@ def main():
info="Select the quantization matrix multiplication operator to accelerate inference",
interactive=True,
)
# Get dynamic quantization options
quant_options = get_quantization_options(model_path)
dit_quant_scheme = gr.Dropdown(
label="Dit",
choices=["fp8", "int8", "bf16"],
value="bf16",
choices=quant_options["dit_choices"],
value=quant_options["dit_default"],
info="Quantization precision for the Dit model",
)
t5_quant_scheme = gr.Dropdown(
label="T5 Encoder",
choices=["fp8", "int8", "bf16"],
value="bf16",
choices=quant_options["t5_choices"],
value=quant_options["t5_default"],
info="Quantization precision for the T5 Encoder model",
)
clip_quant_scheme = gr.Dropdown(
label="Clip Encoder",
choices=["fp8", "int8", "fp16"],
value="fp16",
choices=quant_options["clip_choices"],
value=quant_options["clip_default"],
info="Quantization precision for the Clip Encoder",
)
precision_mode = gr.Dropdown(
......
......@@ -11,6 +11,7 @@ from loguru import logger
import importlib.util
import psutil
import random
import glob
logger.add(
"inference_logs.log",
......@@ -24,6 +25,40 @@ logger.add(
MAX_NUMPY_SEED = 2**32 - 1
def find_hf_model_path(model_path, subdir=["original", "fp8", "int8"]):
paths_to_check = [model_path]
if isinstance(subdir, list):
for sub in subdir:
paths_to_check.append(os.path.join(model_path, sub))
else:
paths_to_check.append(os.path.join(model_path, subdir))
for path in paths_to_check:
safetensors_pattern = os.path.join(path, "*.safetensors")
safetensors_files = glob.glob(safetensors_pattern)
if safetensors_files:
logger.info(f"Found Hugging Face model files in: {path}")
return path
raise FileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def find_torch_model_path(model_path, filename=None, subdir=["original", "fp8", "int8"]):
paths_to_check = [
os.path.join(model_path, filename),
]
if isinstance(subdir, list):
for sub in subdir:
paths_to_check.append(os.path.join(model_path, sub, filename))
else:
paths_to_check.append(os.path.join(model_path, subdir, filename))
logger.debug(f"Checking candidate checkpoint paths: {paths_to_check}")
for path in paths_to_check:
if os.path.exists(path):
logger.info(f"Found PyTorch model checkpoint: {path}")
return path
raise FileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def generate_random_seed():
return random.randint(0, MAX_NUMPY_SEED)
......@@ -154,6 +189,50 @@ def is_ada_architecture_gpu():
return False
def get_quantization_options(model_path):
"""根据model_path动态获取量化选项"""
import os
# Check subdirectories
subdirs = ["original", "fp8", "int8"]
has_subdirs = {subdir: os.path.exists(os.path.join(model_path, subdir)) for subdir in subdirs}
# Check original files in the root directory
t5_bf16_exists = os.path.exists(os.path.join(model_path, "models_t5_umt5-xxl-enc-bf16.pth"))
clip_fp16_exists = os.path.exists(os.path.join(model_path, "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"))
# Generate options
def get_choices(has_subdirs, original_type, fp8_type, int8_type, fallback_type, has_original_file=False):
choices = []
if has_subdirs["original"]:
choices.append(original_type)
if has_subdirs["fp8"]:
choices.append(fp8_type)
if has_subdirs["int8"]:
choices.append(int8_type)
# If no subdirectories exist but the original file does, add the original type
if not choices and has_original_file:
choices.append(original_type)
# If there are no options at all, fall back to the default
if not choices:
choices = [fallback_type]
return choices, choices[0]
# DIT options
dit_choices, dit_default = get_choices(has_subdirs, "bf16", "fp8", "int8", "bf16")
# T5 options - check whether the original file exists
t5_choices, t5_default = get_choices(has_subdirs, "bf16", "fp8", "int8", "bf16", t5_bf16_exists)
# CLIP options - check whether the original file exists
clip_choices, clip_default = get_choices(has_subdirs, "fp16", "fp8", "int8", "fp16", clip_fp16_exists)
return {"dit_choices": dit_choices, "dit_default": dit_default, "t5_choices": t5_choices, "t5_default": t5_default, "clip_choices": clip_choices, "clip_default": clip_default}
global_runner = None
current_config = None
cur_dit_quant_scheme = None
......@@ -224,6 +303,8 @@ def run_inference(
if os.path.exists(os.path.join(model_path, "config.json")):
with open(os.path.join(model_path, "config.json"), "r") as f:
model_config = json.load(f)
else:
model_config = {}
if task == "t2v":
if model_size == "1.3b":
......@@ -306,18 +387,26 @@ def run_inference(
is_dit_quant = dit_quant_scheme != "bf16"
is_t5_quant = t5_quant_scheme != "bf16"
if is_t5_quant:
t5_path = os.path.join(model_path, t5_quant_scheme)
t5_quant_ckpt = os.path.join(t5_path, f"models_t5_umt5-xxl-enc-{t5_quant_scheme}.pth")
t5_model_name = f"models_t5_umt5-xxl-enc-{t5_quant_scheme}.pth"
t5_quantized_ckpt = find_torch_model_path(model_path, t5_model_name, t5_quant_scheme)
t5_original_ckpt = None
else:
t5_quant_ckpt = None
t5_quantized_ckpt = None
t5_model_name = "models_t5_umt5-xxl-enc-bf16.pth"
t5_original_ckpt = find_torch_model_path(model_path, t5_model_name, "original")
is_clip_quant = clip_quant_scheme != "fp16"
if is_clip_quant:
clip_path = os.path.join(model_path, clip_quant_scheme)
clip_quant_ckpt = os.path.join(clip_path, f"clip-{clip_quant_scheme}.pth")
clip_model_name = f"clip-{t5_quant_scheme}.pth"
clip_quantized_ckpt = find_torch_model_path(model_path, clip_model_name, clip_quant_scheme)
clip_original_ckpt = None
else:
clip_quant_ckpt = None
clip_quantized_ckpt = None
clip_model_name = "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"
clip_original_ckpt = find_torch_model_path(model_path, clip_model_name, "original")
needs_reinit = (
lazy_load
......@@ -358,7 +447,7 @@ def run_inference(
t5_quant_scheme = f"{t5_quant_scheme}-q8f"
clip_quant_scheme = f"{clip_quant_scheme}-q8f"
dit_quantized_ckpt = os.path.join(model_path, dit_quant_scheme)
dit_quantized_ckpt = find_hf_model_path(model_path, dit_quant_scheme)
if os.path.exists(os.path.join(dit_quantized_ckpt, "config.json")):
with open(os.path.join(dit_quantized_ckpt, "config.json"), "r") as f:
quant_model_config = json.load(f)
......@@ -394,17 +483,20 @@ def run_inference(
"coefficients": coefficient[0] if use_ret_steps else coefficient[1],
"use_ret_steps": use_ret_steps,
"teacache_thresh": teacache_thresh,
"t5_original_ckpt": t5_original_ckpt,
"t5_cpu_offload": t5_cpu_offload,
"unload_modules": unload_modules,
"t5_quantized": is_t5_quant,
"t5_quantized_ckpt": t5_quant_ckpt,
"t5_quantized_ckpt": t5_quantized_ckpt,
"t5_quant_scheme": t5_quant_scheme,
"clip_original_ckpt": clip_original_ckpt,
"clip_quantized": is_clip_quant,
"clip_quantized_ckpt": clip_quant_ckpt,
"clip_quantized_ckpt": clip_quantized_ckpt,
"clip_quant_scheme": clip_quant_scheme,
"vae_path": find_torch_model_path(model_path, "Wan2.1_VAE.pth"),
"use_tiling_vae": use_tiling_vae,
"use_tiny_vae": use_tiny_vae,
"tiny_vae_path": (os.path.join(model_path, "taew2_1.pth") if use_tiny_vae else None),
"tiny_vae_path": (find_torch_model_path(model_path, "taew2_1.pth") if use_tiny_vae else None),
"lazy_load": lazy_load,
"do_mm_calib": False,
"parallel_attn_type": None,
......@@ -745,9 +837,6 @@ def auto_configure(enable_auto_config, resolution):
def main():
def toggle_image_input(task):
return gr.update(visible=(task == "i2v"))
with gr.Blocks(
title="Lightx2v (轻量级视频推理和生成引擎)",
css="""
......@@ -1045,22 +1134,25 @@ def main():
info="选择量化矩阵乘法算子以加速推理",
interactive=True,
)
# Get dynamic quantization options
quant_options = get_quantization_options(model_path)
dit_quant_scheme = gr.Dropdown(
label="Dit",
choices=["fp8", "int8", "bf16"],
value="bf16",
choices=quant_options["dit_choices"],
value=quant_options["dit_default"],
info="Dit模型的量化精度",
)
t5_quant_scheme = gr.Dropdown(
label="T5编码器",
choices=["fp8", "int8", "bf16"],
value="bf16",
choices=quant_options["t5_choices"],
value=quant_options["t5_default"],
info="T5编码器模型的量化精度",
)
clip_quant_scheme = gr.Dropdown(
label="Clip编码器",
choices=["fp8", "int8", "fp16"],
value="fp16",
choices=quant_options["clip_choices"],
value=quant_options["clip_default"],
info="Clip编码器的量化精度",
)
precision_mode = gr.Dropdown(
......
......@@ -14,11 +14,11 @@
# Lightx2v project root directory path
# Example: /home/user/lightx2v or /data/video_gen/lightx2v
lightx2v_path=/path/to/lightx2v
lightx2v_path=/data/video_gen/LightX2V
# Model path configuration
# Image-to-video model path (for i2v tasks)
# Example: /path/to/Wan2.1-I2V-14B-720P-Lightx2v
i2v_model_path=/path/to/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v/
i2v_model_path=/wan_0726/wan_test/fp8
# Text-to-video model path (for t2v tasks)
# Example: /path/to/Wan2.1-T2V-1.3B
......
......@@ -70,6 +70,20 @@ Install according to the project homepage tutorials for each operator as needed
- **`wan2.1`**: Standard model, provides the best video generation quality, suitable for scenarios with extremely high quality requirements
- **`wan2.1_distill`**: Distilled model, optimized through knowledge distillation technology, significantly improves inference speed, maintains good quality while greatly reducing computation time, suitable for most application scenarios
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download complete models (including quantized and non-quantized versions) or download only quantized/non-quantized versions.
**Download Options**:
- **Complete Model**: If you download the complete model (both quantized and non-quantized versions), you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` web frontend.
- **Non-quantized Version Only**: If you download only the non-quantized version, the `Gradio` web frontend can only use bf16/fp16 precision for `DIT/T5/CLIP`. To use quantized models, manually download the quantized weights into the `i2v_model_path` or `t2v_model_path` directory that Gradio is started with.
- **Quantized Version Only**: If you download only the quantized version, the `Gradio` web frontend can only use fp8 or int8 precision for `DIT/T5/CLIP` (depending on the weights you downloaded). To use non-quantized models, manually download the non-quantized weights into the `i2v_model_path` or `t2v_model_path` directory that Gradio is started with.
- **Note**: Whether you download the complete model or only part of it, `i2v_model_path` and `t2v_model_path` must point to the first-level directory, e.g. `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8` (see the download example below).
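For example, a quantized-only download can be fetched with the Hugging Face CLI (the repository name below is the official `Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V` model; substitute the model you need):

```bash
# Download only the INT8 weights into the model root directory
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
    --local-dir ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
    --include "int8/*"

# Gradio should still be pointed at the model root, not the int8/ subdirectory:
# i2v_model_path=./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V
```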
### Startup Methods
#### Method 1: Using Startup Script (Recommended)
......
......@@ -41,6 +41,7 @@ After extraction, ensure the directory structure is as follows:
├── start_lightx2v.bat # One-click startup script
├── lightx2v_config.txt # Configuration file
├── LightX2V使用说明.txt # LightX2V usage instructions
├── outputs/ # Generated video save directory
└── models/ # Model storage directory
├── 说明.txt # Model documentation
├── Wan2.1-I2V-14B-480P-Lightx2v/ # Image-to-video model (480P)
......@@ -52,6 +53,20 @@ After extraction, ensure the directory structure is as follows:
└── Wan2.1-T2V-14B-StepDistill-CfgDistill-Lightx2v/ # Text-to-video model (4-step distillation)
```
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download complete models (including quantized and non-quantized versions) or download only quantized/non-quantized versions.
**Download Options**:
- **Complete Model**: When downloading complete models with both quantized and non-quantized versions, you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` Web frontend.
- **Non-quantized Version Only**: When downloading only non-quantized versions, in the `Gradio` Web frontend, the quantization precision for `DIT/T5/CLIP` can only be set to bf16/fp16. If you need to use quantized versions of models, please manually download quantized weights to the `i2v_model_path` or `t2v_model_path` directory where Gradio is started.
- **Quantized Version Only**: When downloading only quantized versions, in the `Gradio` Web frontend, the quantization precision for `DIT/T5/CLIP` can only be set to fp8 or int8 (depending on the weights you downloaded). If you need to use non-quantized versions of models, please manually download non-quantized weights to the `i2v_model_path` or `t2v_model_path` directory where Gradio is started.
- **Note**: Whether you download complete models or partial models, the values for `i2v_model_path` and `t2v_model_path` parameters should be the first-level directory paths. For example: `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8`.
**📋 Configuration Parameters**
Edit the `lightx2v_config.txt` file and modify the following parameters as needed:
......@@ -74,6 +89,12 @@ model_size=14b
# Model class (wan2.1: standard model, wan2.1_distill: distilled model)
model_cls=wan2.1
# Image-to-video model path
i2v_model_path=models/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v
# Text-to-video model path
t2v_model_path=models/Wan2.1-T2V-1.3B-Lightx2v
```
**⚠️ Important Note**: If using a distilled model (model names containing the StepDistill-CfgDistill field), please set `model_cls` to `wan2.1_distill`
......
# Model Quantization
LightX2V supports quantization inference for linear layers in `Dit`, supporting `w8a8-int8`, `w8a8-fp8`, `w8a8-fp8block`, `w8a8-mxfp8`, and `w4a4-nvfp4` matrix multiplication.
LightX2V supports quantization inference for linear layers in `Dit`, supporting `w8a8-int8`, `w8a8-fp8`, `w8a8-fp8block`, `w8a8-mxfp8`, and `w4a4-nvfp4` matrix multiplication. Additionally, LightX2V also supports quantization of T5 and CLIP encoders to further improve inference performance.
## 📊 Quantization Scheme Overview
## Producing Quantized Models
### DIT Model Quantization
LightX2V supports multiple DIT matrix multiplication quantization schemes, configured through the `mm_type` parameter:
#### Supported mm_type Types
| mm_type | Weight Quantization | Activation Quantization | Compute Kernel |
|---------|-------------------|------------------------|----------------|
| `Default` | No Quantization | No Quantization | PyTorch |
| `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm` | FP8 Channel Symmetric | FP8 Channel Dynamic Symmetric | VLLM |
| `W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm` | INT8 Channel Symmetric | INT8 Channel Dynamic Symmetric | VLLM |
| `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F` | FP8 Channel Symmetric | FP8 Channel Dynamic Symmetric | Q8F |
| `W-int8-channel-sym-A-int8-channel-sym-dynamic-Q8F` | INT8 Channel Symmetric | INT8 Channel Dynamic Symmetric | Q8F |
| `W-fp8-block128-sym-A-fp8-channel-group128-sym-dynamic-Deepgemm` | FP8 Block Symmetric | FP8 Channel Group Symmetric | DeepGEMM |
| `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl` | FP8 Channel Symmetric | FP8 Channel Dynamic Symmetric | SGL |
#### Detailed Quantization Scheme Description
**FP8 Quantization Scheme**:
- **Weight Quantization**: Uses `torch.float8_e4m3fn` format with per-channel symmetric quantization
- **Activation Quantization**: Dynamic quantization supporting per-token and per-channel modes
- **Advantages**: Provides optimal performance on FP8-supported GPUs with minimal precision loss (typically <1%)
- **Compatible Hardware**: H100, A100, RTX 40 series and other FP8-supported GPUs
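Conceptually, per-channel symmetric FP8 weight quantization computes one scale per output channel and casts the scaled weights to `torch.float8_e4m3fn`. The sketch below is illustrative only (it assumes a PyTorch build with FP8 dtypes) and is not LightX2V's kernel code:

```python
import torch

def quantize_weight_fp8_per_channel(w: torch.Tensor):
    """Illustrative per-output-channel symmetric quantization to float8_e4m3fn."""
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448 for e4m3fn
    # One scale per output channel (row of the [out_features, in_features] weight).
    scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / fp8_max
    w_q = (w / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return w_q, scale  # dequantize with w_q.to(torch.float32) * scale
```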
**INT8 Quantization Scheme**:
- **Weight Quantization**: Uses `torch.int8` format with per-channel symmetric quantization
- **Activation Quantization**: Dynamic quantization supporting per-token mode
- **Advantages**: Best compatibility, suitable for most GPU hardware, reduces memory usage by ~50%
- **Compatible Hardware**: All INT8-supported GPUs
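As a rough sketch of the dynamic per-token activation side (again illustrative, not the VLLM/Q8F kernel itself), each token row gets its own scale computed at inference time:

```python
import torch

def quantize_activation_int8_per_token(x: torch.Tensor):
    """Illustrative dynamic per-token symmetric INT8 quantization of activations."""
    # x: [num_tokens, hidden_dim]; one scale per token, computed on the fly.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    x_q = torch.round(x / scale).clamp(-128, 127).to(torch.int8)
    return x_q, scale  # the INT8 GEMM output is rescaled by (weight_scale * scale)
```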
**Block Quantization Scheme**:
- **Weight Quantization**: FP8 quantization by 128x128 blocks
- **Activation Quantization**: Quantization by channel groups (group size 128)
- **Advantages**: Particularly suitable for large models with higher memory efficiency, supports larger batch sizes
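A block scheme assigns one scale per 128x128 weight tile instead of per channel; the following sketch illustrates the idea only (padding to a multiple of 128 and the DeepGEMM kernel details are omitted):

```python
import torch

def quantize_weight_fp8_block128(w: torch.Tensor, block: int = 128):
    """Illustrative 128x128 block-wise FP8 weight quantization (scheme sketch only)."""
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    out_f, in_f = w.shape  # assumed to be multiples of `block` here
    tiles = w.view(out_f // block, block, in_f // block, block)
    # One scale per (block x block) tile.
    scale = tiles.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-8) / fp8_max
    w_q = ((tiles / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)).view(out_f, in_f)
    return w_q, scale.squeeze()
```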
### T5 Encoder Quantization
T5 encoder supports the following quantization schemes:
#### Supported quant_scheme Types
| quant_scheme | Quantization Precision | Compute Kernel |
|--------------|----------------------|----------------|
| `int8` | INT8 | VLLM |
| `fp8` | FP8 | VLLM |
| `int8-torchao` | INT8 | TorchAO |
| `int8-q8f` | INT8 | Q8F |
| `fp8-q8f` | FP8 | Q8F |
### CLIP Encoder Quantization
The CLIP encoder supports the same quantization schemes as T5.
## 🚀 Producing Quantized Models
Download quantized models from the [LightX2V Official Model Repository](https://huggingface.co/lightx2v); refer to the [Model Structure Documentation](../deploy_guides/model_structure.md) for details.
Use LightX2V's convert tool to convert models into quantized models. Refer to the [documentation](https://github.com/ModelTC/lightx2v/tree/main/tools/convert/readme.md).
## Loading Quantized Models for Inference
## 📥 Loading Quantized Models for Inference
### DIT Model Configuration
Write the path of the converted quantized weights to the `dit_quantized_ckpt` field in the [configuration file](https://github.com/ModelTC/lightx2v/blob/main/configs/quantization).
By pointing `--config_json` at the corresponding config file, you can load the quantized model for inference.
```json
{
"dit_quantized_ckpt": "/path/to/dit_quantized_ckpt",
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}
}
```
### T5 Encoder Configuration
```json
{
"t5_quantized": true,
"t5_quant_scheme": "fp8",
"t5_quantized_ckpt": "/path/to/t5_quantized_ckpt"
}
```
### CLIP Encoder Configuration
```json
{
"clip_quantized": true,
"clip_quant_scheme": "fp8",
"clip_quantized_ckpt": "/path/to/clip_quantized_ckpt"
}
```
### Complete Configuration Example
```json
{
"dit_quantized_ckpt": "/path/to/dit_quantized_ckpt",
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"t5_quantized": true,
"t5_quant_scheme": "fp8",
"t5_quantized_ckpt": "/path/to/t5_quantized_ckpt",
"clip_quantized": true,
"clip_quant_scheme": "fp8",
"clip_quantized_ckpt": "/path/to/clip_quantized_ckpt"
}
```
By specifying `--config_json` to the specific config file, you can load the quantized model for inference.
[Here](https://github.com/ModelTC/lightx2v/tree/main/scripts/quantization) are some running scripts for use.
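For instance, one of the distillation run scripts can be pointed at a quantization config; a minimal sketch (the script name comes from the repository's `scripts/wan` directory, and the internal flags it forwards may differ):

```bash
cd LightX2V/scripts
# Edit lightx2v_path / model_path inside the script first; the script forwards the
# chosen config (e.g. a file under configs/quantization/) to the Python entry point
# via --config_json.
bash wan/run_wan_i2v_distill_4step_cfg.sh
```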
## Advanced Quantization Features
## 💡 Quantization Scheme Selection Recommendations
### Hardware Compatibility
- **H100/A100 GPU/RTX 4090/RTX 4060**: Recommended to use FP8 quantization schemes
- DIT: `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm`
- T5/CLIP: `fp8`
- **A100/RTX 3090/RTX 3060**: Recommended to use INT8 quantization schemes
- DIT: `W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm`
- T5/CLIP: `int8`
- **Other GPUs**: Choose based on hardware support
### Performance Optimization
- **Memory Constrained**: Choose INT8 quantization schemes
- **Speed Priority**: Choose FP8 quantization schemes
- **High Precision Requirements**: Use FP8 or mixed precision schemes
### Mixed Quantization Strategy
You can choose different quantization schemes for different components:
```json
{
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"t5_quantized": true,
"t5_quant_scheme": "int8",
"clip_quantized": true,
"clip_quant_scheme": "fp8"
}
```
## 🔧 Advanced Quantization Features
### Quantization Algorithm Tuning
For details, please refer to the documentation of the quantization tool [LLMC](https://github.com/ModelTC/llmc/blob/main/docs/en/source/backend/lightx2v.md).
### Custom Quantization Kernels
LightX2V supports custom quantization kernels that can be extended in the following ways:
1. **Register New mm_type**: Add new quantization classes in `mm_weight.py`
2. **Implement Quantization Functions**: Define quantization methods for weights and activations
3. **Integrate Compute Kernels**: Use custom matrix multiplication implementations
## 🚨 Important Notes
1. **Hardware Requirements**: FP8 quantization requires FP8-supported GPUs (such as H100, RTX 40 series)
2. **Precision Impact**: Quantization will bring certain precision loss, which needs to be weighed based on application scenarios
3. **Model Compatibility**: Ensure quantized models are compatible with inference code versions
4. **Memory Management**: Pay attention to memory usage when loading quantized models
5. **Quantization Calibration**: It is recommended to use representative datasets for quantization calibration to achieve optimal results
## 📚 Related Resources
- [Quantization Tool Documentation](https://github.com/ModelTC/lightx2v/tree/main/tools/convert/readme.md)
- [Running Scripts](https://github.com/ModelTC/lightx2v/tree/main/scripts/quantization)
- [Configuration File Examples](https://github.com/ModelTC/lightx2v/blob/main/configs/quantization)
- [LLMC Quantization Documentation](https://github.com/ModelTC/llmc/blob/main/docs/en/source/backend/lightx2v.md)
......@@ -70,6 +70,20 @@ LightX2V/app/
- **`wan2.1`**: Standard model; provides the best video generation quality and suits scenarios with extremely high quality requirements
- **`wan2.1_distill`**: Distilled model; optimized through knowledge distillation for significantly faster inference, greatly reducing computation time while maintaining good quality, suitable for most application scenarios
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download the complete model (including quantized and non-quantized versions) or only the quantized/non-quantized version.
**Download Options**:
- **Complete Model**: If you download the complete model (both quantized and non-quantized versions), you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` web frontend.
- **Non-quantized Version Only**: If you download only the non-quantized version, the `Gradio` web frontend can only use bf16/fp16 precision for `DIT/T5/CLIP`. To use quantized models, manually download the quantized weights into the `i2v_model_path` or `t2v_model_path` directory that Gradio is started with.
- **Quantized Version Only**: If you download only the quantized version, the `Gradio` web frontend can only use fp8 or int8 precision for `DIT/T5/CLIP` (depending on the weights you downloaded). To use non-quantized models, manually download the non-quantized weights into the `i2v_model_path` or `t2v_model_path` directory that Gradio is started with.
- **Note**: Whether you download the complete model or only part of it, `i2v_model_path` and `t2v_model_path` must point to the first-level directory, e.g. `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8`.
### Startup Methods
#### Method 1: Using the Startup Script (Recommended)
......
......@@ -52,6 +52,20 @@
└── Wan2.1-T2V-14B-StepDistill-CfgDistill-Lightx2v/ # Text-to-video model (4-step distillation)
```
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download the complete model (including quantized and non-quantized versions) or only the quantized/non-quantized version.
**Download Options**:
- **Complete Model**: If you download the complete model (both quantized and non-quantized versions), you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` web frontend.
- **Non-quantized Version Only**: If you download only the non-quantized version, the `Gradio` web frontend can only use bf16/fp16 precision for `DIT/T5/CLIP`. To use quantized models, manually download the quantized weights into the `i2v_model_path` or `t2v_model_path` directory that Gradio is started with.
- **Quantized Version Only**: If you download only the quantized version, the `Gradio` web frontend can only use fp8 or int8 precision for `DIT/T5/CLIP` (depending on the weights you downloaded). To use non-quantized models, manually download the non-quantized weights into the `i2v_model_path` or `t2v_model_path` directory that Gradio is started with.
- **Note**: Whether you download the complete model or only part of it, `i2v_model_path` and `t2v_model_path` must point to the first-level directory, e.g. `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8`.
**📋 Configuration Parameters**
Edit the `lightx2v_config.txt` file and modify the following parameters as needed:
......@@ -74,6 +88,12 @@ model_size=14b
# Model class (wan2.1: standard model, wan2.1_distill: distilled model)
model_cls=wan2.1
# Image-to-video model path
i2v_model_path=models/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-Lightx2v
# Text-to-video model path
t2v_model_path=models/Wan2.1-T2V-1.3B-Lightx2v
```
**⚠️ Important Note**: If using a distilled model (model names containing the StepDistill-CfgDistill field), please set `model_cls` to `wan2.1_distill`
......
......@@ -2,133 +2,350 @@
## 📖 Overview
This document introduces the model directory structure of the Lightx2v project, helping users organize model files correctly for a convenient experience. With a sensible directory layout, users can enjoy the convenience of "one-click startup" without manually configuring complex path parameters.
This document provides a comprehensive introduction to the model directory structure of the LightX2V project, helping users organize model files efficiently for a convenient experience. With a well-designed directory layout, users can enjoy the convenience of "one-click startup" without manually configuring complex path parameters, while the system also supports flexible manual path configuration to meet the needs of different users.
## 🗂️ Model Directory Structure
### Lightx2v Official Model List
See all available models: [Lightx2v Official Model Repository](https://huggingface.co/lightx2v)
### LightX2V Official Model List
See all available models: [LightX2V Official Model Repository](https://huggingface.co/lightx2v)
### Standard Directory Structure
Taking `Wan2.1-I2V-14B-480P-Lightx2v` as an example:
Taking `Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V` as an example, the standard file structure is as follows:
```
Model root directory/
├── Wan2.1-I2V-14B-480P-Lightx2v/
│ ├── config.json # Model configuration file
│ ├── Wan2.1_VAE.pth # VAE (variational autoencoder)
│ ├── models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth # CLIP visual encoder (FP16)
│ ├── models_t5_umt5-xxl-enc-bf16.pth # T5 text encoder (BF16)
│ ├── taew2_1.pth # Lightweight VAE (optional)
│ ├── fp8/ # FP8 quantized versions (DIT/T5/CLIP)
│ ├── int8/ # INT8 quantized versions (DIT/T5/CLIP)
│ ├── original/ # Original-precision version (DIT)
│ ├── xlm-roberta-large/
│ └── google/
Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/
├── fp8/ # FP8 quantized versions (DIT/T5/CLIP)
│ ├── block_xx.safetensors # DIT model, FP8 quantized
│ ├── models_t5_umt5-xxl-enc-fp8.pth # T5 encoder, FP8 quantized
│ ├── clip-fp8.pth # CLIP encoder, FP8 quantized
│ ├── Wan2.1_VAE.pth # VAE (variational autoencoder)
│ ├── taew2_1.pth # Lightweight VAE (optional)
│ └── config.json # Model configuration file
├── int8/ # INT8 quantized versions (DIT/T5/CLIP)
│ ├── block_xx.safetensors # DIT model, INT8 quantized
│ ├── models_t5_umt5-xxl-enc-int8.pth # T5 encoder, INT8 quantized
│ ├── clip-int8.pth # CLIP encoder, INT8 quantized
│ ├── Wan2.1_VAE.pth # VAE (variational autoencoder)
│ ├── taew2_1.pth # Lightweight VAE (optional)
│ └── config.json # Model configuration file
├── original/ # Original-precision versions (DIT/T5/CLIP)
│ ├── distill_model.safetensors # DIT model, original precision
│ ├── models_t5_umt5-xxl-enc-bf16.pth # T5 encoder, original precision
│ ├── models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth # CLIP encoder, original precision
│ ├── Wan2.1_VAE.pth # VAE (variational autoencoder)
│ ├── taew2_1.pth # Lightweight VAE (optional)
│ └── config.json # Model configuration file
```
### 💾 Storage Recommendations
**It is strongly recommended to store model files on an SSD**, which significantly improves model loading speed and inference performance.
**It is strongly recommended to store model files on an SSD**; this significantly improves model loading speed and inference performance.
**Recommended storage paths**:
```bash
/mnt/ssd/models/ # Dedicated SSD mount point
/data/ssd/models/ # Data SSD directory
/mnt/ssd/models/ # Dedicated SSD mount point
/data/ssd/models/ # Data SSD directory
/opt/models/ # System-optimized directory
```
## 🔧 Model File Description
### Quantized Version Description
Each model includes multiple quantized versions to suit different hardware configurations:
- **FP8 version**: for GPUs with FP8 support (e.g. H100, A100, RTX 40 series); delivers the best performance
- **INT8 version**: works on most GPUs, balancing performance and compatibility, with roughly 50% lower memory usage
- **Original-precision version**: for applications with extremely high precision requirements; delivers the highest-quality output
## 🚀 Usage
### Environment Preparation
#### Install the Hugging Face CLI
### Quantized Version Directories
Before downloading models, make sure the Hugging Face CLI is installed correctly:
Each model contains multiple quantized versions for different hardware configurations:
```bash
# Install huggingface_hub
pip install huggingface_hub
# Or install huggingface-cli
pip install huggingface-cli
# Log in to Hugging Face (optional but strongly recommended)
huggingface-cli login
```
Model directory/
├── fp8/ # FP8 quantized version (high-end GPUs such as H100/A100)
├── int8/ # INT8 quantized version (general-purpose GPUs)
└── original/ # Original-precision version (DIT)
### Method 1: Full Model Download (Recommended)
**Advantage**: after downloading the complete model, the system automatically resolves all component paths; no manual configuration is needed
#### 1. Download the complete model
```bash
# Download the complete model with the Hugging Face CLI
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--local-dir ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V
```
**💡 Using full-precision models**: To use the full-precision model, simply copy the official weight files into the `original/` directory.
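A minimal sketch of that copy step (the source path is a placeholder for wherever the official full-precision weights were downloaded):

```bash
# Copy the official full-precision weight files into the original/ subdirectory
cp /path/to/official/Wan2.1-I2V-14B-480P/* \
   ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/original/
```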
#### 2. Launch inference
## 🚀 Usage
##### Launching via Bash script
###### Scenario 1: Using the full-precision model
Modify the configuration in the [run script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_distill_4step_cfg.sh):
- `model_path`: set to the downloaded model path `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V`
- `lightx2v_path`: set to the root directory of the `LightX2V` project
###### Scenario 2: Using quantized models
When using the complete model, enable quantization by adding the following to the [configuration file](https://github.com/ModelTC/LightX2V/tree/main/configs/distill/wan_i2v_distill_4step_cfg.json):
```json
{
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}, // DIT quantization scheme
"t5_quantized": true, // Enable T5 quantization
"t5_quant_scheme": "fp8", // T5 quantization mode
"clip_quantized": true, // Enable CLIP quantization
"clip_quant_scheme": "fp8" // CLIP quantization mode
}
```
### Launching via the Gradio Interface
> **Important**: The quantization settings of the individual models can be combined flexibly. Quantized paths do not need to be specified manually; the system automatically locates the quantized version of each model.
When using the Gradio interface, simply specify the model root directory path:
For a detailed explanation of the quantization techniques, see the [quantization documentation](../method_tutorials/quantization.md)
Quick start with the provided bash script:
```bash
# Image-to-video (I2V)
cd LightX2V/scripts
bash wan/run_wan_t2v_distill_4step_cfg.sh
```
##### Launching via the Gradio interface
When running inference through the Gradio interface, you only need to specify the model root directory at startup; options such as the lightweight VAE can be toggled from buttons in the frontend:
```bash
# Image-to-video inference (I2V)
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-I2V-14B-480P-Lightx2v \
--model_path ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--model_size 14b \
--task i2v
--task i2v \
--model_cls wan2.1_distill
```
### Method 2: Selective Download
**Advantage**: download only the version you need (quantized or non-quantized), saving storage space and download time
# Text-to-video (T2V)
#### 1. Selective download
```bash
# Selectively download the non-quantized version with the Hugging Face CLI
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--local-dir ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--include "original/*"
```
```bash
# Selectively download the FP8 quantized version with the Hugging Face CLI
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--local-dir ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--include "fp8/*"
```
```bash
# Selectively download the INT8 quantized version with the Hugging Face CLI
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--local-dir ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
--include "int8/*"
```
> **Important**: When launching the inference script or Gradio, the `model_path` argument must still point to the full path without the `--include` subdirectory, e.g. `model_path=./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V`, not `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/int8`.
#### 2. Launch inference
**Taking a model where only the FP8 version was downloaded as an example:**
##### Launching via Bash script
###### Scenario 1: FP8 DIT + FP8 T5 + FP8 CLIP
In the [run script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_distill_4step_cfg.sh), set `model_path` to the downloaded model path `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/` and `lightx2v_path` to your `LightX2V` project path.
You only need to adjust the quantized-model settings in the configuration file as follows:
```json
{
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}, // DIT quantization scheme
"t5_quantized": true, // Whether to use the quantized T5
"t5_quant_scheme": "fp8", // T5 quantization mode
"clip_quantized": true, // Whether to use the quantized CLIP
"clip_quant_scheme": "fp8" // CLIP quantization mode
}
```
> **Important**: In this case each model can only use its quantized version. Quantized paths do not need to be specified manually; the system automatically locates the quantized version of each model.
###### Scenario 2: FP8 DIT + original-precision T5 + original-precision CLIP
In the [run script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_distill_4step_cfg.sh), set `model_path` to the downloaded model path `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V` and `lightx2v_path` to your `LightX2V` project path.
Since only the quantized weights were downloaded, manually download the original-precision T5 and CLIP weights and point `t5_original_ckpt` and `clip_original_ckpt` in the configuration file at them, as follows:
```json
{
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}, // DIT quantization scheme
"t5_original_ckpt": "/path/to/models_t5_umt5-xxl-enc-bf16.pth",
"clip_original_ckpt": "/path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"
}
```
Quick start with the provided bash script:
```bash
cd LightX2V/scripts
bash wan/run_wan_t2v_distill_4step_cfg.sh
```
##### Launching via the Gradio interface
When running inference through the Gradio interface, specify the model root directory at startup:
```bash
# Image-to-video inference (I2V)
python gradio_demo_zh.py \
--model_path /path/to/models/Wan2.1-T2V-14B-Lightx2v \
--model_path ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/ \
--model_size 14b \
--task t2v
--task i2v \
--model_cls wan2.1_distill
```
### Launching via a Configuration File
> **Important**: Since the model root directory only contains the quantized versions, the frontend can only select fp8 as the quantization precision for DIT/T5/CLIP. To use the non-quantized T5/CLIP, manually download the non-quantized weights into the gradio_demo model_path directory (`./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/`); bf16/fp16 can then be selected for T5/CLIP.
When launching via a configuration file such as [this one](https://github.com/ModelTC/LightX2V/tree/main/configs/offload/disk/wan_i2v_phase_lazy_load_480p.json), the following path settings can be omitted:
### Method 3: Manual Configuration
- `dit_quantized_ckpt`: no need to specify; the code automatically looks it up under the model directory
- `tiny_vae_path`: no need to specify; the code automatically looks it up under the model directory
- `clip_quantized_ckpt`: no need to specify; the code automatically looks it up under the model directory
- `t5_quantized_ckpt`: no need to specify; the code automatically looks it up under the model directory
Users can flexibly configure the quantization options and paths of each component as needed, mixing quantized and non-quantized components. Make sure the required model weights have been downloaded and placed at the specified paths.
**💡 Simplified configuration**: when model files are organized according to the recommended directory structure, most path settings can be omitted and the code handles them automatically.
#### DIT Model Configuration
```json
{
"dit_quantized_ckpt": "/path/to/dit_quantized_ckpt", // DIT 量化权重路径
"dit_original_ckpt": "/path/to/dit_original_ckpt", // DIT 原始精度权重路径
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm" // DIT 矩阵乘算子类型,非量化时指定为 "Default"
}
}
```
### Manual Download
#### T5 Model Configuration
1. Visit the [Hugging Face model page](https://huggingface.co/lightx2v)
2. Choose the model version you need
3. Download all files into the corresponding directory
```json
{
"t5_quantized_ckpt": "/path/to/t5_quantized_ckpt", // T5 量化权重路径
"t5_original_ckpt": "/path/to/t5_original_ckpt", // T5 原始精度权重路径
"t5_quantized": true, // 是否启用 T5 量化
"t5_quant_scheme": "fp8" // T5 量化模式,仅在 t5_quantized true 时生效
}
```
**💡 Download tip**: use SSD storage and make sure the network connection is stable. For large files, use `git lfs` or a download tool such as `aria2c`
#### CLIP Model Configuration
```json
{
"clip_quantized_ckpt": "/path/to/clip_quantized_ckpt", // CLIP 量化权重路径
"clip_original_ckpt": "/path/to/clip_original_ckpt", // CLIP 原始精度权重路径
"clip_quantized": true, // 是否启用 CLIP 量化
"clip_quant_scheme": "fp8" // CLIP 量化模式,仅在 clip_quantized true 时生效
}
```
#### VAE Model Configuration
```json
{
"vae_pth": "/path/to/Wan2.1_VAE.pth", // 原始 VAE 模型路径
"use_tiny_vae": true, // 是否使用轻量级 VAE
"tiny_vae_path": "/path/to/taew2_1.pth" // 轻量级 VAE 模型路径
}
```
> **Configuration notes**:
> - Quantized and original-precision weights can be mixed flexibly; the system selects the corresponding model based on the configuration
> - The choice of quantization mode depends on your hardware support; FP8 is recommended on high-end GPUs such as H100/A100
> - The lightweight VAE can significantly speed up inference but may slightly affect generation quality
## 💡 Best Practices
- **Use SSD storage**: significantly improves model loading speed and inference performance
- **Keep a consistent directory structure**: makes it easier to manage and switch between model versions
- **Reserve enough space**: make sure there is sufficient storage (at least 200 GB recommended)
- **Clean up regularly**: delete model versions you no longer need to save space
- **Optimize the network**: use a stable connection and download tools
### Recommended Configurations
**Full-model users**:
- Download the complete model and enjoy automatic path resolution
- Only the quantization scheme and component switches need to be configured
- Using the bash scripts for a quick start is recommended
**Users with limited storage**:
- Selectively download only the quantized versions you need
- Flexibly mix quantized and original-precision components
- Use the bash scripts to simplify startup
**Advanced users**:
- Configure all paths manually for maximum flexibility
- Model files can be stored in separate locations
- Bash script parameters can be customized
### Performance Optimization Tips
- **Use SSD storage**: significantly improves model loading speed and inference performance
- **Choose an appropriate quantization scheme**:
  - FP8: for high-end GPUs such as H100/A100; higher precision
  - INT8: for general-purpose GPUs; lower memory usage
- **Enable the lightweight VAE**: `use_tiny_vae: true` speeds up inference
- **Configure CPU offloading sensibly**: `t5_cpu_offload: true` saves GPU memory
### Download Optimization Tips
- **Use the Hugging Face CLI**: more stable than git clone and supports resuming downloads
- **Download selectively**: fetch only the quantized versions you need to save time and storage
- **Optimize the network**: use a stable connection and a proxy if necessary
- **Resume downloads**: use the `--resume-download` flag to continue after an interruption
## 🚨 FAQ
### Q: The model files are large and download slowly. What can I do?
A: Use a local mirror, a download tool such as `aria2c`, or consider a cloud storage service
### Q: The model files are large and download slowly. What can I do?
A: Use selective download to fetch only the quantized version you need, or use a local mirror
### Q: Startup reports that the model path does not exist?
A: Check that the model has been downloaded correctly and verify that the path configuration is correct
A: Check that the model has been downloaded correctly, verify the path configuration, and confirm that the automatic path lookup works
### Q: How do I switch between different model versions?
A: Change the model path argument in the launch command; multiple model instances can run at the same time
### Q: How do I switch between different quantization schemes?
A: Modify parameters such as `mm_type`, `t5_quant_scheme`, and `clip_quant_scheme` in the configuration file; see the [quantization documentation](../method_tutorials/quantization.md)
### Q: Model loading is very slow?
A: Make sure the model is stored on an SSD, enable lazy loading, and use a quantized model
### Q: How do I mix quantized and original-precision components?
A: Control this with the `t5_quantized` and `clip_quantized` parameters and specify the original-precision paths manually
### Q: How should the paths in the configuration file be set?
A: With the recommended directory structure, most path settings can be omitted and the code handles them automatically
A: Automatic path lookup is recommended; for manual setup, see the "Manual Configuration" section
### Q: How can I verify that automatic path lookup is working?
A: Check the startup logs; the code prints the model paths it actually uses
### Q: What if launching via the bash script fails?
A: Check that the paths in the script are correct and that the `lightx2v_path` and `model_path` variables are set properly
## 📚 Related Links
- [Lightx2v Official Model Repository](https://huggingface.co/lightx2v)
- [Gradio Deployment Guide](./deploy_gradio.md)
- [LightX2V Official Model Repository](https://huggingface.co/lightx2v)
- [Gradio Deployment Guide](./deploy_gradio.md)
- [Configuration File Examples](https://github.com/ModelTC/LightX2V/tree/main/configs)
---
Well-organized model files give users a convenient "one-click startup" experience without manually configuring complex path parameters. It is recommended to organize model files according to the structure in this document and to take full advantage of SSD storage.
With well-structured model files and flexible configuration options, LightX2V supports a variety of usage scenarios: downloading the complete model offers maximum convenience, selective download saves storage space, and manual configuration provides maximum flexibility. The automatic path lookup mechanism frees users from remembering complex path settings while keeping the system extensible.
# Model Quantization
lightx2v supports quantized inference for the linear layers in `Dit`, with `w8a8-int8`, `w8a8-fp8`, `w8a8-fp8block`, `w8a8-mxfp8`, and `w4a4-nvfp4` matrix multiplication.
LightX2V supports quantized inference for the linear layers in `Dit`, with `w8a8-int8`, `w8a8-fp8`, `w8a8-fp8block`, `w8a8-mxfp8`, and `w4a4-nvfp4` matrix multiplication. LightX2V also supports quantizing the T5 and CLIP encoders to further improve inference performance.
## 📊 Quantization Scheme Overview
## Producing Quantized Models
### DIT Model Quantization
LightX2V supports multiple DIT matmul quantization schemes, configured via the `mm_type` parameter in the configuration file:
#### Supported mm_type values
| mm_type | Weight Quantization | Activation Quantization | Compute Kernel | Typical Use Case |
|---------|----------|----------|----------|----------|
| `Default` | None | None | PyTorch | Precision first |
| `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm` | FP8 per-channel symmetric | FP8 per-channel dynamic symmetric | VLLM | High performance on H100/A100 |
| `W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm` | INT8 per-channel symmetric | INT8 per-channel dynamic symmetric | VLLM | General GPU compatibility |
| `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F` | FP8 per-channel symmetric | FP8 per-channel dynamic symmetric | Q8F | High-performance inference |
| `W-int8-channel-sym-A-int8-channel-sym-dynamic-Q8F` | INT8 per-channel symmetric | INT8 per-channel dynamic symmetric | Q8F | High-performance inference |
| `W-fp8-block128-sym-A-fp8-channel-group128-sym-dynamic-Deepgemm` | FP8 block symmetric | FP8 channel-group symmetric | DeepGEMM | Large-model optimization |
| `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl` | FP8 per-channel symmetric | FP8 per-channel dynamic symmetric | SGL | Streaming inference |
#### Detailed description of the quantization schemes
**FP8 quantization**:
- **Weight quantization**: uses the `torch.float8_e4m3fn` format with per-channel symmetric quantization
- **Activation quantization**: dynamic quantization supporting per-token and per-channel modes
- **Advantages**: best performance on FP8-capable GPUs with minimal precision loss (typically <1%)
- **Compatible hardware**: H100, A100, RTX 40 series and other FP8-capable GPUs
**INT8 quantization**:
- **Weight quantization**: uses the `torch.int8` format with per-channel symmetric quantization
- **Activation quantization**: dynamic quantization supporting per-token mode
- **Advantages**: best compatibility, works on most GPU hardware, reduces memory usage by about 50%
- **Compatible hardware**: all GPUs with INT8 support
**Block quantization**:
- **Weight quantization**: FP8 quantization in 128x128 blocks
- **Activation quantization**: quantization by channel groups (group size 128)
- **Advantages**: particularly suited to large models, higher memory efficiency, supports larger batch sizes
### T5 Encoder Quantization
The T5 encoder supports the following quantization schemes:
#### Supported quant_scheme values
| quant_scheme | Quantization Precision | Compute Kernel | Typical Use Case |
|--------------|----------|----------|----------|
| `int8` | INT8 | VLLM | General-purpose GPUs |
| `fp8` | FP8 | VLLM | H100/A100 GPUs |
| `int8-torchao` | INT8 | TorchAO | Compatibility first |
| `int8-q8f` | INT8 | Q8F | High-performance inference |
| `fp8-q8f` | FP8 | Q8F | High-performance inference |
#### T5 quantization characteristics
- **Linear-layer quantization**: quantizes the linear transforms in the attention and FFN layers
- **Dynamic quantization**: activations are quantized dynamically at inference time, with no pre-computation required
- **Precision preservation**: symmetric quantization with scaling factors preserves numerical precision
### CLIP Encoder Quantization
The CLIP encoder supports the same quantization schemes as T5:
#### CLIP quantization characteristics
- **Vision-encoder quantization**: quantizes the linear layers in the Vision Transformer
- **Text-encoder quantization**: quantizes the linear layers in the text encoder
- **Multimodal alignment**: preserves the alignment precision between visual and textual features
## 🚀 Producing Quantized Models
Quantized models can be downloaded from the [LightX2V Official Model Repository](https://huggingface.co/lightx2v); see the [Model Structure Documentation](../deploy_guides/model_structure.md) for details.
Use LightX2V's convert tool to convert models into quantized models; see the [documentation](https://github.com/ModelTC/lightx2v/tree/main/tools/convert/readme_zh.md).
## Loading Quantized Models for Inference
## 📥 Loading Quantized Models for Inference
### DIT Model Configuration
Write the path of the converted quantized weights into the `dit_quantized_ckpt` field of the [configuration file](https://github.com/ModelTC/lightx2v/blob/main/configs/quantization).
By pointing --config_json at the corresponding config file, you can load the quantized model for inference
```json
{
"dit_quantized_ckpt": "/path/to/dit_quantized_ckpt",
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}
}
```
### T5 Encoder Configuration
```json
{
"t5_quantized": true,
"t5_quant_scheme": "fp8",
"t5_quantized_ckpt": "/path/to/t5_quantized_ckpt"
}
```
### CLIP Encoder Configuration
```json
{
"clip_quantized": true,
"clip_quant_scheme": "fp8",
"clip_quantized_ckpt": "/path/to/clip_quantized_ckpt"
}
```
### Complete Configuration Example
```json
{
"dit_quantized_ckpt": "/path/to/dit_quantized_ckpt",
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"t5_quantized": true,
"t5_quant_scheme": "fp8",
"t5_quantized_ckpt": "/path/to/t5_quantized_ckpt",
"clip_quantized": true,
"clip_quant_scheme": "fp8",
"clip_quantized_ckpt": "/path/to/clip_quantized_ckpt"
}
```
By pointing `--config_json` at the corresponding config file, you can load the quantized model for inference.
Some run scripts are available [here](https://github.com/ModelTC/lightx2v/tree/main/scripts/quantization).
## Advanced Quantization Features
## 💡 Quantization Scheme Selection Recommendations
### Hardware Compatibility
- **H100/A100 GPU/RTX 4090/RTX 4060**: FP8 quantization schemes are recommended
- DIT: `W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm`
- T5/CLIP: `fp8`
- **A100/RTX 3090/RTX 3060**: INT8 quantization schemes are recommended
- DIT: `W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm`
- T5/CLIP: `int8`
- **Other GPUs**: choose according to hardware support
### Performance Optimization
- **Memory constrained**: choose an INT8 quantization scheme
- **Speed first**: choose an FP8 quantization scheme
- **High precision requirements**: use FP8 or a mixed-precision scheme
### Mixed Quantization Strategy
Different quantization schemes can be chosen for different components:
```json
{
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"t5_quantized": true,
"t5_quant_scheme": "int8",
"clip_quantized": true,
"clip_quant_scheme": "fp8"
}
```
## 🔧 Advanced Quantization Features
### Quantization Algorithm Tuning
For details, refer to the documentation of the quantization tool [LightCompress](https://github.com/ModelTC/llmc/blob/main/docs/zh_cn/source/backend/lightx2v.md)
### Custom Quantization Kernels
LightX2V supports custom quantization kernels, which can be extended as follows:
1. **Register a new mm_type**: add a new quantization class in `mm_weight.py`
2. **Implement the quantization functions**: define the quantization methods for weights and activations
3. **Integrate the compute kernel**: use a custom matrix-multiplication implementation
## 🚨 Important Notes
1. **Hardware requirements**: FP8 quantization requires an FP8-capable GPU (such as H100 or the RTX 40 series)
2. **Precision impact**: quantization introduces some precision loss, which must be weighed against the application scenario
3. **Model compatibility**: make sure the quantized model is compatible with the inference code version
4. **Memory management**: watch memory usage when loading quantized models
5. **Quantization calibration**: calibrating with a representative dataset is recommended for the best results
## 📚 Related Resources
For details, refer to the documentation of the quantization tool [LLMC](https://github.com/ModelTC/llmc/blob/main/docs/zh_cn/source/backend/lightx2v.md)
- [Quantization Tool Documentation](https://github.com/ModelTC/lightx2v/tree/main/tools/convert/readme_zh.md)
- [Run Scripts](https://github.com/ModelTC/lightx2v/tree/main/scripts/quantization)
- [Configuration File Examples](https://github.com/ModelTC/lightx2v/blob/main/configs/quantization)
- [LightCompress Quantization Documentation](https://github.com/ModelTC/llmc/blob/main/docs/zh_cn/source/backend/lightx2v.md)
......@@ -48,6 +48,12 @@ class WanModel:
assert not self.config.get("lazy_load", False)
self.config.dit_quantized_ckpt = self.dit_quantized_ckpt
quant_config_path = os.path.join(self.config.dit_quantized_ckpt, "config.json")
if os.path.exists(quant_config_path):
with open(quant_config_path, "r") as f:
quant_model_config = json.load(f)
self.config.update(quant_model_config)
self.weight_auto_quant = self.config.mm_config.get("weight_auto_quant", False)
if self.dit_quantized:
assert self.weight_auto_quant or self.dit_quantized_ckpt is not None
......
......@@ -94,18 +94,20 @@ class WanRunner(DefaultRunner):
t5_model_name = f"models_t5_umt5-xxl-enc-{tmp_t5_quant_scheme}.pth"
t5_quantized_ckpt = find_torch_model_path(self.config, "t5_quantized_ckpt", t5_model_name, tmp_t5_quant_scheme)
t5_original_ckpt = None
tokenizer_path = os.path.join(os.path.dirname(t5_quantized_ckpt), "google/umt5-xxl")
else:
t5_quant_scheme = None
t5_quantized_ckpt = None
t5_model_name = "models_t5_umt5-xxl-enc-bf16.pth"
t5_original_ckpt = find_torch_model_path(self.config, "t5_original_ckpt", t5_model_name, "original")
tokenizer_path = os.path.join(os.path.dirname(t5_original_ckpt), "google/umt5-xxl")
text_encoder = T5EncoderModel(
text_len=self.config["text_len"],
dtype=torch.bfloat16,
device=t5_device,
checkpoint_path=t5_original_ckpt,
tokenizer_path=os.path.join(self.config.model_path, "google/umt5-xxl"),
tokenizer_path=tokenizer_path,
shard_fn=None,
cpu_offload=t5_offload,
offload_granularity=self.config.get("t5_offload_granularity", "model"),
......@@ -118,7 +120,7 @@ class WanRunner(DefaultRunner):
def load_vae_encoder(self):
vae_config = {
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.1_VAE.pth", "original"),
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.1_VAE.pth"),
"device": self.init_device,
"parallel": self.config.parallel_vae,
"use_tiling": self.config.get("use_tiling_vae", False),
......@@ -130,13 +132,13 @@ class WanRunner(DefaultRunner):
def load_vae_decoder(self):
vae_config = {
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.1_VAE.pth", "original"),
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.1_VAE.pth"),
"device": self.init_device,
"parallel": self.config.parallel_vae,
"use_tiling": self.config.get("use_tiling_vae", False),
}
if self.config.get("use_tiny_vae", False):
tiny_vae_path = find_torch_model_path(self.config, "tiny_vae_path", "taew2_1.pth", "original")
tiny_vae_path = find_torch_model_path(self.config, "tiny_vae_path", "taew2_1.pth")
vae_decoder = WanVAE_tiny(
vae_pth=tiny_vae_path,
device=self.init_device,
......
......@@ -37,7 +37,11 @@ def set_config(args):
with open(os.path.join(config.model_path, "config.json"), "r") as f:
model_config = json.load(f)
config.update(model_config)
elif os.path.exists(os.path.join(config.model_path, "original", "config.json")):
with open(os.path.join(config.model_path, "original", "config.json"), "r") as f:
model_config = json.load(f)
config.update(model_config)
# load quantized config
if config.get("dit_quantized_ckpt", None) is not None:
config_path = os.path.join(config.dit_quantized_ckpt, "config.json")
if os.path.exists(config_path):
......
......@@ -256,15 +256,19 @@ def save_to_video(
raise ValueError(f"Unknown save method: {method}")
def find_torch_model_path(config, ckpt_config_key=None, filename=None, subdir=None):
def find_torch_model_path(config, ckpt_config_key=None, filename=None, subdir=["original", "fp8", "int8"]):
if ckpt_config_key and config.get(ckpt_config_key, None) is not None:
return config.get(ckpt_config_key)
paths_to_check = [
os.path.join(config.model_path, filename),
]
if subdir:
if isinstance(subdir, list):
for sub in subdir:
paths_to_check.append(os.path.join(config.model_path, sub, filename))
else:
paths_to_check.append(os.path.join(config.model_path, subdir, filename))
for path in paths_to_check:
if os.path.exists(path):
logger.info(f"Found PyTorch model checkpoint: {path}")
......@@ -272,12 +276,15 @@ def find_torch_model_path(config, ckpt_config_key=None, filename=None, subdir=No
raise FileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def find_hf_model_path(config, ckpt_config_key=None, subdir=None):
def find_hf_model_path(config, ckpt_config_key=None, subdir=["original", "fp8", "int8"]):
if ckpt_config_key and config.get(ckpt_config_key, None) is not None:
return config.get(ckpt_config_key)
paths_to_check = [config.model_path]
if subdir:
if isinstance(subdir, list):
for sub in subdir:
paths_to_check.append(os.path.join(config.model_path, sub))
else:
paths_to_check.append(os.path.join(config.model_path, subdir))
for path in paths_to_check:
......