logger.info(f"Found Hugging Face model files in: {path}")
returnpath
raiseFileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
logger.info(f"Found PyTorch model checkpoint: {path}")
returnpath
raiseFileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
logger.info(f"Found Hugging Face model files in: {path}")
returnpath
raiseFileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
logger.info(f"Found PyTorch model checkpoint: {path}")
returnpath
raiseFileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
- **`wan2.1`**: Standard model; provides the best video generation quality and is suitable for scenarios with extremely high quality requirements
- **`wan2.1_distill`**: Distilled model; optimized through knowledge distillation to significantly speed up inference while maintaining good quality, suitable for most application scenarios
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download either the complete model (including both quantized and non-quantized versions) or only the quantized or non-quantized version.
**Download Options**:
- **Complete Model**: When downloading the complete model with both quantized and non-quantized versions, you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` Web frontend.
- **Non-quantized Version Only**: When downloading only the non-quantized version, the quantization precision for `DIT/T5/CLIP` in the `Gradio` Web frontend can only be set to bf16/fp16. If you need to use quantized models, please manually download the quantized weights into the `i2v_model_path` or `t2v_model_path` directory used when starting Gradio.
- **Quantized Version Only**: When downloading only the quantized version, the quantization precision for `DIT/T5/CLIP` in the `Gradio` Web frontend can only be set to fp8 or int8 (depending on the weights you downloaded). If you need to use non-quantized models, please manually download the non-quantized weights into the `i2v_model_path` or `t2v_model_path` directory used when starting Gradio.
- **Note**: Whether you download the complete model or a partial model, the values of the `i2v_model_path` and `t2v_model_path` parameters should be the first-level directory paths. For example: `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8`.
├── Wan2.1-I2V-14B-480P-Lightx2v/ # Image-to-video model (480P)
...
...
└── Wan2.1-T2V-14B-StepDistill-CfgDistill-Lightx2v/ # Text-to-video model (4-step distillation)
```
**📥 Model Download**:
Refer to the [Model Structure Documentation](./model_structure.md) to download either the complete model (including both quantized and non-quantized versions) or only the quantized or non-quantized version.
**Download Options**:
- **Complete Model**: When downloading the complete model with both quantized and non-quantized versions, you can freely choose the quantization precision for DIT/T5/CLIP in the advanced options of the `Gradio` Web frontend.
- **Non-quantized Version Only**: When downloading only the non-quantized version, the quantization precision for `DIT/T5/CLIP` in the `Gradio` Web frontend can only be set to bf16/fp16. If you need to use quantized models, please manually download the quantized weights into the `i2v_model_path` or `t2v_model_path` directory used when starting Gradio.
- **Quantized Version Only**: When downloading only the quantized version, the quantization precision for `DIT/T5/CLIP` in the `Gradio` Web frontend can only be set to fp8 or int8 (depending on the weights you downloaded). If you need to use non-quantized models, please manually download the non-quantized weights into the `i2v_model_path` or `t2v_model_path` directory used when starting Gradio.
- **Note**: Whether you download the complete model or a partial model, the values of the `i2v_model_path` and `t2v_model_path` parameters should be the first-level directory paths. For example: `Wan2.1-I2V-14B-480P-Lightx2v/`, not `Wan2.1-I2V-14B-480P-Lightx2v/int8`.
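For example, downloading only the INT8 weights of the image-to-video model and pointing the configuration at the first-level directory might look like this (the repository id and paths are illustrative; check the [official model list](https://huggingface.co/lightx2v) for exact names):

```bash
# Selective download of only the int8 weights (repository id illustrative)
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-Lightx2v \
    --include "int8/*" \
    --local-dir ./Wan2.1-I2V-14B-480P-Lightx2v

# In lightx2v_config.txt, point to the first-level directory, not the int8 subdirectory:
# i2v_model_path=./Wan2.1-I2V-14B-480P-Lightx2v
```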
**📋 Configuration Parameters**
Edit the `lightx2v_config.txt` file and modify the following parameters as needed:
...
...
model_size=14b
# Model class (wan2.1: standard model, wan2.1_distill: distilled model)
This document provides a comprehensive introduction to the model directory structure of the LightX2V project, helping users organize model files efficiently for a convenient experience. With a well-organized directory layout, users can enjoy "one-click startup" without manually configuring complex path parameters, while flexible manual path configuration remains available for users who need it.
## 🗂️ Model Directory Structure
### LightX2V Official Model List
View all available models: [LightX2V Official Model Repository](https://huggingface.co/lightx2v)
### Standard Directory Structure
Using `Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V` as an example, the standard file structure is as follows:
**It is strongly recommended to store model files on an SSD**, as this significantly improves model loading speed and inference performance.
**Recommended storage paths**:
```bash
...
...
/opt/models/ # System optimization directory
```
### Quantization Version Description
Each model includes multiple quantized versions to suit different hardware configurations:
- **FP8 Version**: Suitable for GPUs that support FP8 (such as H100, A100, RTX 40 series), providing optimal performance
- **INT8 Version**: Suitable for most GPUs, balancing performance and compatibility, reducing memory usage by approximately 50%
- **Original Precision Version**: Suitable for applications with extremely high precision requirements, providing the highest quality output
The corresponding directory layout is:

```
Model Directory/
├── fp8/        # FP8 quantized version (H100/A100 high-end GPUs)
├── int8/       # INT8 quantized version (general GPUs)
└── original/   # Original precision version (DIT)
```

## 🚀 Usage Methods

### Environment Setup

#### Installing Hugging Face CLI

Before starting to download models, please ensure that the Hugging Face CLI is properly installed:

```bash
# Install huggingface_hub together with its CLI
pip install -U "huggingface_hub[cli]"

# Log in to Hugging Face (optional, but strongly recommended)
huggingface-cli login
```

### Method 1: Complete Model Download (Recommended)
### Method 1: Complete Model Download (Recommended)
**Advantage**: After downloading the complete model, the system will automatically identify all component paths without manual configuration, providing a more convenient user experience
**💡 Using Full Precision Models**: To use full precision models, simply copy the official weight files to the `original/` directory.
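For reference, a complete download with the Hugging Face CLI might look like this (the repository id is illustrative; see the official model list for the exact name):

```bash
huggingface-cli download lightx2v/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V \
    --local-dir ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V
```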
#### 2. Start Inference
##### Bash Script Startup
###### Scenario 1: Using Full Precision Model
Modify the configuration in the [run script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_distill_4step_cfg.sh):
- `model_path`: Set to the downloaded model path `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V`
- `lightx2v_path`: Set to the LightX2V project root directory path
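The relevant variables inside the script would then look roughly like this (paths illustrative):

```bash
# Illustrative settings inside scripts/wan/run_wan_i2v_distill_4step_cfg.sh
lightx2v_path=/path/to/LightX2V
model_path=/path/to/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V
```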
###### Scenario 2: Using Quantized Model
When using the complete model, if you need to enable quantization, please add the following configuration to the [configuration file](https://github.com/ModelTC/LightX2V/tree/main/configs/distill/wan_i2v_distill_4step_cfg.json):
> **Important Note**: Quantization configurations for each model can be flexibly combined. Quantization paths do not need to be manually specified, as the system will automatically locate the quantized versions of each model.
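As a rough illustration, such a quantization block might look like the following; the key names are taken from the quantization-related sections of this documentation, while the values and the exact `mm_type` string are assumptions to be checked against the quantization documentation:

```json
{
  "mm_type": "w8a8-fp8",
  "t5_quantized": true,
  "t5_quant_scheme": "fp8",
  "clip_quantized": true,
  "clip_quant_scheme": "fp8"
}
```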
For a detailed explanation of quantization technology, please refer to the [Quantization Documentation](../method_tutorials/quantization.md).
Use the provided bash script for quick startup:
```bash
cd LightX2V/scripts
bash wan/run_wan_t2v_distill_4step_cfg.sh
```
##### Gradio Interface Startup
When performing inference through the Gradio interface, simply specify the model root directory path at startup, and lightweight VAE can be flexibly selected through frontend interface buttons:
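A startup command of roughly the following shape could be used (the demo script name and flag are assumptions for illustration, not the exact CLI):

```bash
# Illustrative only: the actual Gradio demo script name and arguments may differ
python gradio_demo.py --model_path ./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V
```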
> **Important Note**: When starting inference scripts or Gradio, the `model_path` parameter still needs to be the complete top-level path, even if only part of the model was downloaded with `--include`. For example: `model_path=./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V`, not `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/int8`.
#### 2. Start Inference
**The following takes a model with only the FP8 version downloaded as an example:**
##### Bash Script Startup
###### Scenario 1: Using FP8 DIT + FP8 T5 + FP8 CLIP
Set the `model_path` in the [run script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_distill_4step_cfg.sh) to your downloaded model path `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/`, and set `lightx2v_path` to your LightX2V project path.
You only need to modify the quantized-model configuration in the configuration file as follows:
> **Important Note**: In this case, each model can only be specified as a quantized version. Quantization paths do not need to be specified manually; the system will automatically locate the quantized version of each model.
###### Scenario 2: Using FP8 DIT + Original Precision T5 + Original Precision CLIP
Set the `model_path` in the [run script](https://github.com/ModelTC/LightX2V/tree/main/scripts/wan/run_wan_i2v_distill_4step_cfg.sh) to your downloaded model path `./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V`, and set `lightx2v_path` to your LightX2V project path.
Since only quantized weights were downloaded, you need to manually download the original precision versions of T5 and CLIP, and configure them in the configuration file's `t5_original_ckpt` and `clip_original_ckpt` as follows:
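A sketch of what that configuration might look like (paths are illustrative, and any keys beyond `t5_original_ckpt` and `clip_original_ckpt` are assumptions):

```json
{
  "t5_quantized": false,
  "t5_original_ckpt": "/path/to/original/t5/checkpoint.pth",
  "clip_quantized": false,
  "clip_original_ckpt": "/path/to/original/clip/checkpoint.pth"
}
```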
> **Important Note**: Since the model root directory contains only the quantized version of each model, the quantization precision for the DIT/T5/CLIP models can only be set to fp8 in the frontend. If you need to use non-quantized versions of T5/CLIP, please manually download the non-quantized weights and place them in the `model_path` directory used by the Gradio demo (`./Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/`); the T5/CLIP quantization precision can then be set to bf16/fp16.
### Method 3: Manual Configuration
Users can flexibly configure quantization options and paths for each component according to actual needs, achieving mixed use of quantized and non-quantized components. Please ensure that the required model weights have been correctly downloaded and placed in the specified paths.
When starting from a configuration file, such as this [configuration file](https://github.com/ModelTC/LightX2V/tree/main/configs/offload/disk/wan_i2v_phase_lazy_load_480p.json), the following path settings can be omitted (see the sketch after this list for the fully explicit form):
- `dit_quantized_ckpt`: No need to specify, the code will automatically search in the model directory
- `tiny_vae_path`: No need to specify, the code will automatically search in the model directory
- `clip_quantized_ckpt`: No need to specify, the code will automatically search in the model directory
- `t5_quantized_ckpt`: No need to specify, the code will automatically search in the model directory
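For comparison, a fully manual configuration spells these paths out explicitly; a sketch with illustrative paths:

```json
{
  "dit_quantized_ckpt": "/data/models/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/int8",
  "t5_quantized_ckpt": "/data/models/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/int8",
  "clip_quantized_ckpt": "/data/models/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/int8",
  "tiny_vae_path": "/data/models/Wan2.1-I2V-14B-480P-StepDistill-CfgDistill-LightX2V/tiny_vae.pth"
}
```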
#### CLIP Model Configuration
**💡 Simplified Configuration**: After organizing model files according to the recommended directory structure, most path configurations can be omitted as the code will handle them automatically.
**💡 Download Recommendations**: It is recommended to use SSD storage and ensure a stable network connection. For large files, you can use `git lfs` or download tools such as `aria2c`.
> **Configuration Notes**:
> - Quantized weights and original precision weights can be mixed flexibly; the system will automatically select the corresponding model based on the configuration
> - The choice of quantization mode depends on your hardware; FP8 is recommended on high-end GPUs such as H100/A100
> - A lightweight VAE can significantly improve inference speed but may slightly affect generation quality
## 💡 Best Practices
### Recommended Configurations
**Complete Model Users**:
- Download the complete model to enjoy the convenience of automatic path discovery
- Only quantization schemes and component switches need to be configured
- Flexibly mix quantized and original precision components
- Use the provided bash scripts for quick startup
**Advanced Users**:
- Fully manual path configuration for maximum flexibility
- Model files can be stored in scattered locations
- Bash script parameters can be customized
### Performance Optimization Recommendations
- **Use SSD Storage**: Significantly improves model loading speed and inference performance
- **Unified Directory Structure**: Makes it easy to manage and switch between different model versions
- **Reserve Sufficient Space**: Ensure adequate storage space (at least 200GB recommended)
- **Regular Cleanup**: Delete unnecessary model versions to save space
- **Network Optimization**: Use stable network connections and download tools
- **Choose an Appropriate Quantization Scheme**:
  - FP8: Suitable for high-end GPUs like H100/A100, high precision
  - INT8: Suitable for general GPUs, small memory footprint
- **Enable Lightweight VAE**: `use_tiny_vae: true` can improve inference speed
- **Configure CPU Offload Sensibly**: `t5_cpu_offload: true` can save GPU memory (see the sketch below)
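The last two switches are plain boolean fields in the inference configuration; a minimal sketch (assuming these keys sit at the top level of the config JSON):

```json
{
  "use_tiny_vae": true,
  "t5_cpu_offload": true
}
```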
### Download Optimization Recommendations
- **Use Hugging Face CLI**: More stable than git clone and supports resuming downloads
- **Selective Download**: Only download the quantized versions you need, saving time and storage space
- **Network Optimization**: Use a stable network connection, and a proxy when necessary
- **Resume Download**: Use the `--resume-download` parameter to continue after an interruption
## 🚨 Frequently Asked Questions
### Q: How to switch between different model versions?
A: Modify the model path parameter in the startup command; running multiple model instances simultaneously is supported

### Q: Model files are too large and download speed is slow, what should I do?
A: Use selective download to fetch only the quantized versions you need, or use a mirror source

### Q: Model loading is very slow?
A: Ensure models are stored on an SSD, enable lazy loading, and use quantized models

### Q: Model path does not exist when starting up?
A: Check that the model has been downloaded correctly, verify the path configuration, and confirm that the automatic discovery mechanism is working properly

### Q: How to switch between different quantization schemes?
A: Modify parameters such as `mm_type`, `t5_quant_scheme`, and `clip_quant_scheme` in the configuration file; see the [Quantization Documentation](../method_tutorials/quantization.md)

### Q: How to mix quantized and original precision components?
A: Control this with the `t5_quantized` and `clip_quantized` parameters, and manually specify the original precision paths
### Q: How to set paths in configuration files?
A: After organizing files according to the recommended directory structure, most path settings can be omitted and are discovered automatically; for manual configuration, refer to the "Manual Configuration" section
### Q: How to verify if automatic path discovery is working properly?
A: Check the startup logs, the code will output the actual model paths being used
### Q: What should I do if bash script startup fails?
A: Check that the path configuration in the script is correct and ensure that the `lightx2v_path` and `model_path` variables are set correctly
## 📚 Related Links
- [LightX2V Official Model Repository](https://huggingface.co/lightx2v)
Through well-organized model files and flexible configuration options, LightX2V supports multiple usage scenarios: downloading the complete model provides maximum convenience, selective download saves storage space, and manual configuration offers maximum flexibility. The automatic path discovery mechanism means users do not need to remember complex path configurations, while the system remains extensible. Organize model files according to the structure recommended in this document and take full advantage of SSD storage.
LightX2V supports quantization inference for linear layers in `Dit`, supporting `w8a8-int8`, `w8a8-fp8`, `w8a8-fp8block`, `w8a8-mxfp8`, and `w4a4-nvfp4` matrix multiplication. Additionally, LightX2V also supports quantization of T5 and CLIP encoders to further improve inference performance.
## 📊 Quantization Scheme Overview
### DIT Model Quantization
LightX2V supports multiple DIT matrix multiplication quantization schemes, configured through the `mm_type` parameter:
The CLIP encoder supports the same quantization schemes as T5.
## 🚀 Producing Quantized Models
Download quantized models from the [LightX2V Official Model Repository](https://huggingface.co/lightx2v), refer to the [Model Structure Documentation](../deploy_guides/model_structure.md) for details.
Use LightX2V's convert tool to convert models into quantized models. Refer to the [documentation](https://github.com/ModelTC/lightx2v/tree/main/tools/convert/readme.md).
## 📥 Loading Quantized Models for Inference
### DIT Model Configuration
Write the path of the converted quantized weights to the `dit_quantized_ckpt` field in the [configuration file](https://github.com/ModelTC/lightx2v/blob/main/configs/quantization).
By passing `--config_json` pointing to the corresponding config file, you can load the quantized model for inference.
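A minimal sketch of how the pieces fit together (the path is illustrative; `dit_quantized_ckpt` and `--config_json` are the documented key and flag):

```json
{
  "dit_quantized_ckpt": "/path/to/converted/quantized/dit"
}
```

The inference script is then launched with `--config_json` pointing at this file.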
For details, please refer to the documentation of the quantization tool [LLMC](https://github.com/ModelTC/llmc/blob/main/docs/en/source/backend/lightx2v.md)
### Custom Quantization Kernels
LightX2V supports custom quantization kernels that can be extended in the following ways:
1. **Register New mm_type**: Add new quantization classes in `mm_weight.py`
2. **Implement Quantization Functions**: Define quantization methods for weights and activations
3. **Integrate Compute Kernels**: Use custom matrix multiplication implementations
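Purely as an illustration of these three steps, a custom int8 scheme could be sketched as follows; the registry, class, and method names are hypothetical and do not reflect the actual `mm_weight.py` API:

```python
import torch

# Hypothetical registry; in practice the new class is wired up inside mm_weight.py.
MM_WEIGHT_REGISTRY = {}

def register_mm_weight(name):
    def wrap(cls):
        MM_WEIGHT_REGISTRY[name] = cls
        return cls
    return wrap

@register_mm_weight("w8a8-int8-custom")  # hypothetical mm_type string
class CustomInt8MMWeight:
    """Quantizes weights offline, quantizes activations on the fly, then calls a custom int8 matmul."""

    def __init__(self, weight: torch.Tensor):
        # Step 2: per-output-channel symmetric int8 weight quantization.
        self.scale = weight.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
        self.qweight = torch.clamp((weight / self.scale).round(), -128, 127).to(torch.int8)

    def apply(self, x: torch.Tensor) -> torch.Tensor:
        # Step 2: per-token dynamic int8 activation quantization.
        x_scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
        qx = torch.clamp((x / x_scale).round(), -128, 127).to(torch.int8)
        # Step 3: stand-in for a custom int8 GEMM kernel (e.g. a Triton/CUDA kernel).
        out = qx.to(torch.float32) @ self.qweight.to(torch.float32).t()
        return out * x_scale * self.scale.t()
```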
raiseFileNotFoundError(f"PyTorch model file '{filename}' not found.\nPlease download the model from https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")