Commit 92539ed8 authored by gushiqiao

Update gradio and offload

parent 8e941d39
......@@ -109,6 +109,24 @@ def get_cpu_memory():
return available_bytes / 1024**3
def cleanup_memory():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
try:
if hasattr(psutil, "virtual_memory"):
if os.name == "posix":
try:
os.system("sync")
except: # noqa
pass
except: # noqa
pass
def generate_unique_filename(base_dir="./saved_videos"):
os.makedirs(base_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
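As an aside, a minimal sketch of how the new `cleanup_memory()` helper is meant to bracket a generation call, mirroring its use at the start and end of `run_inference` further down in this diff; the wrapper name and arguments below are illustrative and not part of the commit:

```python
# Hypothetical helper (not in the commit): bracket an inference call with
# cleanup_memory() so cached CUDA blocks are released before weights load
# and transient allocations are freed once the video has been written.
def generate_with_cleanup(run_fn, *args, **kwargs):
    cleanup_memory()
    try:
        return run_fn(*args, **kwargs)
    finally:
        cleanup_memory()
```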
......@@ -147,7 +165,6 @@ for op_name, is_installed in available_attn_ops:
def run_inference(
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -173,6 +190,8 @@ def run_inference(
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -181,6 +200,8 @@ def run_inference(
clean_cuda_cache,
image_path=None,
):
cleanup_memory()
quant_op = quant_op.split("(")[0].strip()
attention_type = attention_type.split("(")[0].strip()
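The two `split("(")` calls above strip the descriptive suffix that the UI dropdowns attach to each choice before the value is used as an operator name; a tiny illustration (the example label is an assumption about how the dropdown entries are formatted):

```python
# e.g. a dropdown entry such as "sage_attn2 (recommended)" becomes "sage_attn2"
label = "sage_attn2 (recommended)"
op_name = label.split("(")[0].strip()
assert op_name == "sage_attn2"
```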
......@@ -192,7 +213,7 @@ def run_inference(
model_config = json.load(f)
if task == "t2v":
if model_type == "Wan2.1 1.3B":
if model_size == "1.3b":
# 1.3B
coefficient = [
[
......@@ -287,6 +308,7 @@ def run_inference(
needs_reinit = (
lazy_load
or unload_modules
or global_runner is None
or current_config is None
or cur_dit_quant_scheme is None
......@@ -325,6 +347,8 @@ def run_inference(
if os.path.exists(os.path.join(dit_quantized_ckpt, "config.json")):
with open(os.path.join(dit_quantized_ckpt, "config.json"), "r") as f:
quant_model_config = json.load(f)
else:
quant_model_config = {}
else:
mm_type = "Default"
dit_quantized_ckpt = None
......@@ -355,6 +379,8 @@ def run_inference(
"coefficients": coefficient[0] if use_ret_steps else coefficient[1],
"use_ret_steps": use_ret_steps,
"teacache_thresh": teacache_thresh,
"t5_cpu_offload": t5_cpu_offload,
"unload_modules": unload_modules,
"t5_quantized": is_t5_quant,
"t5_quantized_ckpt": t5_quant_ckpt,
"t5_quant_scheme": t5_quant_scheme,
......@@ -425,15 +451,25 @@ def run_inference(
asyncio.run(runner.run_pipeline())
if lazy_load:
del runner
torch.cuda.empty_cache()
gc.collect()
del config, args, model_config, quant_model_config
if "dit_quantized_ckpt" in locals():
del dit_quantized_ckpt
if "t5_quant_ckpt" in locals():
del t5_quant_ckpt
if "clip_quant_ckpt" in locals():
del clip_quant_ckpt
cleanup_memory()
return save_video_path
def auto_configure(enable_auto_config, model_type, resolution):
def handle_lazy_load_change(lazy_load_enabled):
"""Handle lazy_load checkbox change to automatically enable unload_modules"""
return gr.update(value=lazy_load_enabled)
def auto_configure(enable_auto_config, resolution):
default_config = {
"torch_compile_val": False,
"lazy_load_val": False,
......@@ -443,6 +479,8 @@ def auto_configure(enable_auto_config, model_type, resolution):
"cpu_offload_val": False,
"offload_granularity_val": "block",
"offload_ratio_val": 1,
"t5_cpu_offload_val": False,
"unload_modules_val": False,
"t5_offload_granularity_val": "model",
"attention_type_val": attn_op_choices[0][1],
"quant_op_val": quant_op_choices[0][1],
......@@ -499,7 +537,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
else:
res = "480p"
if model_type in ["Wan2.1 14B"]:
if model_size == "14b":
is_14b = True
else:
is_14b = False
......@@ -507,13 +545,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "720p" and is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1, "t5_cpu_offload_val": True}),
(
24,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -524,6 +563,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -537,6 +577,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
12,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -552,6 +593,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
8,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -564,6 +606,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
......@@ -572,13 +615,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
elif is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -591,6 +635,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
(
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -600,6 +645,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"rotary_chunk_val": True,
"rotary_chunk_size_val": 10000,
"use_tiny_vae_val": True,
......@@ -607,6 +653,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "540p"
else {
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -616,6 +663,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
}
),
......@@ -623,7 +671,17 @@ def auto_configure(enable_auto_config, model_type, resolution):
]
else:
gpu_rules = {}
gpu_rules = [
(24, {}),
(
8,
{
"t5_cpu_offload_val": True,
"t5_offload_granularity_val": "block",
"t5_quant_scheme_val": quant_type,
},
),
]
if is_14b:
cpu_rules = [
......@@ -637,11 +695,22 @@ def auto_configure(enable_auto_config, model_type, resolution):
"t5_quant_scheme_val": quant_type,
"clip_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
},
),
]
else:
cpu_rules = {}
cpu_rules = [
(64, {}),
(
16,
{
"t5_quant_scheme_val": quant_type,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
]
for threshold, updates in gpu_rules:
if gpu_memory >= threshold:
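The `(threshold, updates)` tuples above are consumed by the loop that this hunk truncates; a sketch of the assumed selection logic (first rule whose memory threshold is met overrides the defaults) — the function name is illustrative, not from the repo:

```python
# Illustrative sketch of the auto-configuration rule matching assumed here:
# rules are ordered from largest to smallest threshold, and the first rule
# whose threshold fits the detected memory updates the default config.
def apply_memory_rules(rules, available_gb, config):
    for threshold, updates in rules:
        if available_gb >= threshold:
            config.update(updates)
            break
    return config

# e.g. apply_memory_rules(gpu_rules, gpu_memory, default_config)
```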
......@@ -680,20 +749,6 @@ def main():
with gr.Group():
gr.Markdown("## 📥 Input Parameters")
with gr.Row():
if task == "i2v":
model_type = gr.Dropdown(
choices=["Wan2.1 14B"],
value="Wan2.1 14B",
label="Model Type",
)
else:
model_type = gr.Dropdown(
choices=["Wan2.1 14B", "Wan2.1 1.3B"],
value="Wan2.1 14B",
label="Model Type",
)
if task == "i2v":
with gr.Row():
image_path = gr.Image(
......@@ -849,6 +904,11 @@ def main():
info="Controls the chunk size for applying rotary embeddings. Larger values may improve performance but increase memory usage. Only effective if 'rotary_chunk' is checked.",
)
unload_modules = gr.Checkbox(
label="Unload Modules",
value=False,
info="Unload modules (T5, CLIP, DIT, etc.) after inference to reduce GPU/CPU memory usage",
)
clean_cuda_cache = gr.Checkbox(
label="Clean CUDA Memory Cache",
value=False,
......@@ -883,6 +943,12 @@ def main():
value=1.0,
info="Controls how much of the Dit model is offloaded to the CPU",
)
t5_cpu_offload = gr.Checkbox(
label="T5 CPU Offloading",
value=False,
info="Offload the T5 Encoder model to CPU to reduce GPU memory usage",
)
t5_offload_granularity = gr.Dropdown(
label="T5 Encoder Offload Granularity",
choices=["model", "block"],
......@@ -971,7 +1037,7 @@ def main():
enable_auto_config.change(
fn=auto_configure,
inputs=[enable_auto_config, model_type, resolution],
inputs=[enable_auto_config, resolution],
outputs=[
torch_compile,
lazy_load,
......@@ -981,6 +1047,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -995,11 +1063,16 @@ def main():
use_ret_steps,
],
)
lazy_load.change(
fn=handle_lazy_load_change,
inputs=[lazy_load],
outputs=[unload_modules],
)
if task == "i2v":
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1025,6 +1098,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1039,7 +1114,6 @@ def main():
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1065,6 +1139,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1088,14 +1164,16 @@ if __name__ == "__main__":
default="wan2.1",
help="Model class to use",
)
parser.add_argument("--model_size", type=str, required=True, choices=["14b", "1.3b"], help="Model type to use")
parser.add_argument("--task", type=str, required=True, choices=["i2v", "t2v"], help="Specify the task type. 'i2v' for image-to-video translation, 't2v' for text-to-video generation.")
parser.add_argument("--server_port", type=int, default=7862, help="Server port")
parser.add_argument("--server_name", type=str, default="0.0.0.0", help="Server ip")
args = parser.parse_args()
global model_path, model_cls
global model_path, model_cls, model_size
model_path = args.model_path
model_cls = args.model_cls
model_size = args.model_size
task = args.task
main()
......@@ -109,6 +109,26 @@ def get_cpu_memory():
return available_bytes / 1024**3
def cleanup_memory():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
try:
import psutil
if hasattr(psutil, "virtual_memory"):
if os.name == "posix":
try:
os.system("sync")
except: # noqa
pass
except: # noqa
pass
def generate_unique_filename(base_dir="./saved_videos"):
os.makedirs(base_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
......@@ -147,7 +167,6 @@ for op_name, is_installed in available_attn_ops:
def run_inference(
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -173,6 +192,8 @@ def run_inference(
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -181,6 +202,8 @@ def run_inference(
clean_cuda_cache,
image_path=None,
):
cleanup_memory()
quant_op = quant_op.split("(")[0].strip()
attention_type = attention_type.split("(")[0].strip()
......@@ -192,7 +215,7 @@ def run_inference(
model_config = json.load(f)
if task == "t2v":
if model_type == "Wan2.1 1.3B":
if model_size == "1.3b":
# 1.3B
coefficient = [
[
......@@ -287,6 +310,7 @@ def run_inference(
needs_reinit = (
lazy_load
or unload_modules
or global_runner is None
or current_config is None
or cur_dit_quant_scheme is None
......@@ -325,6 +349,8 @@ def run_inference(
if os.path.exists(os.path.join(dit_quantized_ckpt, "config.json")):
with open(os.path.join(dit_quantized_ckpt, "config.json"), "r") as f:
quant_model_config = json.load(f)
else:
quant_model_config = {}
else:
mm_type = "Default"
dit_quantized_ckpt = None
......@@ -355,6 +381,8 @@ def run_inference(
"coefficients": coefficient[0] if use_ret_steps else coefficient[1],
"use_ret_steps": use_ret_steps,
"teacache_thresh": teacache_thresh,
"t5_cpu_offload": t5_cpu_offload,
"unload_modules": unload_modules,
"t5_quantized": is_t5_quant,
"t5_quantized_ckpt": t5_quant_ckpt,
"t5_quant_scheme": t5_quant_scheme,
......@@ -425,15 +453,25 @@ def run_inference(
asyncio.run(runner.run_pipeline())
if lazy_load:
del runner
torch.cuda.empty_cache()
gc.collect()
del config, args, model_config, quant_model_config
if "dit_quantized_ckpt" in locals():
del dit_quantized_ckpt
if "t5_quant_ckpt" in locals():
del t5_quant_ckpt
if "clip_quant_ckpt" in locals():
del clip_quant_ckpt
cleanup_memory()
return save_video_path
def auto_configure(enable_auto_config, model_type, resolution):
def handle_lazy_load_change(lazy_load_enabled):
"""Handle lazy_load checkbox change to automatically enable unload_modules"""
return gr.update(value=lazy_load_enabled)
def auto_configure(enable_auto_config, resolution):
default_config = {
"torch_compile_val": False,
"lazy_load_val": False,
......@@ -443,6 +481,8 @@ def auto_configure(enable_auto_config, model_type, resolution):
"cpu_offload_val": False,
"offload_granularity_val": "block",
"offload_ratio_val": 1,
"t5_cpu_offload_val": False,
"unload_modules_val": False,
"t5_offload_granularity_val": "model",
"attention_type_val": attn_op_choices[0][1],
"quant_op_val": quant_op_choices[0][1],
......@@ -499,7 +539,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
else:
res = "480p"
if model_type in ["Wan2.1 14B"]:
if model_size == "14b":
is_14b = True
else:
is_14b = False
......@@ -507,13 +547,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "720p" and is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1, "t5_cpu_offload_val": True}),
(
24,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -524,6 +565,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -537,6 +579,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
12,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -552,6 +595,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
8,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -564,6 +608,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
......@@ -572,13 +617,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
elif is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -591,6 +637,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
(
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -600,6 +647,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"rotary_chunk_val": True,
"rotary_chunk_size_val": 10000,
"use_tiny_vae_val": True,
......@@ -607,6 +655,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "540p"
else {
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -616,6 +665,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
}
),
......@@ -623,7 +673,17 @@ def auto_configure(enable_auto_config, model_type, resolution):
]
else:
gpu_rules = {}
gpu_rules = [
(24, {}),
(
8,
{
"t5_cpu_offload_val": True,
"t5_offload_granularity_val": "block",
"t5_quant_scheme_val": quant_type,
},
),
]
if is_14b:
cpu_rules = [
......@@ -637,11 +697,22 @@ def auto_configure(enable_auto_config, model_type, resolution):
"t5_quant_scheme_val": quant_type,
"clip_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
},
),
]
else:
cpu_rules = {}
cpu_rules = [
(64, {}),
(
16,
{
"t5_quant_scheme_val": quant_type,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
]
for threshold, updates in gpu_rules:
if gpu_memory >= threshold:
......@@ -680,20 +751,6 @@ def main():
with gr.Group():
gr.Markdown("## 📥 输入参数")
with gr.Row():
if task == "i2v":
model_type = gr.Dropdown(
choices=["Wan2.1 14B"],
value="Wan2.1 14B",
label="模型类型",
)
else:
model_type = gr.Dropdown(
choices=["Wan2.1 14B", "Wan2.1 1.3B"],
value="Wan2.1 14B",
label="模型类型",
)
if task == "i2v":
with gr.Row():
image_path = gr.Image(
......@@ -846,7 +903,11 @@ def main():
step=100,
info="控制应用旋转编码的块大小。较大的值可能提高性能但增加内存使用。仅在'rotary_chunk'勾选时有效。",
)
unload_modules = gr.Checkbox(
label="卸载模块",
value=False,
info="推理后卸载模块(T5、CLIP、DIT等)以减少GPU/CPU内存使用",
)
clean_cuda_cache = gr.Checkbox(
label="清理CUDA内存缓存",
value=False,
......@@ -881,6 +942,11 @@ def main():
value=1.0,
info="控制将多少Dit模型卸载到CPU",
)
t5_cpu_offload = gr.Checkbox(
label="T5 CPU卸载",
value=False,
info="将T5编码器模型卸载到CPU以减少GPU内存使用",
)
t5_offload_granularity = gr.Dropdown(
label="T5编码器卸载粒度",
choices=["model", "block"],
......@@ -969,7 +1035,7 @@ def main():
enable_auto_config.change(
fn=auto_configure,
inputs=[enable_auto_config, model_type, resolution],
inputs=[enable_auto_config, resolution],
outputs=[
torch_compile,
lazy_load,
......@@ -979,6 +1045,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -993,11 +1061,16 @@ def main():
use_ret_steps,
],
)
lazy_load.change(
fn=handle_lazy_load_change,
inputs=[lazy_load],
outputs=[unload_modules],
)
if task == "i2v":
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1023,6 +1096,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1037,7 +1112,6 @@ def main():
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1063,6 +1137,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1086,14 +1162,16 @@ if __name__ == "__main__":
default="wan2.1",
help="要使用的模型类别",
)
parser.add_argument("--model_size", type=str, required=True, choices=["14b", "1.3b"], help="模型大小:14b 或 1.3b")
parser.add_argument("--task", type=str, required=True, choices=["i2v", "t2v"], help="指定任务类型。'i2v'用于图像到视频转换,'t2v'用于文本到视频生成。")
parser.add_argument("--server_port", type=int, default=7862, help="服务器端口")
parser.add_argument("--server_name", type=str, default="0.0.0.0", help="服务器IP")
args = parser.parse_args()
global model_path, model_cls
global model_path, model_cls, model_size
model_path = args.model_path
model_cls = args.model_cls
model_size = args.model_size
task = args.task
main()
......@@ -15,16 +15,19 @@
# Lightx2v project root directory path
# Example: /home/user/lightx2v or /data/video_gen/lightx2v
lightx2v_path=/path/to/lightx2v
# Model path configuration
# Image-to-video model path (for i2v tasks)
# Example: /path/to/Wan2.1-I2V-14B-720P-Lightx2v
i2v_model_path=/path/to/Wan2.1-I2V-14B-720P-Lightx2v
i2v_model_path=/path/to/Wan2.1-I2V-14B-720P-Lightx2v-Step-Distill
# Text-to-video model path (for t2v tasks)
# Example: /path/to/Wan2.1-T2V-1.3B
t2v_model_path=/path/to/Wan2.1-T2V-1.3B
# Model size configuration
# Default model size (14b, 1.3b)
model_size="14b"
# Server configuration
server_name="0.0.0.0"
server_port=8032
......@@ -65,6 +68,10 @@ while [[ $# -gt 0 ]]; do
export CUDA_VISIBLE_DEVICES=$gpu_id
shift 2
;;
--model_size)
model_size="$2"
shift 2
;;
--help)
echo "🎬 Lightx2v Gradio Demo Startup Script"
echo "=========================================="
......@@ -79,6 +86,10 @@ while [[ $# -gt 0 ]]; do
echo " en: English interface"
echo " --port PORT Server port (default: 8032)"
echo " --gpu GPU_ID GPU device ID (default: 0)"
echo " --model_size MODEL_SIZE"
echo " Model size (default: 14b)"
echo " 14b: 14 billion parameters model"
echo " 1.3b: 1.3 billion parameters model"
echo " --help Show this help message"
echo ""
echo "🚀 Usage examples:"
......@@ -86,6 +97,8 @@ while [[ $# -gt 0 ]]; do
echo " $0 --task i2v --lang zh --port 8032 # Start with specified parameters"
echo " $0 --task t2v --lang en --port 7860 # Text-to-video with English interface"
echo " $0 --task i2v --gpu 1 --port 8032 # Use GPU 1"
echo " $0 --task t2v --model_size 1.3b # Use 1.3B model"
echo " $0 --task i2v --model_size 14b # Use 14B model"
echo ""
echo "📝 Notes:"
echo " - Edit script to configure model paths before first use"
......@@ -113,6 +126,12 @@ if [[ "$lang" != "zh" && "$lang" != "en" ]]; then
exit 1
fi
# Validate model size
if [[ "$model_size" != "14b" && "$model_size" != "1.3b" ]]; then
echo "Error: Model size must be '14b' or '1.3b'"
exit 1
fi
# Select model path based on task type
if [[ "$task" == "i2v" ]]; then
model_path=$i2v_model_path
......@@ -161,6 +180,7 @@ echo "=========================================="
echo "📁 Project path: $lightx2v_path"
echo "🤖 Model path: $model_path"
echo "🎯 Task type: $task"
echo "🤖 Model size: $model_size"
echo "🌏 Interface language: $lang"
echo "🖥️ GPU device: $gpu_id"
echo "🌐 Server address: $server_name:$server_port"
......@@ -190,7 +210,8 @@ python $demo_file \
--model_path "$model_path" \
--task "$task" \
--server_name "$server_name" \
--server_port "$server_port"
--server_port "$server_port" \
--model_size "$model_size"
# Display final system resource usage
echo ""
......
......@@ -11,6 +11,7 @@
"sample_shift": 5,
"enable_cfg": true,
"cpu_offload": true,
"t5_cpu_offload": true,
"offload_granularity": "block",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
......
{
"infer_steps": 4,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"t5_cpu_offload": true,
"t5_offload_granularity": "block",
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
"unload_modules": true,
"use_tiling_vae": true
}
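For reference, a hedged sketch of how a config file like the one above could be loaded and selectively overridden before building a runner; the file path and the override values are assumptions for illustration, not values taken from this commit:

```python
import json

# Hypothetical path; the commit shows the file contents, not its name.
with open("configs/offload/t2v_1_3b_t5_fp8_offload.json") as f:
    cfg = json.load(f)

# UI or CLI choices can then override individual offload switches, e.g.:
cfg.update({"t5_cpu_offload": True, "unload_modules": True})
print(cfg["t5_quant_scheme"])  # -> "fp8"
```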
......@@ -13,6 +13,7 @@
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "block",
"t5_cpu_offload": true,
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
"weight_auto_quant": true
......
......@@ -18,6 +18,7 @@
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm",
"weight_auto_quant": false
},
"t5_cpu_offload": true,
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
......
......@@ -18,6 +18,7 @@
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm",
"weight_auto_quant": false
},
"t5_cpu_offload": true,
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
......
......@@ -12,6 +12,7 @@
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "phase",
"t5_cpu_offload": true,
"t5_offload_granularity": "block",
"dit_quantized_ckpt": "/path/to/dit_int8",
"mm_config": {
......
......@@ -12,6 +12,7 @@
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": true,
"t5_cpu_offload": true,
"offload_granularity": "phase",
"dit_quantized_ckpt": "/path/to/dit_int8",
"mm_config": {
......
......@@ -15,7 +15,7 @@ This project contains two main demo files:
- Python 3.10+ (recommended)
- CUDA 12.4+ (recommended)
- At least 8GB GPU VRAM
- At least 16GB system memory
- At least 16GB system memory (preferably at least 32GB)
- At least 128GB of SSD storage (**💾 Strongly recommended: store model files on an SSD! With "lazy load" startup, this significantly improves model loading speed and inference performance**)
### Install Dependencies
......@@ -80,8 +80,9 @@ vim run_gradio.sh
bash run_gradio.sh
# 3. Or start with parameters (recommended)
bash run_gradio.sh --task i2v --lang en --port 8032
# bash run_gradio.sh --task t2v --lang en --port 8032
bash run_gradio.sh --task i2v --lang en --model_size 14b --port 8032
# bash run_gradio.sh --task t2v --lang en --model_size 14b --port 8032
# bash run_gradio.sh --task i2v --lang en --model_size 1.3b --port 8032
```
#### Method 2: Direct Command Line Startup
......@@ -90,6 +91,7 @@ bash run_gradio.sh --task i2v --lang en --port 8032
```bash
python gradio_demo.py \
--model_path /path/to/Wan2.1-I2V-14B-720P-Lightx2v \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -99,6 +101,7 @@ python gradio_demo.py \
```bash
python gradio_demo.py \
--model_path /path/to/Wan2.1-T2V-1.3B \
--model_size 1.3b \
--task t2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -108,6 +111,7 @@ python gradio_demo.py \
```bash
python gradio_demo_zh.py \
--model_path /path/to/model \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -119,6 +123,7 @@ python gradio_demo_zh.py \
|-----------|------|----------|---------|-------------|
| `--model_path` | str | ✅ | - | Model folder path |
| `--model_cls` | str | ❌ | wan2.1 | Model class (currently only supports wan2.1) |
| `--model_size` | str | ✅ | - | Model size: `14b` (t2v or i2v) or `1.3b` (t2v only) |
| `--task` | str | ✅ | - | Task type: `i2v` (image-to-video) or `t2v` (text-to-video) |
| `--server_port` | int | ❌ | 7862 | Server port |
| `--server_name` | str | ❌ | 0.0.0.0 | Server IP address |
......@@ -127,10 +132,6 @@ python gradio_demo_zh.py \
### Basic Settings
#### Model Type Selection
- **Wan2.1 14B**: Large parameter count, high generation quality, suitable for high-quality video generation
- **Wan2.1 1.3B**: Lightweight model, fast speed, suitable for rapid prototyping and testing
#### Input Parameters
- **Prompt**: Describe the expected video content
- **Negative Prompt**: Specify elements you don't want to appear
......@@ -217,7 +218,7 @@ lightx2v/app/
## 🎨 Interface Description
### Basic Settings Tab
- **Input Parameters**: Model type, prompts, resolution, and other basic settings
- **Input Parameters**: Prompts, resolution, and other basic settings
- **Video Parameters**: FPS, frame count, CFG, and other video generation parameters
- **Output Settings**: Video save path configuration
......
......@@ -15,7 +15,7 @@ Lightx2v 是一个轻量级的视频推理和生成引擎,提供了基于 Grad
- Python 3.10+ (推荐)
- CUDA 12.4+ (推荐)
- 至少 8GB GPU 显存
- 至少 16GB 系统内存
- 至少 16GB 系统内存(最好最少有 32G)
- 至少 128GB SSD固态硬盘 (**💾 强烈建议使用SSD固态硬盘存储模型文件!"延迟加载"启动时,显著提升模型加载速度和推理性能**)
......@@ -83,8 +83,9 @@ vim run_gradio.sh
bash run_gradio.sh
# 3. 或使用参数启动(推荐)
bash run_gradio.sh --task i2v --lang zh --port 8032
# bash run_gradio.sh --task t2v --lang zh --port 8032
bash run_gradio.sh --task i2v --lang zh --model_size 14b --port 8032
# bash run_gradio.sh --task t2v --lang zh --model_size 14b --port 8032
# bash run_gradio.sh --task i2v --lang zh --model_size 1.3b --port 8032
```
#### 方式二:直接命令行启动
......@@ -93,6 +94,7 @@ bash run_gradio.sh --task i2v --lang zh --port 8032
```bash
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-I2V-14B-720P-Lightx2v \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -102,6 +104,7 @@ python gradio_demo_zh.py \
```bash
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-T2V-1.3B \
--model_size 1.3b \
--task t2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -111,6 +114,7 @@ python gradio_demo_zh.py \
```bash
python gradio_demo.py \
--model_path /path/to/model \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -122,6 +126,7 @@ python gradio_demo.py \
|------|------|------|--------|------|
| `--model_path` | str | ✅ | - | 模型文件夹路径 |
| `--model_cls` | str | ❌ | wan2.1 | 模型类别(目前仅支持wan2.1) |
| `--model_size` | str | ✅ | - | 模型大小:`14b`(图像到视频或文本到视频)或 `1.3b`(仅文本到视频) |
| `--task` | str | ✅ | - | 任务类型:`i2v`(图像到视频)或 `t2v`(文本到视频) |
| `--server_port` | int | ❌ | 7862 | 服务器端口 |
| `--server_name` | str | ❌ | 0.0.0.0 | 服务器IP地址 |
......@@ -130,10 +135,6 @@ python gradio_demo.py \
### 基本设置
#### 模型类型选择
- **Wan2.1 14B**: 参数量大,生成质量高,适合高质量视频生成
- **Wan2.1 1.3B**: 轻量级模型,速度快,适合快速原型和测试
#### 输入参数
- **提示词 (Prompt)**: 描述期望的视频内容
- **负向提示词 (Negative Prompt)**: 指定不希望出现的元素
......@@ -221,7 +222,7 @@ lightx2v/app/
## 🎨 界面说明
### 基本设置标签页
- **输入参数**: 模型类型、提示词、分辨率等基本设置
- **输入参数**: 提示词、分辨率等基本设置
- **视频参数**: FPS、帧数、CFG等视频生成参数
- **输出设置**: 视频保存路径配置
......
......@@ -15,6 +15,7 @@ class WeightAsyncStreamManager(object):
self.cuda_load_stream = torch.cuda.Stream(priority=0)
self.offload_block_num = int(offload_ratio * blocks_num)
self.phases_num = phases_num
self.block_nums = blocks_num
self.offload_phases_num = blocks_num * phases_num * offload_ratio
def prefetch_weights(self, block_idx, blocks_weights):
......@@ -128,6 +129,9 @@ class LazyWeightAsyncStreamManager(WeightAsyncStreamManager):
if next_block_idx < 0:
next_block_idx = 0
if next_block_idx == self.block_nums:
return
if self.offload_gra == "phase":
for phase_idx in range(self.phases_num):
obj_key = (next_block_idx, phase_idx)
......@@ -170,6 +174,8 @@ class LazyWeightAsyncStreamManager(WeightAsyncStreamManager):
self.pin_memory_buffer.push(block_idx, block)
block_idx += 1
if block_idx == self.block_nums:
break
def prefetch_weights_from_disk(self, blocks):
if self.initial_prefetch_done:
......
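The `block_nums` bookkeeping and the early `return`/`break` added above guard against prefetching past the last transformer block; a small standalone sketch of that boundary check (the function name is illustrative):

```python
# Illustrative boundary check: with block_nums blocks indexed 0..block_nums-1,
# prefetching one block ahead must stop once the next index would run off the
# end, which is what the added `== self.block_nums` guards ensure.
def next_prefetch_index(block_idx: int, block_nums: int):
    nxt = block_idx + 1
    if nxt >= block_nums:
        return None  # nothing left to prefetch
    return nxt
```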
......@@ -56,3 +56,10 @@ class Conv2dWeight(Conv2dWeightTemplate):
if self.bias is not None:
destination[self.bias_name] = self.bias.cpu().detach().clone()
return destination
def clear(self):
attrs = ["weight", "bias"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
......@@ -66,3 +66,10 @@ class Conv3dWeight(Conv3dWeightTemplate):
if self.bias is not None:
destination[self.bias_name] = self.bias.cpu().detach().clone()
return destination
def clear(self):
attrs = ["weight", "bias"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
......@@ -34,9 +34,11 @@ class LNWeightTemplate(metaclass=ABCMeta):
return self.weight.numel() * self.weight.element_size()
def clear(self):
del self.weight
if self.bias is not None:
del self.bias
attrs = ["weight", "bias"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
@abstractmethod
def apply(self, input_tensor):
......
......@@ -23,7 +23,11 @@ class RMSWeightTemplate(metaclass=ABCMeta):
self.pinned_weight = torch.empty(self.weight.shape, pin_memory=True, dtype=self.weight.dtype)
def clear(self):
del self.weight
attrs = ["weight"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
@abstractmethod
def apply(self, input_tensor):
......
......@@ -22,7 +22,11 @@ class DefaultTensor:
self.pinned_tensor = torch.empty(self.tensor.shape, pin_memory=True, dtype=self.tensor.dtype)
def clear(self):
del self.tensor
attrs = ["tensor"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
def _calculate_size(self):
return self.tensor.numel() * self.tensor.element_size()
......
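The same `clear()` pattern is applied to each weight wrapper above; as a standalone sketch (the class name is illustrative, not from the repo), it deletes the attribute and rebinds it to `None`, so later `is None` checks keep working while the underlying tensor becomes eligible for garbage collection:

```python
class _WeightHolder:  # illustrative only
    def __init__(self, weight, bias=None):
        self.weight, self.bias = weight, bias

    def clear(self):
        # delattr drops the reference; setattr(None) keeps the attribute
        # present so code such as `if self.bias is not None:` still works.
        for attr in ("weight", "bias"):
            if hasattr(self, attr):
                delattr(self, attr)
            setattr(self, attr, None)
```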