Refactor Config System (#338)

04812de2 · Yang Yong (雍洋) · GitHub · 6a658f42
Unverified commit 04812de2, authored Sep 29, 2025 by Yang Yong (雍洋); committed by GitHub on Sep 29, 2025.
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,7 @@ RUN git clone https://github.com/sgl-project/sglang.git && cd sglang/sgl-kernel
    && make build && make clean
 RUN pip install --no-cache-dir diffusers transformers tokenizers accelerate safetensors opencv-python numpy imageio \
-    imageio-ffmpeg einops loguru qtorch ftfy easydict av
+    imageio-ffmpeg einops loguru qtorch ftfy av
 RUN conda install conda-forge::ffmpeg=8.0.0 -y && ln -s /opt/conda/bin/ffmpeg /usr/bin/ffmpeg && conda clean -all -y

--- a/Dockerfile_cu124
+++ b/Dockerfile_cu124
@@ -26,7 +26,7 @@ RUN git clone https://github.com/sgl-project/sglang.git && cd sglang/sgl-kernel
    && make build && make clean
 RUN pip install --no-cache-dir diffusers transformers tokenizers accelerate safetensors opencv-python numpy imageio \
-    imageio-ffmpeg einops loguru qtorch ftfy easydict
+    imageio-ffmpeg einops loguru qtorch ftfy
 RUN conda install conda-forge::ffmpeg=8.0.0 -y && ln -s /opt/conda/bin/ffmpeg /usr/bin/ffmpeg

--- a/app/gradio_demo.py
+++ b/app/gradio_demo.py
@@ -10,7 +10,6 @@ from datetime import datetime
 import gradio as gr
 import psutil
 import torch
-from easydict import EasyDict
 from loguru import logger
 logger.add(
@@ -258,7 +257,7 @@ for op_name, is_installed in available_attn_ops:
 def run_inference(
    prompt,
    negative_prompt,
-    save_video_path,
+    save_result_path,
    torch_compile,
    infer_steps,
    num_frames,
@@ -382,7 +381,7 @@ def run_inference(
                ],
            ]
-    save_video_path = generate_unique_filename(output_dir)
+    save_result_path = generate_unique_filename(output_dir)
    is_dit_quant = dit_quant_scheme != "bf16"
    is_t5_quant = t5_quant_scheme != "bf16"
@@ -519,11 +518,10 @@ def run_inference(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image_path=image_path,
-        save_video_path=save_video_path,
+        save_result_path=save_result_path,
    )
    config.update({k: v for k, v in vars(args).items()})
-    config = EasyDict(config)
    config.update(model_config)
    config.update(quant_model_config)
@@ -565,7 +563,7 @@ def run_inference(
    cleanup_memory()
-    return save_video_path
+    return save_result_path
 def handle_lazy_load_change(lazy_load_enabled):
@@ -1024,7 +1022,7 @@ def main():
                                info="Total number of frames in the video. More frames result in longer videos.",
                            )
-                        save_video_path = gr.Textbox(
+                        save_result_path = gr.Textbox(
                            label="Output Video Path",
                            value=generate_unique_filename(output_dir),
                            info="Must include .mp4 extension. If left blank or using the default value, a unique filename will be automatically generated.",
@@ -1234,7 +1232,7 @@ def main():
                inputs=[
                    prompt,
                    negative_prompt,
-                    save_video_path,
+                    save_result_path,
                    torch_compile,
                    infer_steps,
                    num_frames,
@@ -1275,7 +1273,7 @@ def main():
                inputs=[
                    prompt,
                    negative_prompt,
-                    save_video_path,
+                    save_result_path,
                    torch_compile,
                    infer_steps,
                    num_frames,

--- a/app/gradio_demo_zh.py
+++ b/app/gradio_demo_zh.py
@@ -10,7 +10,6 @@ from datetime import datetime
 import gradio as gr
 import psutil
 import torch
-from easydict import EasyDict
 from loguru import logger
 logger.add(
@@ -260,7 +259,7 @@ for op_name, is_installed in available_attn_ops:
 def run_inference(
    prompt,
    negative_prompt,
-    save_video_path,
+    save_result_path,
    torch_compile,
    infer_steps,
    num_frames,
@@ -384,7 +383,7 @@ def run_inference(
                ],
            ]
-    save_video_path = generate_unique_filename(output_dir)
+    save_result_path = generate_unique_filename(output_dir)
    is_dit_quant = dit_quant_scheme != "bf16"
    is_t5_quant = t5_quant_scheme != "bf16"
@@ -523,11 +522,10 @@ def run_inference(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image_path=image_path,
-        save_video_path=save_video_path,
+        save_result_path=save_result_path,
    )
    config.update({k: v for k, v in vars(args).items()})
-    config = EasyDict(config)
    config.update(model_config)
    config.update(quant_model_config)
@@ -569,7 +567,7 @@ def run_inference(
    cleanup_memory()
-    return save_video_path
+    return save_result_path
 def handle_lazy_load_change(lazy_load_enabled):
@@ -1028,7 +1026,7 @@ def main():
                                info="视频中的总帧数。更多帧数会产生更长的视频。",
                            )
-                        save_video_path = gr.Textbox(
+                        save_result_path = gr.Textbox(
                            label="输出视频路径",
                            value=generate_unique_filename(output_dir),
                            info="必须包含.mp4扩展名。如果留空或使用默认值，将自动生成唯一文件名。",
@@ -1236,7 +1234,7 @@ def main():
                inputs=[
                    prompt,
                    negative_prompt,
-                    save_video_path,
+                    save_result_path,
                    torch_compile,
                    infer_steps,
                    num_frames,
@@ -1277,7 +1275,7 @@ def main():
                inputs=[
                    prompt,
                    negative_prompt,
-                    save_video_path,
+                    save_result_path,
                    torch_compile,
                    infer_steps,
                    num_frames,

--- a/configs/attentions/wan_i2v_flash.json
+++ b/configs/attentions/wan_i2v_flash.json
@@ -6,7 +6,6 @@
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,

--- a/configs/attentions/wan_i2v_radial.json
+++ b/configs/attentions/wan_i2v_radial.json
@@ -6,7 +6,6 @@
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,

--- a/configs/attentions/wan_i2v_sage.json
+++ b/configs/attentions/wan_i2v_sage.json
@@ -6,7 +6,6 @@
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,

--- a/configs/attentions/wan_t2v_sparge.json
+++ b/configs/attentions/wan_t2v_sparge.json
@@ -7,7 +7,6 @@
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
-    "seed": 42,
    "sample_guide_scale": 6,
    "sample_shift": 8,
    "enable_cfg": true,

--- a/configs/bench/lightx2v_1.json
+++ b/configs/bench/lightx2v_1.json
 {
    "infer_steps": 40,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42, //1234
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,

--- a/configs/bench/lightx2v_2.json
+++ b/configs/bench/lightx2v_2.json
 {
    "infer_steps": 40,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42, //1234
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,

--- a/configs/bench/lightx2v_3.json
+++ b/configs/bench/lightx2v_3.json
 {
    "infer_steps": 40,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42, //1234
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,

--- a/configs/bench/lightx2v_3_distill.json
+++ b/configs/bench/lightx2v_3_distill.json
 {
    "infer_steps": 4,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42, //1234
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
-    "denoising_step_list": [1000, 750, 500, 250],
+    "denoising_step_list": [
+        1000,
+        750,
+        500,
+        250
+    ],
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl-ActVllm"
    },

--- a/configs/bench/lightx2v_4.json
+++ b/configs/bench/lightx2v_4.json
 {
    "infer_steps": 40,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42, //1234
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,
@@ -16,13 +15,21 @@
    },
    "feature_caching": "Tea",
    "coefficients": [
-        [2.57151496e05, -3.54229917e04, 1.40286849e03, -1.35890334e01, 1.32517977e-01],
+        [
-        [-3.02331670e02, 2.23948934e02, -5.25463970e01, 5.87348440e00, -2.01973289e-01]
+            2.57151496e05,
+            -3.54229917e04,
+            1.40286849e03,
+            -1.35890334e01,
+            1.32517977e-01
+        ],
+        [
+            -3.02331670e02,
+            2.23948934e02,
+            -5.25463970e01,
+            5.87348440e00,
+            -2.01973289e-01
+        ]
    ],
-    //  "coefficients": [
-    // [8.10705460e03, 2.13393892e03, -3.72934672e02, 1.66203073e01, -4.17769401e-02],
-    // [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
-    // ],
    "use_ret_steps": false,
    "teacache_thresh": 0.2,
    "use_tiling_vae": true

--- a/configs/bench/lightx2v_5.json
+++ b/configs/bench/lightx2v_5.json
 {
    "infer_steps": 40,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,
    "cpu_offload": true,
    "offload_granularity": "block",
-    "offload_ratio": 0.8, //1
+    "offload_ratio": 0.8,
    "t5_cpu_offload": true,
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"

--- a/configs/bench/lightx2v_5_distill.json
+++ b/configs/bench/lightx2v_5_distill.json
 {
    "infer_steps": 4,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": true,
    "offload_granularity": "block",
-    "offload_ratio": 0.8, //1
+    "offload_ratio": 0.8,
    "t5_cpu_offload": true,
-    "denoising_step_list": [1000, 750, 500, 250],
+    "denoising_step_list": [
+        1000,
+        750,
+        500,
+        250
+    ],
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
    },

--- a/configs/bench/lightx2v_6.json
+++ b/configs/bench/lightx2v_6.json
 {
    "infer_steps": 40,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,
    "cpu_offload": true,
    "offload_granularity": "block",
-    "offload_ratio": 0.8, //1
+    "offload_ratio": 0.8,
    "t5_cpu_offload": true,
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"

--- a/configs/bench/lightx2v_6_distill.json
+++ b/configs/bench/lightx2v_6_distill.json
 {
    "infer_steps": 4,
    "target_video_length": 81,
-    "target_height": 480, // 720
+    "target_height": 480,
-    "target_width": 832, // 1280
+    "target_width": 832,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
-    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": true,
    "offload_granularity": "block",
-    "offload_ratio": 0.8, //1
+    "offload_ratio": 0.8,
    "t5_cpu_offload": true,
-    "denoising_step_list": [1000, 750, 500, 250],
+    "denoising_step_list": [
+        1000,
+        750,
+        500,
+        250
+    ],
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
    },

--- a/configs/caching/adacache/wan_t2v_ada.json
+++ b/configs/caching/adacache/wan_t2v_ada.json
@@ -7,7 +7,6 @@
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
-    "seed": 42,
    "sample_guide_scale": 6,
    "sample_shift": 8,
    "enable_cfg": true,

--- a/configs/caching/custom/wan_i2v_custom_720p.json
+++ b/configs/caching/custom/wan_i2v_custom_720p.json
@@ -6,15 +6,26 @@
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
-    "seed": 442,
    "sample_guide_scale": 5,
    "sample_shift": 3,
    "enable_cfg": true,
    "cpu_offload": false,
    "feature_caching": "Custom",
    "coefficients": [
-        [8.10705460e03, 2.13393892e03, -3.72934672e02, 1.66203073e01, -4.17769401e-02],
+        [
-        [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
+            8.10705460e03,
+            2.13393892e03,
+            -3.72934672e02,
+            1.66203073e01,
+            -4.17769401e-02
+        ],
+        [
+            -114.36346466,
+            65.26524496,
+            -18.82220707,
+            4.91518089,
+            -0.23412683
+        ]
    ],
    "use_ret_steps": false,
    "teacache_thresh": 0.26

--- a/configs/caching/custom/wan_t2v_custom_14b.json
+++ b/configs/caching/custom/wan_t2v_custom_14b.json
@@ -7,15 +7,26 @@
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
-    "seed": 42,
    "sample_guide_scale": 6,
    "sample_shift": 8,
    "enable_cfg": true,
    "cpu_offload": false,
    "feature_caching": "Custom",
    "coefficients": [
-        [-3.03318725e05, 4.90537029e04, -2.65530556e03, 5.87365115e01, -3.15583525e-01],
+        [
-        [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
+            -3.03318725e05,
+            4.90537029e04,
+            -2.65530556e03,
+            5.87365115e01,
+            -3.15583525e-01
+        ],
+        [
+            -5784.54975374,
+            5449.50911966,
+            -1811.16591783,
+            256.27178429,
+            -13.02252404
+        ]
    ],
    "use_ret_steps": false,
    "teacache_thresh": 0.26