Unverified Commit 04812de2 authored by Yang Yong (雍洋), committed by GitHub

Refactor Config System (#338)

parent 6a658f42
@@ -9,25 +9,16 @@ from lightx2v.models.schedulers.wan.scheduler import WanScheduler
 class WanStepDistillScheduler(WanScheduler):
     def __init__(self, config):
         super().__init__(config)
-        self.denoising_step_list = config.denoising_step_list
+        self.denoising_step_list = config["denoising_step_list"]
         self.infer_steps = len(self.denoising_step_list)
-        self.sample_shift = self.config.sample_shift
+        self.sample_shift = self.config["sample_shift"]
         self.num_train_timesteps = 1000
         self.sigma_max = 1.0
         self.sigma_min = 0.0

-    def prepare(self, image_encoder_output):
-        self.generator = torch.Generator(device=self.device)
-        self.generator.manual_seed(self.config.seed)
-        self.prepare_latents(self.config.target_shape, dtype=torch.float32)
-        if self.config.task in ["t2v"]:
-            self.seq_len = math.ceil((self.config.target_shape[2] * self.config.target_shape[3]) / (self.config.patch_size[1] * self.config.patch_size[2]) * self.config.target_shape[1])
-        elif self.config.task in ["i2v"]:
-            self.seq_len = self.config.lat_h * self.config.lat_w // (self.config.patch_size[1] * self.config.patch_size[2]) * self.config.target_shape[1]
+    def prepare(self, seed, latent_shape, image_encoder_output=None):
+        self.prepare_latents(seed, latent_shape, dtype=torch.float32)
         self.set_denoising_timesteps(device=self.device)

     def set_denoising_timesteps(self, device: Union[str, torch.device] = None):
@@ -40,8 +31,8 @@ class WanStepDistillScheduler(WanScheduler):
         self.timesteps = self.timesteps[self.denoising_step_index].to(device)
         self.sigmas = self.sigmas[self.denoising_step_index].to("cpu")

-    def reset(self):
-        self.prepare_latents(self.config.target_shape, dtype=torch.float32)
+    def reset(self, seed, latent_shape, step_index=None):
+        self.prepare_latents(seed, latent_shape, dtype=torch.float32)

     def add_noise(self, original_samples, noise, sigma):
         sample = (1 - sigma) * original_samples + sigma * noise
@@ -61,7 +52,7 @@ class WanStepDistillScheduler(WanScheduler):
 class Wan22StepDistillScheduler(WanStepDistillScheduler):
     def __init__(self, config):
         super().__init__(config)
-        self.boundary_step_index = config.boundary_step_index
+        self.boundary_step_index = config["boundary_step_index"]

     def set_denoising_timesteps(self, device: Union[str, torch.device] = None):
         super().set_denoising_timesteps(device)
...
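The prepare/reset signature change moves the seed and latent shape out of config and into the call site. A minimal hedged sketch of the new calling convention (build_scheduler and the shape handling are illustrative stand-ins, not repository code):

    # Sketch only: `build_scheduler` stands in for however the pipeline
    # constructs WanStepDistillScheduler from a full config dict.
    def run_once(build_scheduler, seed: int, latent_shape: list):
        scheduler = build_scheduler()
        scheduler.prepare(seed, latent_shape)  # was: prepare(image_encoder_output), with seed read from config
        # ... denoising loop over scheduler.timesteps ...
        scheduler.reset(seed, latent_shape)    # re-draws the initial latents for the next run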
@@ -546,7 +546,6 @@ class WanVAE_(nn.Module):
             self.temperal_upsample,
             dropout,
         )
-        self.clear_cache()

     def forward(self, x):
         mu, log_var = self.encode(x)
...
@@ -30,6 +30,7 @@ class WanSFVAE:
         # init model
         self.model = _video_vae(pretrained_path=vae_pth, z_dim=z_dim, cpu_offload=cpu_offload, dtype=dtype, load_from_rank0=load_from_rank0).eval().requires_grad_(False).to(device).to(dtype)
+        self.model.clear_cache()

     def to_cpu(self):
         self.model.encoder = self.model.encoder.to("cpu")
...
@@ -114,7 +114,7 @@ class ApiServer:
             return TaskResponse(
                 task_id=task_id,
                 task_status="pending",
-                save_video_path=message.save_video_path,
+                save_result_path=message.save_result_path,
             )
         except RuntimeError as e:
             raise HTTPException(status_code=503, detail=str(e))
@@ -126,7 +126,7 @@ class ApiServer:
         async def create_task_form(
             image_file: UploadFile = File(...),
             prompt: str = Form(default=""),
-            save_video_path: str = Form(default=""),
+            save_result_path: str = Form(default=""),
             use_prompt_enhancer: bool = Form(default=False),
             negative_prompt: str = Form(default=""),
             num_fragments: int = Form(default=1),
@@ -166,7 +166,7 @@ class ApiServer:
                 negative_prompt=negative_prompt,
                 image_path=image_path,
                 num_fragments=num_fragments,
-                save_video_path=save_video_path,
+                save_result_path=save_result_path,
                 infer_steps=infer_steps,
                 target_video_length=target_video_length,
                 seed=seed,
@@ -183,7 +183,7 @@ class ApiServer:
             return TaskResponse(
                 task_id=task_id,
                 task_status="pending",
-                save_video_path=message.save_video_path,
+                save_result_path=message.save_result_path,
             )
         except RuntimeError as e:
             raise HTTPException(status_code=503, detail=str(e))
@@ -228,13 +228,13 @@ class ApiServer:
         if task_status.get("status") != TaskStatus.COMPLETED.value:
             raise HTTPException(status_code=404, detail="Task not completed")

-        save_video_path = task_status.get("save_video_path")
-        if not save_video_path:
+        save_result_path = task_status.get("save_result_path")
+        if not save_result_path:
             raise HTTPException(status_code=404, detail="Task result file does not exist")

-        full_path = Path(save_video_path)
+        full_path = Path(save_result_path)
         if not full_path.is_absolute():
-            full_path = self.file_service.output_video_dir / save_video_path
+            full_path = self.file_service.output_video_dir / save_result_path
         return self._stream_file_response(full_path)
@@ -364,7 +364,7 @@ class ApiServer:
             result = await self.video_service.generate_video_with_stop_event(message, task_info.stop_event)
             if result:
-                task_manager.complete_task(task_id, result.save_video_path)
+                task_manager.complete_task(task_id, result.save_result_path)
                 logger.info(f"Task {task_id} completed successfully")
             else:
                 if task_info.stop_event.is_set():
...
@@ -10,7 +10,7 @@ class TaskRequest(BaseModel):
     negative_prompt: str = Field("", description="Negative prompt")
     image_path: str = Field("", description="Base64 encoded image or URL")
     num_fragments: int = Field(1, description="Number of fragments")
-    save_video_path: str = Field("", description="Save video path (optional, defaults to task_id.mp4)")
+    save_result_path: str = Field("", description="Save video path (optional, defaults to task_id.mp4)")
     infer_steps: int = Field(5, description="Inference steps")
     target_video_length: int = Field(81, description="Target video length")
     seed: int = Field(42, description="Random seed")
@@ -19,8 +19,8 @@ class TaskRequest(BaseModel):
     def __init__(self, **data):
         super().__init__(**data)
-        if not self.save_video_path:
-            self.save_video_path = f"{self.task_id}.mp4"
+        if not self.save_result_path:
+            self.save_result_path = f"{self.task_id}.mp4"

     def get(self, key, default=None):
         return getattr(self, key, default)
@@ -33,7 +33,7 @@ class TaskStatusMessage(BaseModel):
 class TaskResponse(BaseModel):
     task_id: str
     task_status: str
-    save_video_path: str
+    save_result_path: str

 class StopTaskResponse(BaseModel):
...
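Clients that previously sent save_video_path must now send save_result_path, and TaskResponse is renamed the same way. A hedged request sketch (the endpoint URL and port are assumptions; the field names and defaults come from the TaskRequest model above):

    # Hypothetical client; only the field names are taken from the schema above.
    import requests

    payload = {
        "prompt": "a cat surfing a wave",  # illustrative
        "negative_prompt": "",
        "image_path": "",                  # Base64 encoded image or URL
        "num_fragments": 1,
        "save_result_path": "",            # empty -> server defaults to f"{task_id}.mp4"
        "infer_steps": 5,
        "target_video_length": 81,
        "seed": 42,
    }
    resp = requests.post("http://localhost:8000/v1/tasks", json=payload)  # path assumed
    print(resp.json()["save_result_path"])  # TaskResponse now carries save_result_path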
@@ -179,10 +179,10 @@ class FileService:
         return file_path

-    def get_output_path(self, save_video_path: str) -> Path:
-        video_path = Path(save_video_path)
+    def get_output_path(self, save_result_path: str) -> Path:
+        video_path = Path(save_result_path)
         if not video_path.is_absolute():
-            return self.output_video_dir / save_video_path
+            return self.output_video_dir / save_result_path
         return video_path

     async def cleanup(self):
@@ -260,7 +260,7 @@ class TorchrunInferenceWorker:
             return {
                 "task_id": task_data["task_id"],
                 "status": "success",
-                "save_video_path": task_data.get("video_path", task_data["save_video_path"]),
+                "save_result_path": task_data.get("video_path", task_data["save_result_path"]),
                 "message": "Inference completed",
             }
         else:
@@ -418,9 +418,9 @@ class VideoGenerationService:
             logger.info(f"Task {message.task_id} audio path: {task_data['audio_path']}")

-        actual_save_path = self.file_service.get_output_path(message.save_video_path)
-        task_data["save_video_path"] = str(actual_save_path)
-        task_data["video_path"] = message.save_video_path
+        actual_save_path = self.file_service.get_output_path(message.save_result_path)
+        task_data["save_result_path"] = str(actual_save_path)
+        task_data["video_path"] = message.save_result_path

         result = await self.inference_service.submit_task_async(task_data)
@@ -434,7 +434,7 @@ class VideoGenerationService:
             return TaskResponse(
                 task_id=message.task_id,
                 task_status="completed",
-                save_video_path=message.save_video_path,  # Return original path
+                save_result_path=message.save_result_path,  # Return original path
             )
         else:
             error_msg = result.get("error", "Inference failed")
...
@@ -25,7 +25,7 @@ class TaskInfo:
     start_time: datetime = field(default_factory=datetime.now)
     end_time: Optional[datetime] = None
     error: Optional[str] = None
-    save_video_path: Optional[str] = None
+    save_result_path: Optional[str] = None
     stop_event: threading.Event = field(default_factory=threading.Event)
     thread: Optional[threading.Thread] = None
@@ -54,7 +54,7 @@ class TaskManager:
             raise RuntimeError(f"Task queue is full (max {self.max_queue_size} tasks)")

         task_id = getattr(message, "task_id", str(uuid.uuid4()))
-        task_info = TaskInfo(task_id=task_id, status=TaskStatus.PENDING, message=message, save_video_path=getattr(message, "save_video_path", None))
+        task_info = TaskInfo(task_id=task_id, status=TaskStatus.PENDING, message=message, save_result_path=getattr(message, "save_result_path", None))
         self._tasks[task_id] = task_info
         self.total_tasks += 1
@@ -76,7 +76,7 @@ class TaskManager:
         return task

-    def complete_task(self, task_id: str, save_video_path: Optional[str] = None):
+    def complete_task(self, task_id: str, save_result_path: Optional[str] = None):
         with self._lock:
             if task_id not in self._tasks:
                 logger.warning(f"Task {task_id} not found for completion")
@@ -85,8 +85,8 @@ class TaskManager:
             task = self._tasks[task_id]
             task.status = TaskStatus.COMPLETED
             task.end_time = datetime.now()
-            if save_video_path:
-                task.save_video_path = save_video_path
+            if save_result_path:
+                task.save_result_path = save_result_path
             self.completed_tasks += 1
@@ -138,7 +138,7 @@ class TaskManager:
         if not task:
             return None
-        return {"task_id": task.task_id, "status": task.status.value, "start_time": task.start_time, "end_time": task.end_time, "error": task.error, "save_video_path": task.save_video_path}
+        return {"task_id": task.task_id, "status": task.status.value, "start_time": task.start_time, "end_time": task.end_time, "error": task.error, "save_result_path": task.save_result_path}

     def get_all_tasks(self):
         with self._lock:
...
@@ -56,12 +56,14 @@ def compiled_method(compile_options: Optional[Dict] = None):
         def _select_graph(graph_name: str):
             if graph_name not in state["compiled_graphs"]:
-                raise ValueError(f"Graph '{graph_name}' not found. Available graphs: {list(state['compiled_graphs'].keys())}")
-
-            logger.info(f"[Compile] Selecting graph '{graph_name}' for {func_name}")
-            state["selected_graph"] = graph_name
-            state["selected_compiled"] = state["compiled_graphs"][graph_name]
-            logger.info(f"[Compile] {func_name} will now use graph '{graph_name}' for inference")
+                logger.warning(f"[Compile] Graph '{graph_name}' not found. Available graphs: {list(state['compiled_graphs'].keys())}, returning to original function.")
+                state["selected_graph"] = None
+                state["selected_compiled"] = None
+            else:
+                logger.info(f"[Compile] Selecting graph '{graph_name}' for {func_name}")
+                state["selected_graph"] = graph_name
+                state["selected_compiled"] = state["compiled_graphs"][graph_name]
+                logger.info(f"[Compile] {func_name} will now use graph '{graph_name}' for inference")

         def _unselect_graph():
             logger.info(f"[Compile] Unselecting graph for {func_name}, returning to original function")
...
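Selecting a missing graph is now a soft failure: the selection state is cleared and subsequent calls fall back to the original (eager) function instead of raising ValueError. A standalone hedged sketch of this select-or-fallback pattern (names and callables are illustrative, not the decorator's real internals):

    compiled_graphs = {"short_seq": lambda x: x * 2}  # stand-in for compiled callables
    state = {"selected_compiled": None}

    def select_graph(name):
        if name not in compiled_graphs:
            print(f"graph '{name}' not found; falling back to the original function")
            state["selected_compiled"] = None  # cleared instead of raising
        else:
            state["selected_compiled"] = compiled_graphs[name]

    def run(x):
        fn = state["selected_compiled"]
        return fn(x) if fn is not None else x * 2  # eager path when nothing is selected

    select_graph("long_seq")  # previously raised ValueError; now only warns
    assert run(3) == 6

# new file: lightx2v/utils/input_info.py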
import inspect
from dataclasses import dataclass, field
@dataclass
class T2VInputInfo:
seed: int = field(default_factory=int)
prompt: str = field(default_factory=str)
prompt_enhanced: str = field(default_factory=str)
negative_prompt: str = field(default_factory=str)
save_result_path: str = field(default_factory=str)
return_result_tensor: bool = field(default_factory=lambda: False)
# shape related
latent_shape: list = field(default_factory=list)
target_shape: int = field(default_factory=int)
@dataclass
class I2VInputInfo:
seed: int = field(default_factory=int)
prompt: str = field(default_factory=str)
prompt_enhanced: str = field(default_factory=str)
negative_prompt: str = field(default_factory=str)
image_path: str = field(default_factory=str)
save_result_path: str = field(default_factory=str)
return_result_tensor: bool = field(default_factory=lambda: False)
# shape related
original_shape: list = field(default_factory=list)
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: int = field(default_factory=int)
@dataclass
class Flf2vInputInfo:
seed: int = field(default_factory=int)
prompt: str = field(default_factory=str)
prompt_enhanced: str = field(default_factory=str)
negative_prompt: str = field(default_factory=str)
image_path: str = field(default_factory=str)
last_frame_path: str = field(default_factory=str)
save_result_path: str = field(default_factory=str)
return_result_tensor: bool = field(default_factory=lambda: False)
# shape related
original_shape: list = field(default_factory=list)
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: int = field(default_factory=int)
# Need Check
@dataclass
class VaceInputInfo:
seed: int = field(default_factory=int)
prompt: str = field(default_factory=str)
prompt_enhanced: str = field(default_factory=str)
negative_prompt: str = field(default_factory=str)
src_ref_images: str = field(default_factory=str)
src_video: str = field(default_factory=str)
src_mask: str = field(default_factory=str)
save_result_path: str = field(default_factory=str)
return_result_tensor: bool = field(default_factory=lambda: False)
# shape related
original_shape: list = field(default_factory=list)
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: int = field(default_factory=int)
@dataclass
class S2VInputInfo:
seed: int = field(default_factory=int)
prompt: str = field(default_factory=str)
prompt_enhanced: str = field(default_factory=str)
negative_prompt: str = field(default_factory=str)
image_path: str = field(default_factory=str)
audio_path: str = field(default_factory=str)
audio_num: int = field(default_factory=int)
with_mask: bool = field(default_factory=lambda: False)
save_result_path: str = field(default_factory=str)
return_result_tensor: bool = field(default_factory=lambda: False)
# shape related
original_shape: list = field(default_factory=list)
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: int = field(default_factory=int)
# Need Check
@dataclass
class AnimateInputInfo:
seed: int = field(default_factory=int)
prompt: str = field(default_factory=str)
prompt_enhanced: str = field(default_factory=str)
negative_prompt: str = field(default_factory=str)
image_path: str = field(default_factory=str)
save_result_path: str = field(default_factory=str)
return_result_tensor: bool = field(default_factory=lambda: False)
# shape related
original_shape: list = field(default_factory=list)
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: int = field(default_factory=int)
def set_input_info(args):
if args.task == "t2v":
input_info = T2VInputInfo(
seed=args.seed,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
save_result_path=args.save_result_path,
return_result_tensor=args.return_result_tensor,
)
elif args.task == "i2v":
input_info = I2VInputInfo(
seed=args.seed,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
image_path=args.image_path,
save_result_path=args.save_result_path,
return_result_tensor=args.return_result_tensor,
)
elif args.task == "flf2v":
input_info = Flf2vInputInfo(
seed=args.seed,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
image_path=args.image_path,
last_frame_path=args.last_frame_path,
save_result_path=args.save_result_path,
return_result_tensor=args.return_result_tensor,
)
elif args.task == "vace":
input_info = VaceInputInfo(
seed=args.seed,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
src_ref_images=args.src_ref_images,
src_video=args.src_video,
src_mask=args.src_mask,
save_result_path=args.save_result_path,
return_result_tensor=args.return_result_tensor,
)
elif args.task == "s2v":
input_info = S2VInputInfo(
seed=args.seed,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
image_path=args.image_path,
audio_path=args.audio_path,
save_result_path=args.save_result_path,
return_result_tensor=args.return_result_tensor,
)
elif args.task == "animate":
input_info = AnimateInputInfo(
seed=args.seed,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
image_path=args.image_path,
save_result_path=args.save_result_path,
return_result_tensor=args.return_result_tensor,
)
else:
raise ValueError(f"Unsupported task: {args.task}")
return input_info
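set_input_info copies only the request-level arguments; the shape fields (original_shape, latent_shape, target_shape, ...) keep their dataclass defaults until the pipeline fills them in. A hedged usage sketch with an argparse-style namespace (values illustrative):

    from types import SimpleNamespace

    # Illustrative args; the real namespace comes from lightx2v.infer's parser.
    args = SimpleNamespace(
        task="t2v",
        seed=42,
        prompt="a cat surfing a wave",
        negative_prompt="",
        save_result_path="save_results/demo.mp4",
        return_result_tensor=False,
    )
    info = set_input_info(args)
    assert isinstance(info, T2VInputInfo)
    assert info.latent_shape == []  # shape fields are filled in later by the pipeline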
def get_all_input_info_keys():
all_keys = set()
current_module = inspect.currentframe().f_globals
for name, obj in current_module.items():
if inspect.isclass(obj) and name.endswith("InputInfo") and hasattr(obj, "__dataclass_fields__"):
all_keys.update(obj.__dataclass_fields__.keys())
return all_keys
# Build the set of all field names defined across the *InputInfo dataclasses
ALL_INPUT_INFO_KEYS = get_all_input_info_keys()
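ALL_INPUT_INFO_KEYS is what set_config (in config.py below) uses to keep per-request fields out of the global config. A small hedged sketch of that filtering (the config keys shown are illustrative):

    args_dict = {"model_path": "/models/wan", "seed": 42, "prompt": "hi"}
    config_updates = {k: v for k, v in args_dict.items() if k not in ALL_INPUT_INFO_KEYS}
    # seed and prompt are InputInfo fields, so only model_path survives
    assert list(config_updates) == ["model_path"]

# new file: lightx2v/utils/lockable_dict.py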
from contextlib import contextmanager
from typing import Any, Iterable, Mapping
class LockableDict(dict):
"""
A lockable/unlockable dictionary. After locking, any in-place modifications will raise TypeError.
By default auto_wrap=True, which recursively converts nested dict objects in dict/list/tuple/set
to LockableDict, so that recursive locking works consistently both internally and externally.
"""
def __init__(self, *args, auto_wrap: bool = True, **kwargs):
self._locked: bool = False
self._auto_wrap: bool = auto_wrap
# Build with temporary dict, then wrap uniformly before writing to self, avoiding bypass of __setitem__
tmp = dict(*args, **kwargs)
for k, v in tmp.items():
dict.__setitem__(self, k, self._wrap(v))
# ========== Public API ==========
@property
def locked(self) -> bool:
return self._locked
def lock(self, recursive: bool = True) -> None:
"""Lock the dictionary. When recursive=True, also recursively locks nested LockableDict objects."""
self._locked = True
if recursive:
for v in self.values():
if isinstance(v, LockableDict):
v.lock(True)
def unlock(self, recursive: bool = True) -> None:
"""Unlock the dictionary. When recursive=True, also recursively unlocks nested LockableDict objects."""
self._locked = False
if recursive:
for v in self.values():
if isinstance(v, LockableDict):
v.unlock(True)
@contextmanager
def temporarily_unlocked(self, recursive: bool = True):
"""
Temporarily unlock in context manager form, restoring original state on exit.
Typical usage:
with d.temporarily_unlocked():
d["x"] = 1
"""
prev = self._locked
if prev and recursive:
# First temporarily unlock all child nodes as well
stack: list[LockableDict] = []
def _collect(node: "LockableDict"):
for v in node.values():
if isinstance(v, LockableDict):
stack.append(v)
_collect(v)
_collect(self)
self._locked = False
for n in stack:
n._locked = False
try:
yield self
finally:
self._locked = prev
for n in stack:
n._locked = prev
else:
self._locked = False
try:
yield self
finally:
self._locked = prev
def copy(self) -> "LockableDict":
new = LockableDict(auto_wrap=self._auto_wrap)
for k, v in self.items():
dict.__setitem__(new, k, v)
new._locked = self._locked
return new
# ========== In-place modification interception ==========
def __setitem__(self, key, value) -> None:
self._ensure_unlocked()
dict.__setitem__(self, key, self._wrap(value))
def __delitem__(self, key) -> None:
self._ensure_unlocked()
dict.__delitem__(self, key)
def clear(self) -> None:
self._ensure_unlocked()
dict.clear(self)
def pop(self, k, d: Any = ...):
self._ensure_unlocked()
if d is ...:
return dict.pop(self, k)
return dict.pop(self, k, d)
def popitem(self):
self._ensure_unlocked()
return dict.popitem(self)
def setdefault(self, key, default=None):
# If key doesn't exist, setdefault will write, need to check lock
if key not in self:
self._ensure_unlocked()
default = self._wrap(default)
return dict.setdefault(self, key, default)
def update(self, other: Mapping | Iterable, **kwargs) -> None:
self._ensure_unlocked()
if isinstance(other, Mapping):
items = list(other.items())
else:
items = list(other)
for k, v in items:
dict.__setitem__(self, k, self._wrap(v))
for k, v in kwargs.items():
dict.__setitem__(self, k, self._wrap(v))
# Python 3.9 in-place union: d |= x
def __ior__(self, other):
self.update(other)
return self
# ========== Internal utilities ==========
def _ensure_unlocked(self) -> None:
if self._locked:
raise TypeError("Dictionary is locked, current operation not allowed.")
def _wrap(self, value):
if not self._auto_wrap:
return value
if isinstance(value, LockableDict):
return value
if isinstance(value, dict):
return LockableDict(value, auto_wrap=True)
if isinstance(value, list):
return [self._wrap(v) for v in value]
if isinstance(value, tuple):
return tuple(self._wrap(v) for v in value)
if isinstance(value, set):
return {self._wrap(v) for v in value}
return value
if __name__ == "__main__":
d = LockableDict({"a": 1, "b": 2})
d["b"] = 3
print(d)
d.lock()
print(d)
# d["a"] = 3
# print(d)
# d.unlock()
# print(d)
# d["a"] = 3
# print(d)
with d.temporarily_unlocked():
d["a"] = 3
print(d)
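# the lock state saved on entry is restored here, so this final write raises TypeError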
d["a"] = 4
@@ -2,10 +2,12 @@ import json
 import os

 import torch.distributed as dist
-from easydict import EasyDict
 from loguru import logger
 from torch.distributed.tensor.device_mesh import init_device_mesh

+from lightx2v.utils.input_info import ALL_INPUT_INFO_KEYS
+from lightx2v.utils.lockable_dict import LockableDict

 def get_default_config():
     default_config = {
@@ -26,93 +28,71 @@ def get_default_config():
         "cfg_parallel": False,
         "enable_cfg": False,
         "use_image_encoder": True,
-        "lat_h": None,
-        "lat_w": None,
-        "tgt_h": None,
-        "tgt_w": None,
-        "target_shape": None,
-        "return_video": False,
-        "audio_num": None,
-        "person_num": None,
     }
+    default_config = LockableDict(default_config)
     return default_config
 def set_config(args):
+    assert not (args.save_result_path and args.return_result_tensor), "save_result_path and return_result_tensor cannot be set at the same time"
     config = get_default_config()
-    config.update({k: v for k, v in vars(args).items()})
-    config = EasyDict(config)
+    config.update({k: v for k, v in vars(args).items() if k not in ALL_INPUT_INFO_KEYS})

-    with open(config.config_json, "r") as f:
+    with open(config["config_json"], "r") as f:
         config_json = json.load(f)
     config.update(config_json)

-    if os.path.exists(os.path.join(config.model_path, "config.json")):
-        with open(os.path.join(config.model_path, "config.json"), "r") as f:
+    if os.path.exists(os.path.join(config["model_path"], "config.json")):
+        with open(os.path.join(config["model_path"], "config.json"), "r") as f:
             model_config = json.load(f)
         config.update(model_config)
-    elif os.path.exists(os.path.join(config.model_path, "low_noise_model", "config.json")):  # TODO: needs a more elegant update method
-        with open(os.path.join(config.model_path, "low_noise_model", "config.json"), "r") as f:
+    elif os.path.exists(os.path.join(config["model_path"], "low_noise_model", "config.json")):  # TODO: needs a more elegant update method
+        with open(os.path.join(config["model_path"], "low_noise_model", "config.json"), "r") as f:
             model_config = json.load(f)
         config.update(model_config)
-    elif os.path.exists(os.path.join(config.model_path, "distill_models", "low_noise_model", "config.json")):  # TODO: needs a more elegant update method
-        with open(os.path.join(config.model_path, "distill_models", "low_noise_model", "config.json"), "r") as f:
+    elif os.path.exists(os.path.join(config["model_path"], "distill_models", "low_noise_model", "config.json")):  # TODO: needs a more elegant update method
+        with open(os.path.join(config["model_path"], "distill_models", "low_noise_model", "config.json"), "r") as f:
             model_config = json.load(f)
         config.update(model_config)
-    elif os.path.exists(os.path.join(config.model_path, "original", "config.json")):
-        with open(os.path.join(config.model_path, "original", "config.json"), "r") as f:
+    elif os.path.exists(os.path.join(config["model_path"], "original", "config.json")):
+        with open(os.path.join(config["model_path"], "original", "config.json"), "r") as f:
             model_config = json.load(f)
         config.update(model_config)

     # load quantized config
     if config.get("dit_quantized_ckpt", None) is not None:
-        config_path = os.path.join(config.dit_quantized_ckpt, "config.json")
+        config_path = os.path.join(config["dit_quantized_ckpt"], "config.json")
         if os.path.exists(config_path):
             with open(config_path, "r") as f:
                 model_config = json.load(f)
             config.update(model_config)

-    if config.task == "i2v":
-        if config.target_video_length % config.vae_stride[0] != 1:
-            logger.warning(f"`num_frames - 1` has to be divisible by {config.vae_stride[0]}. Rounding to the nearest number.")
-            config.target_video_length = config.target_video_length // config.vae_stride[0] * config.vae_stride[0] + 1
-        if config.audio_path:
-            if os.path.isdir(config.audio_path):
-                logger.info(f"audio_path is a directory, loading config.json from {config.audio_path}")
-                audio_config_path = os.path.join(config.audio_path, "config.json")
-                assert os.path.exists(audio_config_path), "config.json not found in audio_path"
-                with open(audio_config_path, "r") as f:
-                    audio_config = json.load(f)
-                for talk_object in audio_config["talk_objects"]:
-                    talk_object["audio"] = os.path.join(config.audio_path, talk_object["audio"])
-                    talk_object["mask"] = os.path.join(config.audio_path, talk_object["mask"])
-                config.update(audio_config)
-            else:
-                logger.info(f"audio_path is a file: {config.audio_path}")
-
-    assert not (config.save_video_path and config.return_video), "save_video_path and return_video cannot be set at the same time"
+    if config["task"] in ["i2v", "s2v"]:
+        if config["target_video_length"] % config["vae_stride"][0] != 1:
+            logger.warning(f"`num_frames - 1` has to be divisible by {config['vae_stride'][0]}. Rounding to the nearest number.")
+            config["target_video_length"] = config["target_video_length"] // config["vae_stride"][0] * config["vae_stride"][0] + 1

     return config
 def set_parallel_config(config):
-    if config.parallel:
-        cfg_p_size = config.parallel.get("cfg_p_size", 1)
-        seq_p_size = config.parallel.get("seq_p_size", 1)
+    if config["parallel"]:
+        cfg_p_size = config["parallel"].get("cfg_p_size", 1)
+        seq_p_size = config["parallel"].get("seq_p_size", 1)
         assert cfg_p_size * seq_p_size == dist.get_world_size(), f"cfg_p_size * seq_p_size must be equal to world_size"
         config["device_mesh"] = init_device_mesh("cuda", (cfg_p_size, seq_p_size), mesh_dim_names=("cfg_p", "seq_p"))
-    if config.parallel and config.parallel.get("seq_p_size", False) and config.parallel.seq_p_size > 1:
+    if config["parallel"] and config["parallel"].get("seq_p_size", False) and config["parallel"]["seq_p_size"] > 1:
         config["seq_parallel"] = True
-    if config.get("enable_cfg", False) and config.parallel and config.parallel.get("cfg_p_size", False) and config.parallel.cfg_p_size > 1:
+    if config.get("enable_cfg", False) and config["parallel"] and config["parallel"].get("cfg_p_size", False) and config["parallel"]["cfg_p_size"] > 1:
         config["cfg_parallel"] = True

 def print_config(config):
     config_to_print = config.copy()
     config_to_print.pop("device_mesh", None)
-    if config.parallel:
+    if config["parallel"]:
         if dist.get_rank() == 0:
             logger.info(f"config:\n{json.dumps(config_to_print, ensure_ascii=False, indent=4)}")
         else:
...
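Taken together: config is now a plain-dict LockableDict instead of an EasyDict, so all access is config["key"] rather than config.key, per-request fields live on the InputInfo dataclasses, and the dict can be locked against stray mutation. A hedged sketch of the resulting pattern (the keys are illustrative, and the diff itself does not show where lock() is invoked):

    config = get_default_config()  # now a LockableDict
    config.update({"model_path": "/models/wan", "task": "t2v"})  # illustrative keys
    print(config["task"])          # dict-style access replaces config.task
    config.lock()                  # once locked, accidental writes fail fast
    try:
        config["task"] = "i2v"
    except TypeError:
        pass                       # mutation must go through temporarily_unlocked()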
@@ -304,13 +304,13 @@ def find_torch_model_path(config, ckpt_config_key=None, filename=None, subdir=["
         return config.get(ckpt_config_key)

     paths_to_check = [
-        os.path.join(config.model_path, filename),
+        os.path.join(config["model_path"], filename),
     ]
     if isinstance(subdir, list):
         for sub in subdir:
-            paths_to_check.insert(0, os.path.join(config.model_path, sub, filename))
+            paths_to_check.insert(0, os.path.join(config["model_path"], sub, filename))
     else:
-        paths_to_check.insert(0, os.path.join(config.model_path, subdir, filename))
+        paths_to_check.insert(0, os.path.join(config["model_path"], subdir, filename))

     for path in paths_to_check:
         if os.path.exists(path):
...
@@ -17,7 +17,6 @@ loguru
 sgl-kernel
 qtorch
 ftfy
-easydict
 gradio
 aiohttp
 pydantic
@@ -31,3 +30,4 @@ requests
 alibabacloud_dypnsapi20170525==1.2.2
 redis==6.4.0
 tos
+decord
@@ -13,7 +13,6 @@ einops
 loguru
 qtorch
 ftfy
-easydict
 gradio
 aiohttp
 pydantic
@@ -37,4 +37,4 @@ python -m lightx2v.infer \
     --prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
-    --save_video_path ${lightx2v_path}/save_results/lightx2v_1.mp4
+    --save_result_path ${lightx2v_path}/save_results/lightx2v_1.mp4
@@ -37,4 +37,4 @@ python -m lightx2v.infer \
     --prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
-    --save_video_path ${lightx2v_path}/save_results/lightx2v_2.mp4
+    --save_result_path ${lightx2v_path}/save_results/lightx2v_2.mp4
@@ -37,4 +37,4 @@ python -m lightx2v.infer \
     --prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
-    --save_video_path ${lightx2v_path}/save_results/lightx2v_3.mp4
+    --save_result_path ${lightx2v_path}/save_results/lightx2v_3.mp4
@@ -37,4 +37,4 @@ python -m lightx2v.infer \
     --prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
-    --save_video_path ${lightx2v_path}/save_results/lightx2v_3_distill.mp4
+    --save_result_path ${lightx2v_path}/save_results/lightx2v_3_distill.mp4
@@ -37,4 +37,4 @@ python -m lightx2v.infer \
     --prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
-    --save_video_path ${lightx2v_path}/save_results/lightx2v_4.mp4
+    --save_result_path ${lightx2v_path}/save_results/lightx2v_4.mp4
@@ -37,4 +37,4 @@ python -m lightx2v.infer \
     --prompt "A close-up cinematic view of a person cooking in a warm,sunlit kitchen, using a wooden spatula to stir-fry a colorful mix of freshvegetables—carrots, broccoli, and bell peppers—in a black frying pan on amodern induction stove. The scene captures the glistening texture of thevegetables, steam gently rising, and subtle reflections on the stove surface.In the background, soft-focus jars, fruits, and a window with natural daylightcreate a cozy atmosphere. The hand motions are smooth and rhythmic, with a realisticsense of motion blur and lighting." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_2.jpg \
-    --save_video_path ${lightx2v_path}/save_results/lightx2v_5.mp4
+    --save_result_path ${lightx2v_path}/save_results/lightx2v_5.mp4