import argparse from typing import List, Optional, Tuple import PIL.Image import gradio as gr import moviepy.editor as mp import numpy as np import torch from ovis.model.modeling_ovis import Ovis model: Ovis = None def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[List[PIL.Image.Image]]: """Extract a fixed number of frames from the video file.""" if not video_path: return None try: with mp.VideoFileClip(video_path) as clip: duration = clip.duration if duration is None or clip.fps is None or duration <= 0 or clip.fps <= 0: print(f"Warning: Unable to process video {video_path}. Invalid duration or fps.") return None total_possible_frames = int(duration * clip.fps) num_to_extract = min(n_frames, total_possible_frames) if num_to_extract <= 0: print(f"Warning: Cannot extract frames from {video_path}. Computed extractable frames is zero.") return None frames = [] timestamps = np.linspace(0, duration, num_to_extract, endpoint=True) for t in timestamps: frame_np = clip.get_frame(t) frames.append(PIL.Image.fromarray(frame_np)) print(f"Successfully extracted {len(frames)} frames from {video_path}.") return frames except Exception as e: print(f"Error processing video {video_path}: {e}") return None def run_single_model( image_input: Optional[PIL.Image.Image], video_input: Optional[str], prompt: str, do_sample: bool, max_new_tokens: int, enable_thinking: bool ) -> str: """Run single model inference.""" if not prompt and not image_input and not video_input: gr.Warning("Please enter a prompt, upload an image, or upload a video.") return "" # Prepare vision inputs images = [image_input] if image_input else None video_frames = load_video_frames(video_input) videos = [video_frames] if video_frames else None # Construct full prompt with placeholders visual_placeholders = ('\n' * len(images) if images else "") + ('