Commit 441126fe authored by luopl

"Initial commit"

icon.png (50.3 KB)

# Unique model identifier
modelCode=1855
# Model name
modelName=Ovis2.5_pytorch
# Model description
modelDescription=Ovis2.5 is designed for native-resolution visual perception and enhanced multimodal reasoning. It delivers leading performance on image reasoning, video understanding, and grounding benchmarks, demonstrating strong general-purpose multimodal capabilities.
# Process type
processType=Inference
# Algorithm category
appCategory=Multimodal
# Framework type
frameType=pytorch
# Accelerator type
accelerateType=BW1000
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Suppress a FutureWarning emitted by torch.utils.checkpoint in torch==2.4.0
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.utils.checkpoint", lineno=1399)
from transformers import AutoConfig, AutoModel
from .vit.modeling_siglip2_navit import Siglip2NavitModel
from .vit.configuration_siglip2_navit import Siglip2NavitConfig
# Register the custom vision tower so AutoConfig / AutoModel can resolve "siglip2_navit".
AutoConfig.register('siglip2_navit', Siglip2NavitConfig)
AutoModel.register(Siglip2NavitConfig, Siglip2NavitModel)
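# --- Usage sketch (illustrative): once the registration above has been imported,
# the vision tower resolves through the standard Auto* factories. ---
if __name__ == "__main__":
    cfg = AutoConfig.for_model("siglip2_navit")  # resolves to Siglip2NavitConfig
    vit = AutoModel.from_config(cfg)             # resolves to Siglip2NavitModel
    print(type(cfg).__name__, type(vit).__name__)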
from typing import Union, Optional
from transformers import PretrainedConfig, Qwen3Config
from . import Siglip2NavitConfig
class OvisConfig(PretrainedConfig):
model_type = "ovis"
sub_configs = dict(llm_config=Qwen3Config, vit_config=Siglip2NavitConfig)
def __init__(self,
llm_config: Optional[Union[Qwen3Config, dict]] = None,
vit_config: Optional[Union[Siglip2NavitConfig, dict]] = None,
visual_vocab_size=65536,
hidden_size=None,
conversation_formatter_class=None,
**kwargs
):
super().__init__(**kwargs)
if isinstance(llm_config, dict):
llm_config = Qwen3Config(**llm_config)
self.llm_config = llm_config
if isinstance(vit_config, dict):
vit_config = Siglip2NavitConfig(**vit_config)
self.vit_config = vit_config
self.visual_vocab_size = visual_vocab_size
self.hidden_size = hidden_size
self.conversation_formatter_class = conversation_formatter_class
if kwargs.get('attn_implementation'):
self.llm_config._attn_implementation = kwargs['attn_implementation']
self.vit_config._attn_implementation = kwargs['attn_implementation']
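# --- Usage sketch (illustrative values): sub-configs may be passed as plain dicts and
# are promoted to Qwen3Config / Siglip2NavitConfig; `attn_implementation` is propagated
# to both sub-configs. ---
if __name__ == "__main__":
    example_config = OvisConfig(
        llm_config={"hidden_size": 64, "num_hidden_layers": 2, "num_attention_heads": 2},
        vit_config={"hidden_size": 64, "num_hidden_layers": 2, "num_attention_heads": 2},
        visual_vocab_size=65536,
        attn_implementation="sdpa",
    )
    print(type(example_config.llm_config).__name__, type(example_config.vit_config).__name__)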
import copy
from abc import ABC, abstractmethod
from typing import List, Dict
from ovis.util.constants import IMAGE_TOKEN_ID, IGNORE_ID, IMAGE_TOKEN, VIDEO_TOKEN_ID, VIDEO_TOKEN
class ConversationFormatter(ABC):
support_tokenizer_types = None
def __init__(self, tokenizer):
tokenizer_type = type(tokenizer).__name__
assert tokenizer_type in self.support_tokenizer_types, \
f'Invalid tokenizer type, expected one of `{self.support_tokenizer_types}`, but got `{tokenizer_type}`'
self.tokenizer = tokenizer
self.image_token = IMAGE_TOKEN
self.image_token_id = IMAGE_TOKEN_ID
self.ignore_id = IGNORE_ID
self.im_end = None
self.video_token = VIDEO_TOKEN
self.video_token_id = VIDEO_TOKEN_ID
def _tokenize_with_image_symbol(self, text):
if text.find(self.video_token) != -1:
token = self.video_token
token_id = self.video_token_id
else:
token = self.image_token
token_id = self.image_token_id
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
text.split(token)]
token_ids = []
num_chunks = len(text_chunks)
for i, chunk in enumerate(text_chunks):
token_ids.extend(chunk)
if i < num_chunks - 1:
token_ids.append(token_id)
return token_ids
@abstractmethod
def format(self, conversations: List[Dict], generation_preface=None, enable_thinking=False):
pass
@abstractmethod
def format_query(self, query, generation_preface=""):
pass
class Qwen3ConversationFormatter(ConversationFormatter):
support_tokenizer_types = ['QWenTokenizer', 'Qwen2TokenizerFast']
def __init__(self, tokenizer):
super().__init__(tokenizer)
self.from2role = {
"system": "<|im_start|>system\n",
"human": "<|im_start|>user\n",
"gpt": "<|im_start|>assistant\n",
"ignored_gpt": "<|im_start|>assistant\n",
}
self.im_end = "<|im_end|>\n"
self.empty_think = "<think>\n\n</think>\n\n"
self.gpt_token_nums = None
def _initialize_gpt_token_nums(self) -> Dict[str, int]:
think_prefix = self.from2role["gpt"]
think_num = len(
self.tokenizer(think_prefix, add_special_tokens=False).input_ids
)
no_think_prefix = self.from2role["gpt"] + self.empty_think
no_think_num = len(
self.tokenizer(no_think_prefix, add_special_tokens=False).input_ids
)
return {'think': think_num, 'no_think': no_think_num}
# enable_thinking is deprecated
def format(self, conversations: List[Dict], generation_preface=None, enable_thinking=False):
conversations = copy.deepcopy(conversations)
if generation_preface is not None:
conversations.append({
"from": "gpt",
"value": generation_preface
})
prompt = ""
input_ids = []
labels = []
num_conversation = len(conversations)
for i, conversation in enumerate(conversations):
frm = conversation["from"]
role = self.from2role[frm]
message = conversation["value"]
has_thinking = '<think>' in message and '</think>' in message
if frm == 'gpt' and not has_thinking and generation_preface is None:
text = role + self.empty_think + message
else:
text = role + message
if self.gpt_token_nums is None:
self.gpt_token_nums = self._initialize_gpt_token_nums()
gpt_token_num = self.gpt_token_nums['think'] if has_thinking else self.gpt_token_nums['no_think']
if i < num_conversation - 1 or generation_preface is None:
text += self.im_end
prompt += text
token_ids = self._tokenize_with_image_symbol(text)
input_ids.extend(token_ids)
label_ids = [self.ignore_id] * len(token_ids)
if frm == "gpt" and generation_preface is None:
# learning `\n` following `im_end` is meaningless, so the last `\n` token is ignored in label
label_ids[gpt_token_num:-1] = token_ids[gpt_token_num:-1]
labels.extend(label_ids)
assert self._tokenize_with_image_symbol(prompt) == input_ids
assert len(input_ids) == len(labels)
if conversations[-1]['from'] == "gpt" and generation_preface is None:
# remove the last `\n` following `im_end` in input_ids
input_ids.pop()
labels.pop()
return prompt, input_ids, labels
def format_query(self, query, generation_preface="", enable_thinking=False):
prompt, input_ids, _ = self.format([{
"from": "human",
"value": query
}], generation_preface=generation_preface, enable_thinking=enable_thinking)
return prompt, input_ids
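# --- Usage sketch (illustrative): building supervised labels for one conversation.
# Assumes the released checkpoint ships a Qwen2TokenizerFast; running this downloads
# the tokenizer from the Hugging Face Hub. ---
if __name__ == "__main__":
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("AIDC-AI/Ovis2.5-9B", trust_remote_code=True)
    formatter = Qwen3ConversationFormatter(tokenizer)
    prompt, input_ids, labels = formatter.format([
        {"from": "human", "value": "<image>\nWhat is in the picture?"},
        {"from": "gpt", "value": "A cat."},
    ])
    # User-turn tokens are masked with IGNORE_ID; only the assistant turn
    # (prefixed here with an empty <think> block) is supervised.
    print(prompt)
    print(len(input_ids), len(labels))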
This diff is collapsed.
from typing import Any, Optional
from transformers.configuration_utils import PretrainedConfig
class Siglip2NavitConfig(PretrainedConfig):
"""This is the configuration class to store the configuration of an [`Siglip2Navit`].
Args:
hidden_size: Dimension of the hidden representations.
intermediate_size: Dimension of the SwiGLU representations.
num_hidden_layers: Number of hidden layers in the Transformer.
num_attention_heads: Number of attention heads for each attention layer
in the Transformer.
num_channels: Number of input channels.
image_size: Image size.
patch_size: Patch size.
rms_norm_eps: Epsilon value used for the RMS normalization layer.
attention_dropout: Dropout ratio for attention probabilities.
projection_dropout: Dropout ratio for the projection layer after the attention.
qkv_bias: Whether to add a bias to the queries, keys and values.
use_bias: Whether to add a bias in the feed-forward and projection layers.
kwargs: Keyword arguments for the [`PretrainedConfig`].
"""
model_type: str = "siglip2_navit"
def __init__(
self,
hidden_size: int = 1024,
intermediate_size: int = 4096,
num_hidden_layers: int = 24,
num_attention_heads: int = 16,
num_channels: int = 3,
num_patches: int = -1,
image_size: int = 512,
patch_size: int = 16,
hidden_act: str="gelu_pytorch_tanh",
layer_norm_eps: float = 1e-6,
attention_dropout: float = 0.0,
hidden_stride: int = 2,
window_size: int = 112,
fullatt_block_indexes: Optional[list] = None,
temporal_patch_size: int = 1,
preserve_original_pe: bool = True,
use_rope: bool = True,
**kwargs: Any,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.num_patches = num_patches
self.patch_size = patch_size
self.image_size = image_size
self.hidden_act = hidden_act
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_stride = hidden_stride
self.window_size = window_size
self.fullatt_block_indexes = fullatt_block_indexes
self.temporal_patch_size = temporal_patch_size
self.preserve_original_pe = preserve_original_pe
self.use_rope = use_rope
__all__ = ["Siglip2NavitConfig"]
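# --- Usage sketch (illustrative values, not the released checkpoint's settings):
# a smaller config for quick experiments; blocks listed in `fullatt_block_indexes`
# presumably use full attention while the remaining blocks use windowed attention
# with `window_size`. ---
if __name__ == "__main__":
    tiny_cfg = Siglip2NavitConfig(
        hidden_size=256,
        intermediate_size=1024,
        num_hidden_layers=4,
        num_attention_heads=4,
        fullatt_block_indexes=[3],
    )
    print(tiny_cfg.model_type, tiny_cfg.hidden_size)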
This diff is collapsed.
import torch
from PIL import Image
from ovis.model.modeling_ovis import Ovis
# If you need video support, make sure moviepy is installed first:
# pip install moviepy==1.0.3
try:
from moviepy.editor import VideoFileClip # type: ignore
_HAS_MOVIEPY = True
except Exception:
_HAS_MOVIEPY = False
def run_single_image_example(model: Ovis, image_path: str) -> None:
"""
Run an inference example with a single image input.
"""
print("--- 1) Single-image example ---")
images = [Image.open(image_path).convert("RGB")]
prompt = "<image>\nDescribe this image in detail."
print(f"Prompt:\n{prompt}")
response, _, _ = model.chat(
prompt=prompt,
images=images,
min_pixels=448 * 448,
max_pixels=1792 * 1792,
videos=None,
do_sample=True,
max_new_tokens=1024,
)
print(f"\nResponse:\n{response}")
def run_multi_image_example(model: Ovis, image_paths: list) -> None:
"""
Run an inference example with multiple image inputs.
"""
print("--- 2) Multi-image example ---")
images = [Image.open(p).convert("RGB") for p in image_paths]
prompt = "<image>\n<image>\n<image>\nWhat is the relationship between the third image and the first two?"
print(f"Prompt:\n{prompt}")
response, _, _ = model.chat(
prompt=prompt,
images=images,
min_pixels=448 * 448,
max_pixels=896 * 896,
videos=None,
do_sample=True,
max_new_tokens=1024,
)
print(f"\nResponse:\n{response}")
def run_video_example(model: Ovis, video_path: str, num_frames: int = 8) -> None:
"""
Run an inference example with a video input.
"""
if not _HAS_MOVIEPY:
raise ImportError(
"moviepy is not installed. Install it with `pip install moviepy==1.0.3` to use video examples."
)
print("--- 3) Video example ---")
with VideoFileClip(video_path) as clip:
total_frames = int(clip.fps * clip.duration)
indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
frames = [
Image.fromarray(clip.get_frame(t)) for t in (index / clip.fps for index in indices)
]
videos = [frames]
prompt = "<video>\nDescribe this video in detail."
print(f"Prompt:\n{prompt}")
response, _, _ = model.chat(
prompt=prompt,
images=None,
videos=videos,
min_pixels=448 * 448,
max_pixels=896 * 896,
do_sample=True,
max_new_tokens=1024,
)
print(f"\nResponse:\n{response}")
def run_text_only_example(model: Ovis) -> None:
"""
Run an inference example with text-only input.
"""
print("--- 4) Text-only example ---")
prompt = "Hi, please introduce Huangshan (Yellow Mountain) in Chinese."
print(f"Prompt:\n{prompt}")
response, _, _ = model.chat(
prompt=prompt,
images=None,
videos=None,
do_sample=True,
max_new_tokens=1024,
)
print(f"\nResponse:\n{response}")
if __name__ == "__main__":
# --- 1) Load model ---
model_path = "AIDC-AI/Ovis2.5-9B"
print("Loading model, please wait...")
model = (
Ovis.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
device_map="cuda:0",
).eval()
)
print("Model loaded.")
print("\n========================================\n")
# --- 2) Define file paths (anonymized placeholders) ---
# Replace the following with your own paths
single_image_file = "/path/to/image1.jpg"
multi_image_files = [
"/path/to/image1.jpg",
"/path/to/image2.jpg",
"/path/to/image3.png",
]
video_file = "/path/to/video1.mp4"
# --- 3) Run examples ---
run_single_image_example(model, single_image_file)
print("\n========================================\n")
run_multi_image_example(model, multi_image_files)
print("\n========================================\n")
run_video_example(model, video_file)
print("\n========================================\n")
run_text_only_example(model)
print("\n========================================\n")
import torch
from PIL import Image
from ovis.model.modeling_ovis import Ovis
MODEL_PATH = "AIDC-AI/Ovis2.5-9B"
# Enable reflective reasoning mode (thinking mode)
enable_thinking = True
# Total tokens = thinking phase + response
max_new_tokens = 3072
# thinking_budget: upper bound on tokens reserved for the thinking phase.
# - If provided, the model stops thinking once the budget is reached and then
#   switches to generating the final response.
# - If omitted from the .chat() call, no budget is enforced and the model may
#   spend all of max_new_tokens on thinking.
thinking_budget = 2048
# Load model
model = Ovis.from_pretrained(
MODEL_PATH,
torch_dtype=torch.bfloat16,
device_map="cuda"
).eval()
prompt = "<image>\nDescribe this image in detail."
images = [Image.open("/path/to/image1.jpg")]
# Run chat
response, thinking, _ = model.chat(
prompt=prompt,
images=images,
history=None,
do_sample=True,
max_new_tokens=max_new_tokens,
enable_thinking=enable_thinking,
thinking_budget=thinking_budget, # omit this arg => unlimited thinking
)
# Print results
if enable_thinking and thinking:
print("=== Thinking ===")
print(thinking)
print("\n=== Response ===")
print(response)
else:
print("Response:", response)
import argparse
from typing import List, Optional, Tuple
import PIL.Image
import gradio as gr
import moviepy.editor as mp
import numpy as np
import torch
from ovis.model.modeling_ovis import Ovis
model: Optional[Ovis] = None
def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[List[PIL.Image.Image]]:
"""Extract a fixed number of frames from the video file."""
if not video_path:
return None
try:
with mp.VideoFileClip(video_path) as clip:
duration = clip.duration
if duration is None or clip.fps is None or duration <= 0 or clip.fps <= 0:
print(f"Warning: Unable to process video {video_path}. Invalid duration or fps.")
return None
total_possible_frames = int(duration * clip.fps)
num_to_extract = min(n_frames, total_possible_frames)
if num_to_extract <= 0:
print(f"Warning: Cannot extract frames from {video_path}. Computed extractable frames is zero.")
return None
frames = []
timestamps = np.linspace(0, duration, num_to_extract, endpoint=True)
for t in timestamps:
frame_np = clip.get_frame(t)
frames.append(PIL.Image.fromarray(frame_np))
print(f"Successfully extracted {len(frames)} frames from {video_path}.")
return frames
except Exception as e:
print(f"Error processing video {video_path}: {e}")
return None
def run_single_model(
image_input: Optional[PIL.Image.Image],
video_input: Optional[str],
prompt: str,
do_sample: bool,
max_new_tokens: int,
enable_thinking: bool
) -> str:
"""Run single model inference."""
if not prompt and not image_input and not video_input:
gr.Warning("Please enter a prompt, upload an image, or upload a video.")
return ""
# Prepare vision inputs
images = [image_input] if image_input else None
video_frames = load_video_frames(video_input)
videos = [video_frames] if video_frames else None
# Construct full prompt with placeholders
visual_placeholders = ('<image>\n' * len(images) if images else "") + ('<video>\n' if videos else "")
full_prompt = visual_placeholders + prompt
# Call model chat method
response, thinking, _ = model.chat(
prompt=full_prompt,
history=None, # Always start a new conversation
images=images,
videos=videos,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
enable_thinking=enable_thinking,
)
# Format output
if enable_thinking and thinking:
return f"**Thinking:**\n```text\n{thinking}\n```\n\n**Response:**\n{response}"
return response
def toggle_media_input(choice: str) -> Tuple[gr.update, gr.update]:
"""Toggle visibility of image and video input components."""
if choice == "Image":
return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
else:
return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
def clear_interface() -> Tuple[str, None, None, str, str]:
"""Reset all inputs and outputs."""
return "", None, None, "", "Image"
def start_generation() -> Tuple[gr.update, gr.update, gr.update]:
"""Update UI status when generation starts."""
return (
gr.update(value="⏳ Generating...", interactive=False),
gr.update(interactive=False),
gr.update(value="⏳ Model is generating...")
)
def finish_generation() -> Tuple[gr.update, gr.update]:
"""Restore UI status after generation ends."""
return gr.update(value="Generate", interactive=True), gr.update(interactive=True)
def build_demo(model_path: str, gpu: int):
"""Build single-model Gradio demo interface."""
global model
device = f"cuda:{gpu}"
print(f"Loading model {model_path} to device {device}...")
model = Ovis.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=device).eval()
print("Model loaded successfully.")
custom_css = "#output_md .prose { font-size: 18px !important; }"
with gr.Blocks(theme=gr.themes.Default(), css=custom_css) as demo:
gr.Markdown("# Multimodal Large Language Model Interface")
gr.Markdown(f"Running on **GPU {gpu}**. Each submission starts a new conversation.")
with gr.Row():
# Left column - inputs
with gr.Column(scale=1):
gr.Markdown("### Inputs")
input_type_radio = gr.Radio(
choices=["Image", "Video"], value="Image", label="Select Input Type"
)
image_input = gr.Image(label="Image Input", type="pil", visible=True, height=400)
video_input = gr.Video(label="Video Input", visible=False)
prompt_input = gr.Textbox(
label="Prompt", placeholder="Enter your prompt here... (Press Enter to submit)", lines=3
)
with gr.Accordion("Generation Settings", open=True):
do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=False)
max_new_tokens = gr.Slider(
minimum=32, maximum=2048, value=1024, step=32, label="Max New Tokens"
)
enable_thinking = gr.Checkbox(label="Deep Thinking", value=False)
with gr.Row():
clear_btn = gr.Button("Clear", variant="secondary", scale=1)
generate_btn = gr.Button("Generate", variant="primary", scale=2)
# Right column - output
with gr.Column(scale=2):
model_name = model_path.split('/')[-1]
gr.Markdown(f"### Model Output\n`{model_name}`")
output_display = gr.Markdown(elem_id="output_md")
# Event handlers
input_type_radio.change(
fn=toggle_media_input,
inputs=input_type_radio,
outputs=[image_input, video_input]
)
run_inputs = [image_input, video_input, prompt_input, do_sample, max_new_tokens, enable_thinking]
generate_btn.click(
fn=start_generation,
outputs=[generate_btn, clear_btn, output_display]
).then(
fn=run_single_model,
inputs=run_inputs,
outputs=[output_display]
).then(
fn=finish_generation,
outputs=[generate_btn, clear_btn]
)
prompt_input.submit(
fn=start_generation,
outputs=[generate_btn, clear_btn, output_display]
).then(
fn=run_single_model,
inputs=run_inputs,
outputs=[output_display]
).then(
fn=finish_generation,
outputs=[generate_btn, clear_btn]
)
clear_btn.click(
fn=clear_interface,
outputs=[output_display, image_input, video_input, prompt_input, input_type_radio]
).then(
fn=toggle_media_input,
inputs=input_type_radio,
outputs=[image_input, video_input]
)
return demo
def parse_args():
parser = argparse.ArgumentParser(description="Gradio interface for Ovis.")
parser.add_argument("--model-path", type=str)
parser.add_argument("--gpu", type=int, default=0, help="GPU index to run the model.")
parser.add_argument("--port", type=int, default=9901, help="Port to run the Gradio service.")
parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Server name for Gradio app.")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
demo = build_demo(
model_path=args.model_path,
gpu=args.gpu
)
print(f"Launching Gradio app at http://{args.server_name}:{args.port}")
demo.queue().launch(
server_name=args.server_name,
server_port=args.port,
share=False,
ssl_verify=False
)
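# --- Launch sketch (the script name is illustrative): ---
#   python ovis_gradio_demo.py --model-path AIDC-AI/Ovis2.5-9B --gpu 0 --port 9901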
from dataclasses import dataclass, field
from typing import Optional
import transformers
from ovis.util.utils import rankN_print
@dataclass
class ModelArguments:
llm_name_or_path: Optional[str] = field(default=None)
vit_name_or_path: Optional[str] = field(default=None)
visual_vocab_size: int = field(default=65536)
conversation_formatter_class: Optional[str] = field(default=None)
attn_implementation: Optional[str] = field(default=None)
accepts_loss_kwargs: bool = field(default=True)
vit_hidden_stride: int = field(default=2)
vit_window_size: int = field(default=112)
vit_temporal_patch_size: int = field(default=1)
vit_fullatt_block_indexes: Optional[str] = field(default=None)
vit_preserve_original_pe: Optional[bool] = field(default=True)
vit_use_rope: Optional[bool] = field(default=True)
@dataclass
class TrainingArguments(transformers.TrainingArguments):
data_info_version: Optional[str] = field(default=None)
data_name: Optional[str] = field(default=None)  # dataset names joined with "|", e.g. "a|b|c"
data_type: Optional[str] = field(default=None)  # e.g. "caption" or "conversation"
ovis_pretrained_path: Optional[str] = field(default=None)
stage: Optional[int] = field(default=None)
train_modules: Optional[str] = field(default=None)
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
save_safetensors: bool = field(default=True)
monitor_step: int = field(default=100)
model_init_seed: int = field(default=0)
multimodal_max_length: int = field(default=4096)
text_max_length: Optional[int] = field(default=4096)
min_frames: int = field(default=8)
max_frames: int = field(default=8)
overall_ratio: Optional[str] = field(default=None)
mix_data_name: Optional[str] = field(default=None)
mix_ratio: Optional[float] = field(default=None)
min_lr_rate: Optional[float] = field(default=None)
single_image_min_pixels: int = field(default=448*448)
single_image_max_pixels: int = field(default=1792*1344)
multiple_image_min_pixels: int = field(default=448*448)
multiple_image_max_pixels: int = field(default=448*448)
video_min_pixels: int = field(default=448*448)
video_max_pixels: int = field(default=448*448)
def __post_init__(self):
if self.min_lr_rate is not None:
self.lr_scheduler_kwargs = {
"min_lr_rate": self.min_lr_rate
}
if self.gradient_checkpointing:
self.gradient_checkpointing_kwargs = {"use_reentrant": False}
if self.stage is not None and self.stage < 3:
self.save_safetensors = False
super().__post_init__()
assert self.model_init_seed != self.seed, "`model_init_seed` should be different from `seed`"
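# --- Usage sketch (illustrative): parsing these dataclasses from the command line.
# This mirrors the standard transformers pattern; the repository's actual training
# entry point is not part of this commit. ---
if __name__ == "__main__":
    from transformers import HfArgumentParser
    parser = HfArgumentParser((ModelArguments, TrainingArguments))
    model_args, training_args = parser.parse_args_into_dataclasses()
    rankN_print(model_args)
    rankN_print(training_args)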
import gc
import time
import deepspeed
import torch
import torch.distributed as dist
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from ovis.util.constants import END_LINE, BEGIN_LINE
from ovis.util.utils import rankN_print
class TuneTauCallback(TrainerCallback):
def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
visual_tokenizer = kwargs['model'].get_visual_tokenizer()
current_step = state.global_step
max_step = state.max_steps
ratio = current_step / max_step
visual_tokenizer.config.tau = args.visual_max_tau - (args.visual_max_tau - args.visual_min_tau) * ratio
class MonitorCallback(TrainerCallback):
def _monitoring(self, model, step):
with torch.no_grad():
with deepspeed.zero.GatheredParameters(model.get_monitor_tensors().values()):
for k, v in model.get_monitor_tensors().items():
rankN_print(BEGIN_LINE)
rankN_print(f'{k} @ step {step} with sum: {v.sum().item()} and content: ')
rankN_print(v)
rankN_print(END_LINE)
def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
model = kwargs['model']
step = state.global_step
if step % args.monitor_step == 0 or step == 10: # monitor at step 10 for fast check
self._monitoring(model, step)
def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
model = kwargs['model']
step = state.global_step
self._monitoring(model, step)
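# --- Usage sketch (illustrative): attaching the callbacks to a Trainer. The training
# entry point is not included in this commit; TuneTauCallback additionally expects
# `visual_max_tau` / `visual_min_tau` fields on the training arguments. ---
def build_trainer_with_monitoring(model, training_args, train_dataset):
    from transformers import Trainer
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        callbacks=[MonitorCallback()],
    )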