# inference-mov.py

import torch
import requests
from PIL import Image
from modelscope import AutoModelForCausalLM
# With moviepy 1.x (pip install moviepy==1.0.3) use instead: from moviepy.editor import VideoFileClip

from moviepy import VideoFileClip

# Thinking mode & budget.
# The budget switch is only effective while enable_thinking is True.
enable_thinking, enable_thinking_budget = True, True

# thinking_budget caps the thinking part; max_new_tokens is the total for
# thinking + answer. Ensure: max_new_tokens > thinking_budget + 25
thinking_budget = 1024
max_new_tokens = 2048

# Options forwarded to from_pretrained(): bfloat16 dtype, the checkpoint's
# own (remote) modeling code, and device placement via device_map="auto".
_load_options = {
    "dtype": torch.bfloat16,
    "trust_remote_code": True,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained("AIDC-AI/Ovis2.6-30B-A3B", **_load_options)


# Video inference
#
# Samples a fixed number of frames from a local video file, sends them to the
# model together with a text prompt, and prints the decoded generation.

video_file = "./doc/mm.mp4"
num_frames = 8

# Choose num_frames frame indices spread evenly from the start of the clip,
# then turn each index back into a timestamp in seconds for get_frame().
with VideoFileClip(video_file) as clip:
    total_frames = int(clip.fps * clip.duration)
    indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
    frames = [Image.fromarray(clip.get_frame(idx / clip.fps)) for idx in indices]

messages = [{"role": "user", "content": [
    {"type": "video", "video": frames},
    {"type": "text", "text": "Describe this video in detail."},
]}]

# The thinking settings defined at the top of the script are passed through
# here and to generate() below; previously they were never used.
# NOTE(review): assumes this checkpoint's remote code accepts enable_thinking /
# enable_thinking_budget / thinking_budget as in the model-card example - confirm.
input_ids, pixel_values, grid_thws = model.preprocess_inputs(
    messages=messages,
    add_generation_prompt=True,
    max_pixels=896*896,
    enable_thinking=enable_thinking,
)
input_ids = input_ids.cuda()
# Visual tensors may be None; only move/cast them when they are present.
pixel_values = pixel_values.cuda().to(model.dtype) if pixel_values is not None else None
grid_thws = grid_thws.cuda() if grid_thws is not None else None

with torch.no_grad():
    outputs = model.generate(
        inputs=input_ids,
        pixel_values=pixel_values,
        grid_thws=grid_thws,
        enable_thinking=enable_thinking,
        enable_thinking_budget=enable_thinking_budget,
        # Use the configured total instead of a hard-coded 1024, which equalled
        # thinking_budget and so broke max_new_tokens > thinking_budget + 25.
        max_new_tokens=max_new_tokens,
        thinking_budget=thinking_budget,
        do_sample=True,
        eos_token_id=model.text_tokenizer.eos_token_id,
        pad_token_id=model.text_tokenizer.pad_token_id,
    )
print(model.text_tokenizer.decode(outputs[0], skip_special_tokens=True))