from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info

# Model identifier handed to the vLLM engine constructed below.
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"

# Engine instance. `limit_mm_per_prompt` is given the value 10 for both the
# "image" and the "video" key.
_mm_limits = {"image": 10, "video": 10}
llm = LLM(model=MODEL_PATH, limit_mm_per_prompt=_mm_limits)

# Decoding settings: low temperature with a very small top_p, a mild
# repetition penalty, at most 256 generated tokens, and no extra stop ids.
sampling_params = SamplingParams(
    max_tokens=256,
    temperature=0.1,
    top_p=0.001,
    repetition_penalty=1.05,
    stop_token_ids=[],
)

# The user turn is made of two parts: an image reference and a text question.
_image_part = {
    "type": "image",
    "image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png",
    # NOTE(review): presumably per-image pixel-count bounds consumed by
    # process_vision_info -- confirm against qwen_vl_utils.
    "min_pixels": 224 * 224,
    "max_pixels": 1280 * 28 * 28,
}
_question_part = {"type": "text", "text": "What is the text in the illustrate?"}

# Chat-style conversation: a system prompt followed by the multimodal user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [_image_part, _question_part]},
]

# For video input, you can pass following values instead:
# "type": "video",
# "video": "