from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info

MODEL_PATH = 'Qwen/Qwen2-VL-7B-Instruct'


def main():
    # Multi-GPU inference via tensor parallelism
    llm = LLM(
        model=MODEL_PATH,
        limit_mm_per_prompt={"image": 10, "video": 10},
        tensor_parallel_size=4,  # set to the number of GPUs you want to use
        trust_remote_code=True,
        gpu_memory_utilization=0.95,
        dtype="float16",
        enforce_eager=True,
    )

    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=256,
        stop_token_ids=[],
    )

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png",
                    "min_pixels": 224 * 224,
                    "max_pixels": 1280 * 28 * 28,
                },
                {"type": "text", "text": "What is the text in the illustration?"},
            ],
        },
    ]

    # Build the chat prompt and extract the image/video inputs from the messages
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = process_vision_info(messages)

    mm_data = {}
    if image_inputs is not None:
        mm_data["image"] = image_inputs
    if video_inputs is not None:
        mm_data["video"] = video_inputs

    llm_inputs = {
        "prompt": prompt,
        "multi_modal_data": mm_data,
    }

    # Run generation across the tensor-parallel GPUs
    outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
    generated_text = outputs[0].outputs[0].text
    print(generated_text)


if __name__ == '__main__':
    main()