from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests
from pathlib import Path
import os

current_dir = str(Path(__file__).resolve().parent)
pretrained = os.path.join(current_dir, "ckpts", "llava-v1.6-mistral-7b-hf")

processor = LlavaNextProcessor.from_pretrained(pretrained)
model = LlavaNextForConditionalGeneration.from_pretrained(
    pretrained, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model.to("cuda")

# prepare image and text prompt, using the appropriate prompt template
# url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# image = Image.open(requests.get(url, stream=True).raw)
image = Image.open("./examples/image.png")

# Define a chat history and use `apply_chat_template` to get the correctly formatted prompt.
# Each value in "content" has to be a list of dicts with types ("text", "image").
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")

# autoregressively complete the prompt
output = model.generate(**inputs, max_new_tokens=100)

print(processor.decode(output[0], skip_special_tokens=True))
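
# Optional follow-up (a minimal sketch, not part of the original script): the decoded
# output above includes the prompt tokens, since generate() returns the full sequence
# for decoder-only models. To print only the model's reply, slice off the prompt
# portion before decoding. Assumes `inputs` and `output` from the script above.
# reply_tokens = output[0][inputs["input_ids"].shape[1]:]
# print(processor.decode(reply_tokens, skip_special_tokens=True))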