# inference_single_hf.py
import os
from pathlib import Path

import requests  # only used by the commented-out URL example below
import torch
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration


current_dir = str(Path(__file__).resolve().parent)

pretrained = os.path.join(current_dir, "ckpts", "llava-v1.6-mistral-7b-hf")
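# NOTE (assumption): the checkpoint is expected to have been downloaded ahead
# of time into ./ckpts/, e.g. from the "llava-hf/llava-v1.6-mistral-7b-hf"
# repository on the Hugging Face Hub.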

processor = LlavaNextProcessor.from_pretrained(pretrained)

model = LlavaNextForConditionalGeneration.from_pretrained(
    pretrained, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model.to("cuda")
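# A minimal fallback sketch, assuming a GPU may not always be present:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)
# With the `accelerate` package installed, `device_map="auto"` can instead be
# passed to `from_pretrained` to place the weights automatically.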

# prepare image and text prompt, using the appropriate prompt template
# url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# image = Image.open(requests.get(url, stream=True).raw)
# Resolve the example image relative to this script, consistent with the
# checkpoint path above
image = Image.open(os.path.join(current_dir, "examples", "image.png"))

# Define a chat history and use `apply_chat_template` to get a correctly
# formatted prompt. Each value in "content" must be a list of dicts with a
# "type" key ("text" or "image").
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
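# For llava-v1.6-mistral-7b-hf the templated prompt typically looks like:
#   "[INST] <image>\nWhat is shown in this image? [/INST]"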

inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")

# Autoregressively complete the prompt
output = model.generate(**inputs, max_new_tokens=100)
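# `generate` defaults to greedy decoding here; sampling parameters such as
# `do_sample=True` and `temperature` can be passed for more varied outputs.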

print(processor.decode(output[0], skip_special_tokens=True))
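
# Example invocation (assuming ckpts/ and examples/image.png exist as above):
#   python inference_single_hf.py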