from PIL import Image
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor

# Load the model and processor in bfloat16 and move the model to the GPU.
dtype = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained("microsoft/Magma-8B", trust_remote_code=True, torch_dtype=dtype)
processor = AutoProcessor.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
model.to("cuda")

# Inference
image = Image.open("./assets/images/magma_logo.jpg").convert("RGB")

# Build the conversation; the <image_start><image><image_end> placeholder marks where
# the image is inserted into the prompt.
convs = [
    {"role": "system", "content": "You are agent that can see, talk and act."},
    {"role": "user", "content": "<image_start><image><image_end>\nWhat is the letter on the robot?"},
]
prompt = processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
inputs = processor(images=[image], texts=prompt, return_tensors="pt")
# Add a batch dimension to the image tensors, then move everything to the GPU in bfloat16.
inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
inputs = inputs.to("cuda").to(dtype)

# Greedy decoding, up to 500 new tokens.
generation_args = {
    "max_new_tokens": 500,
    "temperature": 0.0,
    "do_sample": False,
    "use_cache": True,
    "num_beams": 1,
}

with torch.inference_mode():
    generate_ids = model.generate(**inputs, **generation_args)

# Strip the prompt tokens and decode only the newly generated text.
generate_ids = generate_ids[:, inputs["input_ids"].shape[-1]:]
response = processor.decode(generate_ids[0], skip_special_tokens=True).strip()

print("response: ", response)