# run_Transformers_imgunderstand.py

import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
from encoder.image_tokenizer import ImageTokenizer
from decoder.smart_img_process import smart_resize_images

# Repository identifier reused by every loader in this script.
model_path = "inclusionAI/LLaDA2.0-Uni"

# Text tokenizer; trust_remote_code=True is passed so the loader may use
# code shipped with the model repository.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)

# Language model, requested in bfloat16 and mapped onto the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype="bfloat16",
    device_map="cuda",
)
# Switch to evaluation mode; eval() hands back the module, so rebinding
# keeps `model` pointing at the value the original chained call produced.
model = model.eval()

# Attach the tokenizer to the model object.
# NOTE(review): presumably read by model.understand_image — confirm against
# the model's remote code.
model.tokenizer = tokenizer

# Turn the example image into discrete token ids for the language model.
image_tokenizer = ImageTokenizer(device="cuda", model_path=model_path)

# smart_resize_images takes a list of paths; only the first result is used.
resized_images = smart_resize_images(["./assets/understanding_example.png"])
pil_image = resized_images[0]

# encode_with_info returns a mapping from which "token_ids" and "grid_thw"
# are read below.
info = image_tokenizer.encode_with_info(pil_image)

# Shift every raw image token id by the model's configured offset.
# NOTE(review): presumably this moves image ids into the model's shared
# vocabulary range — confirm against the model config.
image_tokens = [
    token_id + model.config.image_token_offset
    for token_id in info["token_ids"]
]

# grid_thw unpacks into three values; the first is discarded.
# NOTE(review): by its name this looks like (temporal, height, width) — confirm.
_, h, w = info["grid_thw"]

# Ask the model to describe the encoded image and print its answer.
# NOTE(review): `steps` and `gen_length` are forwarded unchanged; presumably
# the number of generation steps and the maximum output length — confirm
# against the model's understand_image implementation.
understand_options = {
    "question": "Describe this image in detail.",
    "steps": 32,
    "gen_length": 2048,
}
response = model.understand_image(image_tokens, h, w, **understand_options)
print(response)