"""Example: image understanding with LLaDA2.0-Uni.

Loads the model and its tokenizer, encodes one image into discrete tokens,
and prints the model's answer to a question about that image.
"""

import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer

from encoder.image_tokenizer import ImageTokenizer
from decoder.smart_img_process import smart_resize_images

model_path = "inclusionAI/LLaDA2.0-Uni"

# Load the text tokenizer and the model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="bfloat16",
    trust_remote_code=True,
).eval()
# Attach the tokenizer to the model object.
# NOTE(review): presumably understand_image() reads model.tokenizer — confirm.
model.tokenizer = tokenizer

# Encode image to discrete tokens
image_tokenizer = ImageTokenizer(model_path=model_path, device="cuda")
# smart_resize_images is called with a one-element list; keep the first result.
pil_image = smart_resize_images(["./assets/understanding_example.png"])[0]
info = image_tokenizer.encode_with_info(pil_image)
# Shift every image token id by the offset stored in the model config.
# NOTE(review): presumably this maps the ids into the model's image-token
# range of the vocabulary — confirm against the model code.
image_tokens = [x + model.config.image_token_offset for x in info["token_ids"]]
# Only the last two entries of grid_thw are used; the first is discarded.
# NOTE(review): presumably (temporal, height, width) — confirm.
_, h, w = info["grid_thw"]

# Understand the image
response = model.understand_image(
    image_tokens,
    h,
    w,
    question="Describe this image in detail.",
    steps=32,
    gen_length=2048,
)
print(response)