from transformers import AutoModel, AutoTokenizer
import torch
import os
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, default='deepseek-ai/DeepSeek-OCR-2')
parser.add_argument('--image_file', type=str, default='doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57.jpg')
parser.add_argument('--output_path', type=str, default='output/image')
args = parser.parse_args()

if __name__ == '__main__':
    # Load the tokenizer and model; trust_remote_code is required for the model's custom infer() API.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        args.model_name_or_path,
        _attn_implementation='flash_attention_2',
        trust_remote_code=True,
        use_safetensors=True,
    )
    model = model.eval().cuda().to(torch.bfloat16)

    # Plain OCR prompt:
    # prompt = "<image>\nFree OCR. "
    # Grounded prompt: convert the document image to markdown.
    prompt = "<image>\n<|grounding|>Convert the document to markdown. "

    res = model.infer(
        tokenizer,
        prompt=prompt,
        image_file=args.image_file,
        output_path=args.output_path,
        base_size=1024,
        image_size=768,
        crop_mode=True,
        save_results=True,
    )
    print("Processing finished; results saved to", args.output_path)
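
# Example invocation (the script and image file names below are placeholders; adjust to your setup):
#   python run_ocr_image.py \
#       --model_name_or_path deepseek-ai/DeepSeek-OCR-2 \
#       --image_file path/to/document_page.jpg \
#       --output_path output/image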