Add deepseek-ocr-2

5c14fb01 · chenych · 5c14fb01 · 5c14fb01 · 5c14fb01 · 5c14fb01
Commit 5c14fb01 authored Jan 27, 2026 by chenych
16 changed files
--- a/DeepSeek-OCR2-vllm/process/__pycache__/ngram_norepeat.cpython-312.pyc
+++ b/DeepSeek-OCR2-vllm/process/__pycache__/ngram_norepeat.cpython-312.pyc
--- a/DeepSeek-OCR2-vllm/process/image_process.py
+++ b/DeepSeek-OCR2-vllm/process/image_process.py
--- a/DeepSeek-OCR2-vllm/process/ngram_norepeat.py
+++ b/DeepSeek-OCR2-vllm/process/ngram_norepeat.py
+import torch
+from transformers import LogitsProcessor
+from transformers.generation.logits_process import _calc_banned_ngram_tokens
+from typing import List, Set
+
+
+class NoRepeatNGramLogitsProcessor(LogitsProcessor):
+    """Ban tokens that would repeat an n-gram seen in the recent history.
+
+    At each step the last ``ngram_size - 1`` generated tokens form a prefix.
+    Every n-gram that lies inside the last ``window_size`` tokens and starts
+    with that prefix contributes its final token to the banned set, whose
+    scores are set to ``-inf``. Token ids in ``whitelist_token_ids`` are
+    never banned.
+    """
+
+    def __init__(self, ngram_size: int, window_size: int = 100, whitelist_token_ids: set = None):
+        """Validate and store the settings.
+
+        Raises:
+            ValueError: if ``ngram_size`` or ``window_size`` is not a strictly
+                positive integer.
+        """
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
+        if not isinstance(window_size, int) or window_size <= 0:
+            raise ValueError(f"`window_size` has to be a strictly positive integer, but is {window_size}")
+        self.ngram_size = ngram_size
+        self.window_size = window_size
+        # Coerce to a set so a list/tuple of ids also works with the set
+        # difference performed in __call__.
+        self.whitelist_token_ids = set(whitelist_token_ids) if whitelist_token_ids else set()
+
+    def __call__(self, input_ids: List[int], scores: torch.FloatTensor) -> torch.FloatTensor:
+        """Return ``scores`` with the repeat-completing tokens masked out.
+
+        ``scores`` is returned untouched (same object) when nothing is banned;
+        otherwise a modified clone is returned.
+        """
+        seq_len = len(input_ids)
+        if seq_len < self.ngram_size:
+            return scores
+
+        # Slice from an explicit start index: with ngram_size == 1 the prefix
+        # is empty, and `input_ids[-0:]` would wrongly yield the whole sequence.
+        prefix_len = self.ngram_size - 1
+        current_prefix = tuple(input_ids[seq_len - prefix_len:])
+
+        search_start = max(0, seq_len - self.window_size)
+        search_end = seq_len - self.ngram_size + 1
+
+        banned_tokens = set()
+        for i in range(search_start, search_end):
+            ngram = tuple(input_ids[i:i + self.ngram_size])
+            if ngram[:-1] == current_prefix:
+                banned_tokens.add(ngram[-1])
+
+        banned_tokens -= self.whitelist_token_ids
+
+        if banned_tokens:
+            # Clone so the caller's tensor is not mutated in place.
+            scores = scores.clone()
+            for token in banned_tokens:
+                scores[token] = -float("inf")
+
+        return scores
\ No newline at end of file
--- a/DeepSeek-OCR2-vllm/run_dpsk_ocr2_eval_batch.py
+++ b/DeepSeek-OCR2-vllm/run_dpsk_ocr2_eval_batch.py
+import os
+import re
+from tqdm import tqdm
+import torch
+if torch.version.cuda == '11.8':
+    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
+os.environ['VLLM_USE_V1'] = '0'
+os.environ["CUDA_VISIBLE_DEVICES"] = '0'
+
+from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, MAX_CONCURRENCY, CROP_MODE, NUM_WORKERS
+from concurrent.futures import ThreadPoolExecutor
+import glob
+from PIL import Image, ExifTags
+from deepseek_ocr2 import DeepseekOCR2ForCausalLM
+
+from vllm.model_executor.models.registry import ModelRegistry
+
+from vllm import LLM, SamplingParams
+from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
+from process.image_process import DeepseekOCR2Processor
+ModelRegistry.register_model("DeepseekOCR2ForCausalLM", DeepseekOCR2ForCausalLM)
+
+
+def correct_image_orientation(image):
+    """Rotate *image* according to its EXIF orientation tag and return it.
+
+    Orientation values 3, 6 and 8 are handled (pure rotations); any other
+    value, or a missing tag, returns the image unchanged. Failures while
+    reading EXIF are reported and the image is returned as-is.
+    """
+    # Numeric id of the EXIF "Orientation" tag; constant, so no need to scan
+    # the whole ExifTags table on every call.
+    orientation_tag = 0x0112
+    # Counter-clockwise angle (as used by Image.rotate) per orientation value.
+    rotation_by_orientation = {3: 180, 6: 270, 8: 90}
+
+    try:
+        # Public API available on every image, unlike the private
+        # `_getexif()` which only exists for some formats.
+        exif = image.getexif()
+        orientation = exif.get(orientation_tag, 1) if exif else 1
+        angle = rotation_by_orientation.get(orientation)
+        if angle is not None:
+            image = image.rotate(angle, expand=True)
+    except Exception as e:
+        print(f"EXIF error: {e}")
+
+    return image
+
+
+# The engine is constructed at module level, i.e. as soon as this file is
+# executed or imported.
+llm = LLM(
+    model=MODEL_PATH,
+    hf_overrides={"architectures": ["DeepseekOCR2ForCausalLM"]},
+    block_size=256,
+    enforce_eager=False,
+    trust_remote_code=True, 
+    max_model_len=8192,
+    swap_space=0,
+    max_num_seqs = MAX_CONCURRENCY,
+    tensor_parallel_size=1,
+    gpu_memory_utilization=0.7,
+)
+
+# Repetition guard: a small window keeps the per-step scan fast.
+# Whitelisted token ids: <td>, </td>.
+logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=40, window_size=90, whitelist_token_ids= {128821, 128822})]
+
+# temperature=0.0 together with the logits processor above; special tokens
+# are kept in the decoded text (skip_special_tokens=False).
+sampling_params = SamplingParams(
+    temperature=0.0,
+    max_tokens=8192,
+    logits_processors=logits_processors,
+    skip_special_tokens=False,
+)
+
+
+
+class Colors:
+    # ANSI escape sequences used to colour console messages.
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    # Resets the terminal colour back to the default.
+    RESET = '\033[0m' 
+
+def clean_formula(text):
+    r"""Remove ``\quad (...)`` tags from every ``\[ ... \]`` formula in *text*.
+
+    The formula body is also stripped of surrounding whitespace; text outside
+    the ``\[ ... \]`` delimiters is left untouched.
+    """
+
+    def _strip_tag(found):
+        # Drop the trailing "\quad (n)" style tag, then trim the body.
+        body = re.sub(r'\\quad\s*\([^)]*\)', '', found.group(1)).strip()
+        return r'\[' + body + r'\]'
+
+    return re.sub(r'\\\[(.*?)\\\]', _strip_tag, text)
+
+def re_match(text):
+    """Find all ``<|ref|>...<|/ref|><|det|>...<|/det|>`` spans in *text*.
+
+    Returns a pair: the raw ``re.findall`` tuples ``(whole, ref, det)`` and a
+    list holding only the whole matched span of each tuple.
+    """
+    grounding_re = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
+    found = re.findall(grounding_re, text, re.DOTALL)
+    whole_spans = [groups[0] for groups in found]
+    return found, whole_spans
+
+def process_single_image(image, prompt=None):
+    """Build the request dict for one image.
+
+    Args:
+        image: the image handed to ``DeepseekOCR2Processor.tokenize_with_images``.
+        prompt: prompt text; defaults to the configured ``PROMPT``. Taking it
+            as a parameter removes the dependency on a global ``prompt`` that
+            only exists when the file runs as a script.
+
+    Returns:
+        A dict with the keys ``"prompt"`` and ``"multi_modal_data"``.
+    """
+    prompt_in = PROMPT if prompt is None else prompt
+    cache_item = {
+        "prompt": prompt_in,
+        "multi_modal_data": {"image": DeepseekOCR2Processor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},
+    }
+    return cache_item
+
+
+if __name__ == "__main__":
+    # Batch pipeline: load every file under INPUT_PATH, run generation on all
+    # of them, and write one cleaned `.md` file per image into OUTPUT_PATH.
+
+    # INPUT_PATH = OmniDocBench images path
+
+    os.makedirs(OUTPUT_PATH, exist_ok=True)
+
+    print(f'{Colors.RED}glob images.....{Colors.RESET}')
+
+    images_path = glob.glob(f'{INPUT_PATH}/*')
+
+    images = []
+
+    for image_path in images_path:
+        # `convert` returns a fully loaded copy, so the file handle can be
+        # closed right away instead of staying open for the whole run.
+        with Image.open(image_path) as raw_image:
+            image = correct_image_orientation(raw_image)
+            images.append(image.convert('RGB'))
+
+    # Kept as a module-level name because process_single_image reads it.
+    prompt = PROMPT
+
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        batch_inputs = list(tqdm(
+            executor.map(process_single_image, images),
+            total=len(images),
+            desc="Pre-processed images"
+        ))
+
+    outputs_list = llm.generate(
+        batch_inputs,
+        sampling_params=sampling_params
+    )
+
+    output_path = OUTPUT_PATH
+
+    os.makedirs(output_path, exist_ok=True)
+
+    for output, image in zip(outputs_list, images_path):
+
+        content = output.outputs[0].text
+
+        content = clean_formula(content)
+        matches_ref, mathes_other = re_match(content)
+        # Remove every grounding span and collapse the blank lines left behind.
+        for a_match_other in tqdm(mathes_other, desc="other"):
+            content = content.replace(a_match_other, '').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n')
+
+        # Derive the output name from the file stem so any image extension
+        # maps to `<stem>.md`, and join with a real path separator so
+        # OUTPUT_PATH does not need a trailing slash.
+        stem = os.path.splitext(os.path.basename(image))[0]
+        mmd_path = os.path.join(output_path, stem + '.md')
+
+        with open(mmd_path, 'w', encoding='utf-8') as afile:
+            afile.write(content)
\ No newline at end of file
--- a/DeepSeek-OCR2-vllm/run_dpsk_ocr2_image.py
+++ b/DeepSeek-OCR2-vllm/run_dpsk_ocr2_image.py
+import asyncio
+import re
+import os
+
+
+from vllm import AsyncLLMEngine, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.model_executor.models.registry import ModelRegistry
+import time
+from deepseek_ocr2 import DeepseekOCR2ForCausalLM
+from PIL import Image, ImageDraw, ImageFont, ImageOps
+import numpy as np
+from tqdm import tqdm
+from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
+from process.image_process import DeepseekOCR2Processor
+from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE
+
+
+
+ModelRegistry.register_model("DeepseekOCR2ForCausalLM", DeepseekOCR2ForCausalLM)
+
+def load_image(image_path):
+    """Open *image_path* and apply its EXIF orientation.
+
+    Returns the transposed image, the untransposed image if only the EXIF
+    step fails, or ``None`` if the file cannot be opened at all.
+    """
+    try:
+        image = Image.open(image_path)
+    except Exception as e:
+        print(f"error: {e}")
+        return None
+
+    try:
+        return ImageOps.exif_transpose(image)
+    except Exception as e:
+        # Orientation handling is best-effort: fall back to the image as
+        # opened rather than re-opening the file.
+        print(f"error: {e}")
+        return image
+
+
+def re_match(text):
+    """Find all ``<|ref|>...<|/ref|><|det|>...<|/det|>`` spans in *text*.
+
+    Returns a triple: the raw ``re.findall`` tuples ``(whole, ref, det)``, the
+    whole spans whose ref is exactly ``image``, and the remaining whole spans.
+    """
+    grounding_re = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
+    found = re.findall(grounding_re, text, re.DOTALL)
+
+    image_marker = '<|ref|>image<|/ref|>'
+    image_spans = [groups[0] for groups in found if image_marker in groups[0]]
+    other_spans = [groups[0] for groups in found if image_marker not in groups[0]]
+    return found, image_spans, other_spans
+
+
+def extract_coordinates_and_label(ref_text, image_width, image_height):
+    """Split one ``re_match`` tuple into its label and parsed box list.
+
+    Args:
+        ref_text: a ``(whole, ref, det)`` tuple; index 1 is the label and
+            index 2 is the textual coordinate list.
+        image_width, image_height: accepted for interface compatibility; not
+            used here.
+
+    Returns:
+        ``(label_type, cor_list)``, or ``None`` when the tuple is malformed or
+        the coordinate text is not a valid Python literal.
+    """
+    # Function-scope import keeps this block self-contained.
+    import ast
+
+    try:
+        label_type = ref_text[1]
+        # The coordinate text comes from model output, so it is parsed as a
+        # literal only; `eval` here would execute arbitrary expressions.
+        cor_list = ast.literal_eval(ref_text[2])
+    except Exception as e:
+        print(e)
+        return None
+
+    return (label_type, cor_list)
+
+
+def draw_bounding_boxes(image, refs):
+    """Draw every box described by *refs* on a copy of *image*.
+
+    Each ref is parsed with ``extract_coordinates_and_label``; its boxes are
+    rescaled from the 0-999 coordinate range to pixels, outlined, filled with
+    a translucent colour and labelled. Boxes labelled ``'image'`` are also
+    cropped from *image* and saved as ``{OUTPUT_PATH}/images/{n}.jpg``, with
+    ``n`` counting such boxes in encounter order.
+
+    Drawing is best-effort: a failing box or ref is reported and skipped.
+
+    Returns:
+        The annotated copy; *image* itself is not modified.
+    """
+    image_width, image_height = image.size
+    img_draw = image.copy()
+    draw = ImageDraw.Draw(img_draw)
+
+    # Translucent fills are drawn on a separate RGBA layer and pasted at the end.
+    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
+    draw2 = ImageDraw.Draw(overlay)
+
+    font = ImageFont.load_default()
+
+    img_idx = 0
+
+    for ref in refs:
+        try:
+            result = extract_coordinates_and_label(ref, image_width, image_height)
+            if not result:
+                continue
+            label_type, points_list = result
+
+            color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
+            color_a = color + (20, )
+            # Titles get a thicker outline than every other label.
+            outline_width = 4 if label_type == 'title' else 2
+
+            for points in points_list:
+                x1, y1, x2, y2 = points
+
+                x1 = int(x1 / 999 * image_width)
+                y1 = int(y1 / 999 * image_height)
+
+                x2 = int(x2 / 999 * image_width)
+                y2 = int(y2 / 999 * image_height)
+
+                if label_type == 'image':
+                    try:
+                        cropped = image.crop((x1, y1, x2, y2))
+                        cropped.save(f"{OUTPUT_PATH}/images/{img_idx}.jpg")
+                    except Exception as e:
+                        print(e)
+                    # Advance even on failure so numbering matches box order.
+                    img_idx += 1
+
+                try:
+                    draw.rectangle([x1, y1, x2, y2], outline=color, width=outline_width)
+                    draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
+
+                    text_x = x1
+                    text_y = max(0, y1 - 15)
+
+                    text_bbox = draw.textbbox((0, 0), label_type, font=font)
+                    text_width = text_bbox[2] - text_bbox[0]
+                    text_height = text_bbox[3] - text_bbox[1]
+                    draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
+                                fill=(255, 255, 255, 30))
+
+                    draw.text((text_x, text_y), label_type, font=font, fill=color)
+                except Exception as e:
+                    # Skip just this box; `Exception` (not a bare except) so
+                    # Ctrl-C still interrupts the run.
+                    print(e)
+        except Exception as e:
+            print(e)
+            continue
+    img_draw.paste(overlay, (0, 0), overlay)
+    return img_draw
+
+
+def process_image_with_refs(image, ref_texts):
+    """Return a copy of *image* annotated via ``draw_bounding_boxes``."""
+    return draw_bounding_boxes(image, ref_texts)
+
+
+
+
+async def stream_generate(image=None, prompt=''):
+    """Generate text for *prompt*, printing it incrementally, and return it.
+
+    Args:
+        image: multimodal payload; only sent when it is truthy and the prompt
+            contains ``<image>``.
+        prompt: the prompt text; must be non-empty.
+
+    Returns:
+        The full generated text, or ``''`` if the engine produced no output.
+
+    Raises:
+        AssertionError: if *prompt* is empty.
+    """
+    # Validate and build the request first, so a bad call fails before the
+    # engine is constructed.
+    if image and '<image>' in prompt:
+        request = {
+            "prompt": prompt,
+            "multi_modal_data": {"image": image}
+        }
+    elif prompt:
+        request = {
+            "prompt": prompt
+        }
+    else:
+        # Raised explicitly: a bare `assert False` disappears under `python -O`.
+        raise AssertionError('prompt is none!!!')
+
+    engine_args = AsyncEngineArgs(
+        model=MODEL_PATH,
+        hf_overrides={"architectures": ["DeepseekOCR2ForCausalLM"]},
+        dtype="bfloat16",
+        max_model_len=8192,
+        enforce_eager=False,
+        trust_remote_code=True,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.75,
+    )
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    # Whitelisted token ids: <td>, </td>.
+    logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=90, whitelist_token_ids= {128821, 128822})]
+
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        max_tokens=8192,
+        logits_processors=logits_processors,
+        skip_special_tokens=False,
+    )
+
+    request_id = f"request-{int(time.time())}"
+
+    printed_length = 0
+    # Pre-bound so the return below cannot hit an unbound local when the
+    # stream yields nothing.
+    final_output = ''
+
+    async for request_output in engine.generate(
+        request, sampling_params, request_id
+    ):
+        if request_output.outputs:
+            full_text = request_output.outputs[0].text
+            # Each update carries the whole text so far; print only the new tail.
+            new_text = full_text[printed_length:]
+            print(new_text, end='', flush=True)
+            printed_length = len(full_text)
+            final_output = full_text
+    print('\n')
+
+    return final_output
+
+
+
+
+if __name__ == "__main__":
+    # Single-image pipeline: run generation on INPUT_PATH, then save the raw
+    # output, the cleaned output, cropped figures, an optional geometry plot
+    # and the image annotated with boxes.
+
+    os.makedirs(OUTPUT_PATH, exist_ok=True)
+    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)
+
+    image = load_image(INPUT_PATH)
+    if image is None:
+        # load_image returns None when the file cannot be opened; stop with a
+        # clear message instead of an AttributeError on `.convert`.
+        raise SystemExit(f'cannot open image: {INPUT_PATH}')
+    image = image.convert('RGB')
+
+    if '<image>' in PROMPT:
+        image_features = DeepseekOCR2Processor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)
+    else:
+        image_features = ''
+
+    prompt = PROMPT
+
+    result_out = asyncio.run(stream_generate(image_features, prompt))
+
+    save_results = 1
+
+    if save_results and '<image>' in prompt:
+        print('='*15 + 'save results:' + '='*15)
+
+        image_draw = image.copy()
+
+        outputs = result_out
+
+        with open(f'{OUTPUT_PATH}/result_ori.mmd', 'w', encoding = 'utf-8') as afile:
+            afile.write(outputs)
+
+        matches_ref, matches_images, mathes_other = re_match(outputs)
+        result = process_image_with_refs(image_draw, matches_ref)
+
+        # Replace each figure span with a markdown link to its cropped file.
+        for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
+            outputs = outputs.replace(a_match_image, '![](images/' + str(idx) + '.jpg)\n')
+
+        # Drop the remaining grounding spans and normalise two LaTeX macros.
+        for a_match_other in tqdm(mathes_other, desc="other"):
+            outputs = outputs.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
+
+        with open(f'{OUTPUT_PATH}/result.mmd', 'w', encoding = 'utf-8') as afile:
+            afile.write(outputs)
+
+        if 'line_type' in outputs:
+            import matplotlib.pyplot as plt
+            from matplotlib.patches import Circle
+
+            # NOTE(review): `eval` runs on model output here and on the point
+            # strings below; consider `ast.literal_eval` once the exact output
+            # format is confirmed. Parsed once instead of on every access.
+            geometry = eval(outputs)
+            lines = geometry['Line']['line']
+            line_type = geometry['Line']['line_type']
+            endpoints = geometry['Line']['line_endpoint']
+
+            fig, ax = plt.subplots(figsize=(3,3), dpi=200)
+            ax.set_xlim(-15, 15)
+            ax.set_ylim(-15, 15)
+
+            for idx, line in enumerate(lines):
+                try:
+                    p0 = eval(line.split(' -- ')[0])
+                    p1 = eval(line.split(' -- ')[-1])
+
+                    # Both line types are currently drawn with the same style.
+                    if line_type[idx] == '--':
+                        ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color='k')
+                    else:
+                        ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth = 0.8, color = 'k')
+
+                    ax.scatter(p0[0], p0[1], s=5, color = 'k')
+                    ax.scatter(p1[0], p1[1], s=5, color = 'k')
+                except Exception:
+                    # Best-effort plot: skip a malformed line.
+                    pass
+
+            for endpoint in endpoints:
+
+                label = endpoint.split(': ')[0]
+                (x, y) = eval(endpoint.split(': ')[1])
+                ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
+                            fontsize=5, fontweight='light')
+
+            try:
+                if 'Circle' in geometry.keys():
+                    circle_centers = geometry['Circle']['circle_center']
+                    radius = geometry['Circle']['radius']
+
+                    for center, r in zip(circle_centers, radius):
+                        center = eval(center.split(': ')[1])
+                        circle = Circle(center, radius=r, fill=False, edgecolor='black', linewidth=0.8)
+                        ax.add_patch(circle)
+            except Exception:
+                # Circles are optional; ignore malformed circle data.
+                pass
+
+
+            plt.savefig(f'{OUTPUT_PATH}/geo.jpg')
+            plt.close()
+
+        result.save(f'{OUTPUT_PATH}/result_with_boxes.jpg')
\ No newline at end of file
--- a/DeepSeek-OCR2-vllm/run_dpsk_ocr2_pdf.py
+++ b/DeepSeek-OCR2-vllm/run_dpsk_ocr2_pdf.py
+import os
+import fitz
+import img2pdf
+import io
+import re
+from tqdm import tqdm
+
+from concurrent.futures import ThreadPoolExecutor
+
+
+from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
+
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+from deepseek_ocr2 import DeepseekOCR2ForCausalLM
+
+from vllm.model_executor.models.registry import ModelRegistry
+
+from vllm import LLM, SamplingParams
+from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
+from process.image_process import DeepseekOCR2Processor
+
+ModelRegistry.register_model("DeepseekOCR2ForCausalLM", DeepseekOCR2ForCausalLM)
+
+
+# The engine is constructed at module level, i.e. as soon as this file is
+# executed or imported.
+llm = LLM(
+    model=MODEL_PATH,
+    hf_overrides={"architectures": ["DeepseekOCR2ForCausalLM"]},
+    block_size=256,
+    enforce_eager=False,
+    trust_remote_code=True,
+    max_model_len=8192,
+    swap_space=0,
+    max_num_seqs=MAX_CONCURRENCY,
+    tensor_parallel_size=1,
+    gpu_memory_utilization=0.9,
+    disable_mm_preprocessor_cache=True
+)
+
+# Repetition guard: a small window keeps the per-step scan fast.
+# Whitelisted token ids: <td>, </td>.
+logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=50, whitelist_token_ids= {128821, 128822})]
+
+# Special tokens are kept in the decoded text (skip_special_tokens=False);
+# the __main__ section checks the text for the end-of-sentence marker.
+sampling_params = SamplingParams(
+    temperature=0.0,
+    max_tokens=8192,
+    logits_processors=logits_processors,
+    skip_special_tokens=False,
+    include_stop_str_in_output=True,
+)
+
+
+class Colors:
+    # ANSI escape sequences used to colour console messages.
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    # Resets the terminal colour back to the default.
+    RESET = '\033[0m'
+
+def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
+    """Render every page of a PDF to a PIL image.
+
+    Args:
+        pdf_path: path of the PDF to open.
+        dpi: render resolution; the zoom factor is ``dpi / 72``.
+        image_format: when not ``"PNG"`` (case-insensitive), images with an
+            alpha channel are flattened onto a white RGB background.
+
+    Returns:
+        A list with one PIL image per page, in page order.
+    """
+    images = []
+
+    # Lift Pillow's pixel-count limit, which large rendered pages can exceed.
+    # Set once here rather than on every page.
+    Image.MAX_IMAGE_PIXELS = None
+
+    zoom = dpi / 72.0
+    matrix = fitz.Matrix(zoom, zoom)
+    flatten_alpha = image_format.upper() != "PNG"
+
+    pdf_document = fitz.open(pdf_path)
+    try:
+        for page_num in range(pdf_document.page_count):
+            page = pdf_document[page_num]
+
+            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
+
+            # Pages always go through PNG bytes, whatever `image_format` is.
+            img_data = pixmap.tobytes("png")
+            img = Image.open(io.BytesIO(img_data))
+
+            if flatten_alpha and img.mode in ('RGBA', 'LA'):
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
+                img = background
+
+            images.append(img)
+    finally:
+        # Close the document even if rendering a page fails.
+        pdf_document.close()
+    return images
+
+def pil_to_pdf_img2pdf(pil_images, output_path):
+    """Write *pil_images* to *output_path* as one PDF, one JPEG page per image.
+
+    Does nothing for an empty input. Conversion or write errors are printed
+    rather than raised.
+    """
+    if not pil_images:
+        return
+
+    def _as_jpeg_bytes(picture):
+        # JPEG needs RGB; convert only when the mode differs.
+        rgb = picture if picture.mode == 'RGB' else picture.convert('RGB')
+        buffer = io.BytesIO()
+        rgb.save(buffer, format='JPEG', quality=95)
+        return buffer.getvalue()
+
+    encoded_pages = [_as_jpeg_bytes(picture) for picture in pil_images]
+
+    try:
+        pdf_bytes = img2pdf.convert(encoded_pages)
+        with open(output_path, "wb") as f:
+            f.write(pdf_bytes)
+
+    except Exception as e:
+        print(f"error: {e}")
+
+
+
+def re_match(text):
+    """Find all ``<|ref|>...<|/ref|><|det|>...<|/det|>`` spans in *text*.
+
+    Returns a triple: the raw ``re.findall`` tuples ``(whole, ref, det)``, the
+    whole spans whose ref is exactly ``image``, and the remaining whole spans.
+    """
+    grounding_re = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
+    found = re.findall(grounding_re, text, re.DOTALL)
+
+    image_marker = '<|ref|>image<|/ref|>'
+    image_spans = [groups[0] for groups in found if image_marker in groups[0]]
+    other_spans = [groups[0] for groups in found if image_marker not in groups[0]]
+    return found, image_spans, other_spans
+
+
+def extract_coordinates_and_label(ref_text, image_width, image_height):
+    """Split one ``re_match`` tuple into its label and parsed box list.
+
+    Args:
+        ref_text: a ``(whole, ref, det)`` tuple; index 1 is the label and
+            index 2 is the textual coordinate list.
+        image_width, image_height: accepted for interface compatibility; not
+            used here.
+
+    Returns:
+        ``(label_type, cor_list)``, or ``None`` when the tuple is malformed or
+        the coordinate text is not a valid Python literal.
+    """
+    # Function-scope import keeps this block self-contained.
+    import ast
+
+    try:
+        label_type = ref_text[1]
+        # The coordinate text comes from model output, so it is parsed as a
+        # literal only; `eval` here would execute arbitrary expressions.
+        cor_list = ast.literal_eval(ref_text[2])
+    except Exception as e:
+        print(e)
+        return None
+
+    return (label_type, cor_list)
+
+
+def draw_bounding_boxes(image, refs, jdx):
+    """Draw every box described by *refs* on a copy of the page *image*.
+
+    Each ref is parsed with ``extract_coordinates_and_label``; its boxes are
+    rescaled from the 0-999 coordinate range to pixels, outlined, filled with
+    a translucent colour and labelled. Boxes labelled ``'image'`` are also
+    cropped from *image* and saved as ``{OUTPUT_PATH}/images/{jdx}_{n}.jpg``,
+    with ``n`` counting such boxes in encounter order.
+
+    Drawing is best-effort: a failing box or ref is reported and skipped.
+
+    Args:
+        image: the page image.
+        refs: ``(whole, ref, det)`` tuples as returned by ``re_match``.
+        jdx: page index used as prefix of the saved crop file names.
+
+    Returns:
+        The annotated copy; *image* itself is not modified.
+    """
+    image_width, image_height = image.size
+    img_draw = image.copy()
+    draw = ImageDraw.Draw(img_draw)
+
+    # Translucent fills are drawn on a separate RGBA layer and pasted at the end.
+    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
+    draw2 = ImageDraw.Draw(overlay)
+
+    font = ImageFont.load_default()
+
+    img_idx = 0
+
+    for ref in refs:
+        try:
+            result = extract_coordinates_and_label(ref, image_width, image_height)
+            if not result:
+                continue
+            label_type, points_list = result
+
+            color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
+            color_a = color + (20, )
+            # Titles get a thicker outline than every other label.
+            outline_width = 4 if label_type == 'title' else 2
+
+            for points in points_list:
+                x1, y1, x2, y2 = points
+
+                x1 = int(x1 / 999 * image_width)
+                y1 = int(y1 / 999 * image_height)
+
+                x2 = int(x2 / 999 * image_width)
+                y2 = int(y2 / 999 * image_height)
+
+                if label_type == 'image':
+                    try:
+                        cropped = image.crop((x1, y1, x2, y2))
+                        cropped.save(f"{OUTPUT_PATH}/images/{jdx}_{img_idx}.jpg")
+                    except Exception as e:
+                        print(e)
+                    # Advance even on failure so numbering matches box order.
+                    img_idx += 1
+
+                try:
+                    draw.rectangle([x1, y1, x2, y2], outline=color, width=outline_width)
+                    draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
+
+                    text_x = x1
+                    text_y = max(0, y1 - 15)
+
+                    text_bbox = draw.textbbox((0, 0), label_type, font=font)
+                    text_width = text_bbox[2] - text_bbox[0]
+                    text_height = text_bbox[3] - text_bbox[1]
+                    draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
+                                fill=(255, 255, 255, 30))
+
+                    draw.text((text_x, text_y), label_type, font=font, fill=color)
+                except Exception as e:
+                    # Skip just this box; `Exception` (not a bare except) so
+                    # Ctrl-C still interrupts the run.
+                    print(e)
+        except Exception as e:
+            print(e)
+            continue
+    img_draw.paste(overlay, (0, 0), overlay)
+    return img_draw
+
+
+def process_image_with_refs(image, ref_texts, jdx):
+    """Return a copy of *image* annotated via ``draw_bounding_boxes``."""
+    return draw_bounding_boxes(image, ref_texts, jdx)
+
+
+def process_single_image(image, prompt=None):
+    """Build the request dict for one page image.
+
+    Args:
+        image: the image handed to ``DeepseekOCR2Processor.tokenize_with_images``.
+        prompt: prompt text; defaults to the configured ``PROMPT``. Taking it
+            as a parameter removes the dependency on a global ``prompt`` that
+            only exists when the file runs as a script.
+
+    Returns:
+        A dict with the keys ``"prompt"`` and ``"multi_modal_data"``.
+    """
+    prompt_in = PROMPT if prompt is None else prompt
+    cache_item = {
+        "prompt": prompt_in,
+        "multi_modal_data": {"image": DeepseekOCR2Processor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},
+    }
+    return cache_item
+
+
+if __name__ == "__main__":
+    # PDF pipeline: render the pages of INPUT_PATH, run generation on all of
+    # them, then write the raw output (`*_det.mmd`), the cleaned output
+    # (`*.mmd`) and a PDF of the pages annotated with boxes (`*_layouts.pdf`).
+
+    os.makedirs(OUTPUT_PATH, exist_ok=True)
+    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)
+
+    print(f'{Colors.RED}PDF loading .....{Colors.RESET}')
+
+    images = pdf_to_images_high_quality(INPUT_PATH)
+
+    # Kept as a module-level name because process_single_image reads it.
+    prompt = PROMPT
+
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        batch_inputs = list(tqdm(
+            executor.map(process_single_image, images),
+            total=len(images),
+            desc="Pre-processed images"
+        ))
+
+    outputs_list = llm.generate(
+        batch_inputs,
+        sampling_params=sampling_params
+    )
+
+    output_path = OUTPUT_PATH
+
+    os.makedirs(output_path, exist_ok=True)
+
+    # Build all three names from the file stem. The previous
+    # `.replace('pdf', 'mmd')` rewrote every "pdf" substring of the name
+    # (e.g. "pdf_notes.pdf" -> "mmd_notes.mmd").
+    stem = os.path.splitext(os.path.basename(INPUT_PATH))[0]
+    mmd_det_path = os.path.join(output_path, stem + '_det.mmd')
+    mmd_path = os.path.join(output_path, stem + '.mmd')
+    pdf_out_path = os.path.join(output_path, stem + '_layouts.pdf')
+
+    eos_marker = '<｜end▁of▁sentence｜>'
+    page_num = '\n<--- Page Split --->'
+
+    # Collected per page and joined once at the end.
+    det_pages = []
+    clean_pages = []
+    draw_images = []
+    jdx = 0
+    for output, img in zip(outputs_list, images):
+        content = output.outputs[0].text
+
+        if eos_marker in content:
+            content = content.replace(eos_marker, '')
+        elif SKIP_REPEAT:
+            # No end-of-sentence marker: treated as a runaway repetition and
+            # skipped when SKIP_REPEAT is set.
+            continue
+
+        det_pages.append(content + f'\n{page_num}\n')
+
+        image_draw = img.copy()
+
+        matches_ref, matches_images, mathes_other = re_match(content)
+        result_image = process_image_with_refs(image_draw, matches_ref, jdx)
+
+        draw_images.append(result_image)
+
+        # Replace each figure span with a markdown link to its cropped file.
+        for idx, a_match_image in enumerate(matches_images):
+            content = content.replace(a_match_image, '![](images/' + str(jdx) + '_' + str(idx) + '.jpg)\n')
+
+        # Drop the remaining grounding spans, normalise two LaTeX macros and
+        # collapse the blank lines left behind.
+        for a_match_other in mathes_other:
+            content = content.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n')
+
+        clean_pages.append(content + f'\n{page_num}\n')
+
+        # Counts only pages that were kept, matching the crop file names.
+        jdx += 1
+
+    with open(mmd_det_path, 'w', encoding='utf-8') as afile:
+        afile.write(''.join(det_pages))
+
+    with open(mmd_path, 'w', encoding='utf-8') as afile:
+        afile.write(''.join(clean_pages))
+
+
+    pil_to_pdf_img2pdf(draw_images, pdf_out_path)
+
--- a/DeepSeek_OCR2_paper.pdf
+++ b/DeepSeek_OCR2_paper.pdf
--- a/LICENSE.txt
+++ b/LICENSE.txt
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright (c) 2023 DeepSeek
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# DeepSeek-OCR-2
+## 论文
+[DeepSeek-OCR 2: Visual Causal Flow](DeepSeek_OCR2_paper.pdf)
+
+## 模型简介
+DeepSeek 将原本基于 CLIP 的编码器替换为轻量级语言模型（Qwen2-500M），将原有的 DeepEncoder 升级为 DeepEncoder V2。在完整保留前代能力的基础上，DeepEncoder V2 通过一种全新的架构设计，引入了因果推理能力（causal reasoning），实现了视觉编码从「固定扫描」向「语义推理」的范式转变。
+
+<div align=center>
+    <img src="./doc/fig1.png"/>
+</div>
+
+## 环境依赖
+
+| 软件 | 版本 |
+| :------: | :------: |
+| DTK | 25.04.1 |
+| python | 3.10.12 |
+| torch | 2.4.1+das.opt1.dtk25041 |
+| transformers | 4.46.3 |
+| vllm | 0.8.5.post1+das.opt4.dtk25041 |
+| flash_attn | 2.6.1+das.opt13.dtk2504 |
+
+
+推荐使用镜像: image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
+
+- 挂载地址`-v`根据实际模型情况修改
+
+```bash
+docker run -it \
+    --shm-size 60g \
+    --network=host \
+    --name deepseek-ocr-2 \
+    --privileged \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --device=/dev/mkfd \
+    --group-add video \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    -u root \
+    -v /opt/hyhal/:/opt/hyhal/:ro \
+    -v /path/your_code_data/:/path/your_code_data/ \
+    image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724 bash
+```
+关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.sourcefind.cn/tool/)开发者社区下载安装，其它包参照requirements.txt安装：
+```bash
+pip install -r requirements.txt
+```
+
+## 数据集
+暂无
+
+## 训练
+暂无
+
+## 推理
+### transformers
+#### 单机推理
+> 模型地址、测试图片路径、输出路径请根据实际情况修改。
+```bash
+export HIP_VISIBLE_DEVICES=0
+python DeepSeek-OCR2-hf/run_dpsk_ocr2.py --model_name_or_path=deepseek-ai/DeepSeek-OCR-2 --image_file=doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57.jpg --output_path=output/image
+```
+
+### vllm
+> 模型地址、测试图片路径、输出路径请根据实际情况在`DeepSeek-OCR2-vllm/config.py`中修改。
+```bash
+export VLLM_USE_V1=0
+export HIP_VISIBLE_DEVICES=0
+# image：流式输出
+python DeepSeek-OCR2-vllm/run_dpsk_ocr2_image.py
+
+# pdf
+python DeepSeek-OCR2-vllm/run_dpsk_ocr2_pdf.py
+```
+
+## 效果展示
+<div align=center>
+    <img src="./doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57_result_with_boxes.jpg"/>
+</div>
+
+### 精度
+DCU与GPU精度一致，推理框架：vllm。
+
+## 预训练权重
+| 模型名称  | 权重大小  | DCU型号  | 最低卡数需求 |下载地址|
+|:-----:|:----------:|:----------:|:---------------------:|:----------:|
+| DeepSeek-OCR-2 | 3B | BW1000 | 1 | [ModelScope](https://modelscope.cn/models/deepseek-ai/DeepSeek-OCR-2) |
+
+## 源码仓库及问题反馈
+- https://developer.sourcefind.cn/codes/modelzoo/deepseek-ocr-2_pytorch
+
+## 参考资料
+- https://github.com/deepseek-ai/DeepSeek-OCR-2
--- a/doc/DeepSeek_OCR2_paper_layouts.pdf
+++ b/doc/DeepSeek_OCR2_paper_layouts.pdf
--- a/doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57.jpg
+++ b/doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57.jpg
--- a/doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57_result_with_boxes.jpg
+++ b/doc/docstructbench_dianzishu_zhongwenzaixian-o.O-63686436.pdf_57_result_with_boxes.jpg
--- a/doc/fig1.png
+++ b/doc/fig1.png
--- a/icon.png
+++ b/icon.png
--- a/model.properties
+++ b/model.properties
+# 模型唯一标识
+modelCode=2003
+# 模型名称
+modelName=deepseek-ocr-2_pytorch
+# 模型描述
+modelDescription=DeepSeek-OCR 2 通过引入 DeepEncoder V2 架构，实现了视觉编码从「固定扫描」向「语义推理」的范式转变！
+# 运行过程
+processType=推理
+# 算法类别
+appCategory=OCR
+# 框架类型
+frameType=pytorch
+# 加速卡类型
+accelerateType=BW1000
--- a/requirements.txt
+++ b/requirements.txt
+transformers==4.46.3
+tokenizers==0.20.3
+PyMuPDF
+img2pdf
+einops
+easydict
+addict
+Pillow
+numpy