Commit d890d8fe authored by chenych

Fix params and Update README.

parent 055b6aa1
@@ -9,7 +9,7 @@ os.environ["HIP_VISIBLE_DEVICES"] = '0'
parse = argparse.ArgumentParser()
parse.add_argument('--model_name_or_path', type=str, default='deepseek-ai/DeepSeek-OCR')
-parse.add_argument('--image_file', type=str, default='./doc/test.jpg')
+parse.add_argument('--image_file', type=str, default='./doc/test.png')
parse.add_argument('--output_path', type=str, default='./output/')
args = parse.parse_args()
......
@@ -11,18 +11,18 @@ CROP_MODE = True
MIN_CROPS= 2
MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
# TODO: change INPUT_PATH
# .pdf: run_dpsk_ocr_pdf.py;
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
-INPUT_PATH = ''
-OUTPUT_PATH = ''
+INPUT_PATH = './doc/test.png'
+OUTPUT_PATH = './output'
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
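The TODO above pairs each `INPUT_PATH` type with a runner script; a minimal sketch of that dispatch as code (hypothetical helper, not part of the repo, assuming `config.py` is importable from the working directory):

```python
import os
import subprocess

from config import INPUT_PATH  # assumption: run from DeepSeek-OCR-vllm/

def pick_runner(path: str) -> str:
    # Mapping described in the TODO comment above.
    ext = os.path.splitext(path)[1].lower()
    if ext == '.pdf':
        return 'run_dpsk_ocr_pdf.py'
    if ext in ('.jpg', '.png', '.jpeg'):
        return 'run_dpsk_ocr_image.py'
    # anything else (e.g. an OmniDocBench images directory) goes to the batch runner
    return 'run_dpsk_ocr_eval_batch.py'

if __name__ == '__main__':
    subprocess.run(['python', pick_runner(INPUT_PATH)], check=True)
```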
......
import asyncio
import re
import os
import argparse
import torch
# On CUDA 11.8, point Triton at the toolkit's own ptxas so JIT-compiled kernels match the toolkit
if torch.version.cuda == '11.8':
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
# VLLM_USE_V1='0' keeps vLLM on its legacy v0 engine code path
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ["HIP_VISIBLE_DEVICES"] = '0'
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
@@ -21,19 +17,17 @@ from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
def load_image(image_path):
    try:
        image = Image.open(image_path)
        # Apply any EXIF orientation tag so the pixels match the intended rotation
        corrected_image = ImageOps.exif_transpose(image)
        return corrected_image
    except Exception as e:
        print(f"error: {e}")
        return None
try:
@@ -78,18 +72,18 @@ def draw_bounding_boxes(image, refs):
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    # except IOError:
    font = ImageFont.load_default()

    img_idx = 0
    for i, ref in enumerate(refs):
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
                color_a = color + (20, )
@@ -110,7 +104,7 @@ def draw_bounding_boxes(image, refs):
                    print(e)
                    pass
                img_idx += 1

                try:
                    if label_type == 'title':
                        draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
@@ -121,13 +115,13 @@ def draw_bounding_boxes(image, refs):
                    text_x = x1
                    text_y = max(0, y1 - 15)

                    text_bbox = draw.textbbox((0, 0), label_type, font=font)
                    text_width = text_bbox[2] - text_bbox[0]
                    text_height = text_bbox[3] - text_bbox[1]

                    draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                   fill=(255, 255, 255, 30))
                    draw.text((text_x, text_y), label_type, font=font, fill=color)
                except Exception:
                    pass
@@ -136,30 +130,26 @@ def draw_bounding_boxes(image, refs):
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw
def process_image_with_refs(image, ref_texts):
    result_image = draw_bounding_boxes(image, ref_texts)
    return result_image
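`draw_bounding_boxes` works in pixel space, while `extract_coordinates_and_label` (used above) parses the grounding refs the model emits. A sketch of the scaling step, assuming boxes arrive on a normalized 0-999 grid (an assumption about the output format, not confirmed by this diff):

```python
def scale_box(norm_box, image_width, image_height, grid=999):
    # Map an (x1, y1, x2, y2) box from the normalized grid to pixel coordinates.
    x1, y1, x2, y2 = norm_box
    return (int(x1 / grid * image_width), int(y1 / grid * image_height),
            int(x2 / grid * image_width), int(y2 / grid * image_height))

# e.g. scale_box((100, 200, 500, 800), 1654, 2339) -> pixel-space box
```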
async def stream_generate(image=None, prompt=''):
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
-        block_size=256,
+        block_size=64,
        max_model_len=8192,
        enforce_eager=False,
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822})]  # whitelist: <td>, </td>

    sampling_params = SamplingParams(
        temperature=0.0,
@@ -167,12 +157,12 @@ async def stream_generate(image=None, prompt=''):
        logits_processors=logits_processors,
        skip_special_tokens=False,
        # ignore_eos=False,
    )

    request_id = f"request-{int(time.time())}"
    printed_length = 0

    if image and '<image>' in prompt:
        request = {
@@ -194,7 +184,7 @@ async def stream_generate(image=None, prompt=''):
            print(new_text, end='', flush=True)
            printed_length = len(full_text)

    final_output = full_text
    print('\n')
    return final_output
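`NoRepeatNGramLogitsProcessor` above suppresses OCR degeneration loops: a token is banned when emitting it would repeat a recent 30-gram within the last 90 tokens, except whitelisted table tags. A minimal self-contained sketch of the idea (not the repo's implementation; vLLM's v0 logits processors are callables taking the generated token ids and the next-token logits):

```python
def ban_repeated_ngrams(token_ids, logits, ngram_size=30, window_size=90,
                        whitelist=(128821, 128822)):  # <td>, </td>
    """Ban tokens that would complete an ngram_size-gram already present
    in the last window_size tokens; whitelisted ids stay allowed."""
    window = list(token_ids)[-window_size:]
    if len(window) < ngram_size:
        return logits
    prefix = tuple(window[-(ngram_size - 1):])  # the last n-1 generated tokens
    for i in range(len(window) - ngram_size + 1):
        if tuple(window[i:i + ngram_size - 1]) == prefix:
            candidate = window[i + ngram_size - 1]  # token that completed this n-gram before
            if candidate not in whitelist:
                logits[candidate] = float('-inf')
    return logits
```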
@@ -208,7 +198,7 @@ if __name__ == "__main__":
    image = load_image(INPUT_PATH).convert('RGB')

    if '<image>' in PROMPT:
        image_features = DeepseekOCRProcessor().tokenize_with_images(images=[image], bos=True, eos=True, cropping=CROP_MODE)
@@ -281,9 +271,9 @@ if __name__ == "__main__":
            label = endpoint.split(': ')[0]
            (x, y) = eval(endpoint.split(': ')[1])
            ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
                        fontsize=5, fontweight='light')

        try:
            if 'Circle' in eval(outputs).keys():
                circle_centers = eval(outputs)['Circle']['circle_center']
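`eval` on model output executes arbitrary expressions; when the output is a plain Python literal, `ast.literal_eval` is a safer drop-in (a suggested alternative, not what the script does):

```python
import ast

parsed = ast.literal_eval(outputs)  # raises ValueError on anything but a literal
if 'Circle' in parsed:
    circle_centers = parsed['Circle']['circle_center']
```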
......
@@ -3,17 +3,12 @@ import fitz
import img2pdf
import io
import re
from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor
if torch.version.cuda == '11.8':
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
from PIL import Image, ImageDraw, ImageFont
@@ -32,9 +27,9 @@ ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
llm = LLM(
    model=MODEL_PATH,
    hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
-    block_size=256,
+    block_size=64,
    enforce_eager=False,
    trust_remote_code=True,
    max_model_len=8192,
    swap_space=0,
    max_num_seqs=MAX_CONCURRENCY,
@@ -59,19 +54,19 @@ class Colors:
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    RESET = '\033[0m'
def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
"""
pdf2images
"""
images = []
pdf_document = fitz.open(pdf_path)
zoom = dpi / 72.0
matrix = fitz.Matrix(zoom, zoom)
for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]
@@ -88,9 +83,9 @@ def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
            background = Image.new('RGB', img.size, (255, 255, 255))
            background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
            img = background

        images.append(img)

    pdf_document.close()
    return images
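Since PDF geometry is defined in points at 72 per inch, `zoom = dpi / 72.0`, so the default `dpi=144` renders at exactly 2x. A usage sketch (paths hypothetical):

```python
pages = pdf_to_images_high_quality('./doc/test.pdf', dpi=144)
for idx, page in enumerate(pages):
    page.save(f'./output/page_{idx:03d}.png')
```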
@@ -98,18 +93,18 @@ def pil_to_pdf_img2pdf(pil_images, output_path):
    if not pil_images:
        return

    image_bytes_list = []

    for img in pil_images:
        if img.mode != 'RGB':
            img = img.convert('RGB')

        img_buffer = io.BytesIO()
        img.save(img_buffer, format='JPEG', quality=95)
        img_bytes = img_buffer.getvalue()
        image_bytes_list.append(img_bytes)

    try:
        pdf_bytes = img2pdf.convert(image_bytes_list)
        with open(output_path, "wb") as f:
@@ -156,18 +151,18 @@ def draw_bounding_boxes(image, refs, jdx):
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    # except IOError:
    font = ImageFont.load_default()

    img_idx = 0
    for i, ref in enumerate(refs):
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
                color_a = color + (20, )
@@ -188,7 +183,7 @@ def draw_bounding_boxes(image, refs, jdx):
                    print(e)
                    pass
                img_idx += 1

                try:
                    if label_type == 'title':
                        draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
@@ -199,13 +194,13 @@ def draw_bounding_boxes(image, refs, jdx):
                    text_x = x1
                    text_y = max(0, y1 - 15)

                    text_bbox = draw.textbbox((0, 0), label_type, font=font)
                    text_width = text_bbox[2] - text_bbox[0]
                    text_height = text_bbox[3] - text_bbox[1]

                    draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                   fill=(255, 255, 255, 30))
                    draw.text((text_x, text_y), label_type, font=font, fill=color)
                except Exception:
                    pass
@@ -234,7 +229,7 @@ if __name__ == "__main__":
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)

    print(f'{Colors.RED}PDF loading .....{Colors.RESET}')
@@ -245,7 +240,7 @@ if __name__ == "__main__":
    # batch_inputs = []
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_inputs = list(tqdm(
            executor.map(process_single_image, images),
            total=len(images),
@@ -292,7 +287,7 @@ if __name__ == "__main__":
            if SKIP_REPEAT:
                continue

        page_num = '\n<--- Page Split --->'
        contents_det += content + f'\n{page_num}\n'
......
@@ -16,10 +16,6 @@ DeepSeek-OCR's capabilities include:
- Multilingual processing (mixed Chinese and English recognition)
- Object localization (grounding support)
-<div align=center>
-<img src="./doc/xxx.png"/>
-</div>
## Environment Setup

### Hardware Requirements

DCU model: K100AI; nodes: 1; cards per node: 1.
@@ -28,7 +24,7 @@ DCU model: K100AI; nodes: 1; cards per node: 1.
### Docker (Option 1)
```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
+docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
@@ -52,7 +48,7 @@ DTK: 25.04.1
python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041
transformers: 4.46.3
-vllm: 0.9.2
+vllm: 0.8.5
```
`Tip: the DTK driver, PyTorch, and the other DCU-related tool versions above must correspond exactly, one to one.` Install the remaining non-deep-learning libraries from requirements.txt:
```bash
@@ -67,25 +63,25 @@ pip install -r requirements.txt
## Inference
### transformers
-Adjust the model path, test image path, and output path to your setup.
+> Adjust the model path, test image path, and output path to your setup.
```bash
cd DeepSeek-OCR-master
-python run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_path=./doc/test.jpg --output_path=./output
+python DeepSeek-OCR-hf/run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_file=./doc/test.png --output_path=./output
```
### vllm
> Adjust the model path, test image path, and output path in `DeepSeek-OCR-vllm/config.py` to your setup.
```bash
cd DeepSeek-OCR-master
-# image: streaming output
-python run_dpsk_ocr_image.py
+export VLLM_USE_V1=0
+# image: streaming output
+python DeepSeek-OCR-vllm/run_dpsk_ocr_image.py
# pdf
-python run_dpsk_ocr_pdf.py
+python DeepSeek-OCR-vllm/run_dpsk_ocr_pdf.py
```
## Result
<div align=center>
<img src="./doc/xxx.png"/>
<img src="./doc/result_with_boxes_vllm.jpg"/>
</div>
### Accuracy
......
-FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
\ No newline at end of file
+FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
\ No newline at end of file
# Unique model identifier
-modelCode=205
+modelCode=1781
# Model name
-modelName=PaddleOCR_paddle_onnxruntime
+modelName=deepseek-ocr_pytorch
# Model description
-modelDescription=paddleocr_paddle_onnxruntime is a model implementing character detection and recognition
+modelDescription=DeepSeek-OCR, DeepSeek's brand-new vision-text compression model
# Application scenarios
-appScenario=Inference,Training,OCR,Manufacturing,Finance,Transportation,Education,Healthcare
+appScenario=Inference,OCR,Manufacturing,Finance,Transportation,Education,Healthcare
# Framework type
-frameType=paddle,onnxruntime
+frameType=pytorch,vllm
# Accelerator type
accelerateType=K100AI
\ No newline at end of file