Commit 6ad287f7 authored by liuxu3

added DeepSeek OCR API by liushengtong

parent 80c11a03
import torch
from transformers import LogitsProcessor
from typing import List, Optional, Set


class NoRepeatNGramLogitsProcessor(LogitsProcessor):
    """Bans any token that would complete an n-gram already seen within a
    sliding window of the generated sequence, except whitelisted tokens."""

    def __init__(self, ngram_size: int, window_size: int = 100, whitelist_token_ids: Optional[Set[int]] = None):
        if not isinstance(ngram_size, int) or ngram_size <= 0:
            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
        if not isinstance(window_size, int) or window_size <= 0:
            raise ValueError(f"`window_size` has to be a strictly positive integer, but is {window_size}")
        self.ngram_size = ngram_size
        self.window_size = window_size
        self.whitelist_token_ids = whitelist_token_ids or set()

    def __call__(self, input_ids: List[int], scores: torch.FloatTensor) -> torch.FloatTensor:
        if len(input_ids) < self.ngram_size:
            return scores
        # The last (ngram_size - 1) tokens form the prefix that the next
        # token would complete into a full n-gram.
        current_prefix = tuple(input_ids[-(self.ngram_size - 1):])
        # Only search for repeats inside the trailing window.
        search_start = max(0, len(input_ids) - self.window_size)
        search_end = len(input_ids) - self.ngram_size + 1
        banned_tokens = set()
        for i in range(search_start, search_end):
            ngram = tuple(input_ids[i:i + self.ngram_size])
            if ngram[:-1] == current_prefix:
                banned_tokens.add(ngram[-1])
        # Whitelisted tokens (e.g. table tags) are never banned.
        banned_tokens -= self.whitelist_token_ids
        if banned_tokens:
            scores = scores.clone()
            for token in banned_tokens:
                scores[token] = -float("inf")
        return scores
#!/bin/bash
# =============================================================================
# DeepSeek OCR vLLM Quick-Start Script
# =============================================================================
set -e
# Color definitions
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Default interpreter; a PYTHON_PATH entry in .env overrides this below.
PYTHON_PATH="/usr/bin/python3"
print_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
# Load the .env configuration
if [ -f "${SCRIPT_DIR}/.env" ]; then
    # Note: this simple export assumes values without embedded spaces.
    export $(grep -v '^#' "${SCRIPT_DIR}/.env" | xargs)
else
    print_error ".env configuration file not found"
    print_info "Copy .env.example to .env and adjust the settings"
    exit 1
fi
# Check the Python interpreter path
if [ -n "$PYTHON_PATH" ] && [ -f "$PYTHON_PATH" ]; then
    # Use the specified Python path (miniconda style)
    PYTHON_CMD="$PYTHON_PATH"
    print_info "Using specified Python: $PYTHON_PATH"
else
    # Fall back to a conda environment
    if ! command -v conda &> /dev/null; then
        print_error "conda not found; install conda or set PYTHON_PATH"
        exit 1
    fi
    PYTHON_CMD="conda run -n ${CONDA_ENV_NAME} python"
    print_info "Using conda environment: ${CONDA_ENV_NAME}"
fi
print_info "DeepSeek OCR vLLM quick start..."
# Check environment and dependencies
print_info "Checking environment and dependencies..."
if [ -n "$PYTHON_PATH" ]; then
    # Specified Python path (miniconda style)
    if ! $PYTHON_CMD -c "import vllm" &>/dev/null; then
        print_info "Installing dependencies..."
        $PYTHON_CMD -m pip install -r "${SCRIPT_DIR}/requirements.txt"
    fi
else
    # conda environment
    if ! conda env list | grep -q "^${CONDA_ENV_NAME}\s"; then
        print_info "Creating conda environment..."
        conda create -n "${CONDA_ENV_NAME}" python="${PYTHON_VERSION}" -y
    fi
    if ! conda run -n "${CONDA_ENV_NAME}" python -c "import vllm" &>/dev/null; then
        print_info "Installing dependencies..."
        conda run -n "${CONDA_ENV_NAME}" pip install -r "${SCRIPT_DIR}/requirements.txt"
    fi
fi
# Check the model path
if [ ! -d "$MODEL_PATH" ]; then
    print_warning "Model path does not exist: $MODEL_PATH"
    print_info "Set MODEL_PATH in the .env file"
    exit 1
fi
# Create the log directory
mkdir -p "${SCRIPT_DIR}/logs"
# Start the service
print_info "Starting service (port ${PORT}, GPU ${GPU_ID})..."
LOG_FILE="${SCRIPT_DIR}/logs/deepseek_ocr_server_${PORT}_$(date +%Y%m%d_%H%M%S).log"
# The launcher is identical for both interpreter styles, so write the helper
# script once and run it with whichever Python was selected above.
cat > /tmp/quick_start.py << EOF
import subprocess
import sys

cmd = [
    sys.executable, "deepseek_ocr_server.py",
    "--model-path", "${MODEL_PATH}",
    "--gpu-id", "${GPU_ID}",
    "--port", "${PORT}",
    "--host", "${HOST}",
    "--cpu-workers", "${CPU_WORKERS}"
]
print(f"[INFO] Launch command: {' '.join(cmd)}")
print(f"[INFO] Log file: ${LOG_FILE}")
with open("${LOG_FILE}", 'w') as log_file:
    process = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, text=True)
print(f"[SUCCESS] Service started (PID: {process.pid})")
print(f"[INFO] API docs: http://${HOST}:${PORT}/docs")
print(f"[INFO] Health check: curl http://${HOST}:${PORT}/health")
EOF
if [ -n "$PYTHON_PATH" ]; then
    # Specified Python path (miniconda style)
    $PYTHON_CMD /tmp/quick_start.py
else
    # conda environment
    conda run -n "${CONDA_ENV_NAME}" python /tmp/quick_start.py
fi
rm -f /tmp/quick_start.py
print_success "Startup complete!"
print_info "Monitor with:"
echo "  tail -f ${LOG_FILE}"
echo "  curl http://${HOST}:${PORT}/health"
import os
import re
from tqdm import tqdm
import torch

# Environment tweaks; these must be set before vLLM initializes.
if torch.version.cuda == '11.8':
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, MAX_CONCURRENCY, CROP_MODE, NUM_WORKERS
from concurrent.futures import ThreadPoolExecutor
import glob
from PIL import Image
from deepseek_ocr import DeepseekOCRForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
from vllm import LLM, SamplingParams
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor

ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)

llm = LLM(
    model=MODEL_PATH,
    hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
    block_size=256,
    enforce_eager=False,
    trust_remote_code=True,
    max_model_len=8192,
    swap_space=0,
    max_num_seqs=MAX_CONCURRENCY,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
)

# A smaller window is faster; token IDs 128821/128822 (<td>, </td>) are
# whitelisted so table tokens are never banned.
logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=40, window_size=90, whitelist_token_ids={128821, 128822})]

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=8192,
    logits_processors=logits_processors,
    skip_special_tokens=False,
)
class Colors:
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    RESET = '\033[0m'

def clean_formula(text):
    """Strip trailing `\\quad (...)` annotations from display formulas."""
    formula_pattern = r'\\\[(.*?)\\\]'

    def process_formula(match):
        formula = match.group(1)
        formula = re.sub(r'\\quad\s*\([^)]*\)', '', formula)
        formula = formula.strip()
        return r'\[' + formula + r'\]'

    return re.sub(formula_pattern, process_formula, text)

def re_match(text):
    """Collect grounding tags of the form <|ref|>label<|/ref|><|det|>boxes<|/det|>."""
    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    matches = re.findall(pattern, text, re.DOTALL)
    matches_other = [a_match[0] for a_match in matches]
    return matches, matches_other

def process_single_image(image):
    """Build one vLLM request for a single image."""
    cache_item = {
        "prompt": prompt,
        "multi_modal_data": {"image": DeepseekOCRProcessor().tokenize_with_images(images=[image], bos=True, eos=True, cropping=CROP_MODE)},
    }
    return cache_item
if __name__ == "__main__":
    # INPUT_PATH = OmniDocBench images path
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    print(f'{Colors.RED}glob images.....{Colors.RESET}')
    images_path = glob.glob(f'{INPUT_PATH}/*')
    images = []
    for image_path in images_path:
        image = Image.open(image_path).convert('RGB')
        images.append(image)
    prompt = PROMPT
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_inputs = list(tqdm(
            executor.map(process_single_image, images),
            total=len(images),
            desc="Pre-processed images"
        ))
    outputs_list = llm.generate(
        batch_inputs,
        sampling_params=sampling_params
    )
    output_path = OUTPUT_PATH
    os.makedirs(output_path, exist_ok=True)
    for output, image_path in zip(outputs_list, images_path):
        content = output.outputs[0].text
        stem = os.path.splitext(os.path.basename(image_path))[0]
        mmd_det_path = os.path.join(output_path, stem + '_det.md')
        with open(mmd_det_path, 'w', encoding='utf-8') as afile:
            afile.write(content)
        content = clean_formula(content)
        matches_ref, matches_other = re_match(content)
        for a_match_other in tqdm(matches_other, desc="other"):
            content = content.replace(a_match_other, '').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n').replace('<center>', '').replace('</center>', '')
        mmd_path = os.path.join(output_path, stem + '.md')
        with open(mmd_path, 'w', encoding='utf-8') as afile:
            afile.write(content)
import asyncio
import ast
import re
import os
import time

from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.registry import ModelRegistry
from deepseek_ocr import DeepseekOCRForCausalLM
from PIL import Image, ImageDraw, ImageFont, ImageOps
import numpy as np
from tqdm import tqdm
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE

ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)

def load_image(image_path):
    """Open an image and undo any EXIF rotation; fall back to the raw image."""
    try:
        image = Image.open(image_path)
        return ImageOps.exif_transpose(image)
    except Exception as e:
        print(f"error: {e}")
        try:
            return Image.open(image_path)
        except Exception:
            return None
def re_match(text):
    """Split grounding tags into image refs and everything else."""
    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    matches = re.findall(pattern, text, re.DOTALL)
    matches_image = []
    matches_other = []
    for a_match in matches:
        if '<|ref|>image<|/ref|>' in a_match[0]:
            matches_image.append(a_match[0])
        else:
            matches_other.append(a_match[0])
    return matches, matches_image, matches_other

def extract_coordinates_and_label(ref_text, image_width, image_height):
    """Parse one (full_tag, label, boxes) match; boxes are a Python list literal."""
    try:
        label_type = ref_text[1]
        # literal_eval is safer than eval for parsing model output.
        cor_list = ast.literal_eval(ref_text[2])
    except Exception as e:
        print(e)
        return None
    return (label_type, cor_list)
def draw_bounding_boxes(image, refs):
    image_width, image_height = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    font = ImageFont.load_default()
    img_idx = 0
    for ref in refs:
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
                color_a = color + (20,)
                for points in points_list:
                    # Coordinates are normalized to a 0-999 grid; scale back
                    # to pixel space.
                    x1, y1, x2, y2 = points
                    x1 = int(x1 / 999 * image_width)
                    y1 = int(y1 / 999 * image_height)
                    x2 = int(x2 / 999 * image_width)
                    y2 = int(y2 / 999 * image_height)
                    if label_type == 'image':
                        try:
                            cropped = image.crop((x1, y1, x2, y2))
                            cropped.save(f"{OUTPUT_PATH}/images/{img_idx}.jpg")
                        except Exception as e:
                            print(e)
                        img_idx += 1
                    try:
                        # Titles get a heavier outline than other regions.
                        width = 4 if label_type == 'title' else 2
                        draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
                        draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
                        text_x = x1
                        text_y = max(0, y1 - 15)
                        text_bbox = draw.textbbox((0, 0), label_type, font=font)
                        text_width = text_bbox[2] - text_bbox[0]
                        text_height = text_bbox[3] - text_bbox[1]
                        draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                       fill=(255, 255, 255, 30))
                        draw.text((text_x, text_y), label_type, font=font, fill=color)
                    except Exception:
                        pass
        except Exception:
            continue
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw

def process_image_with_refs(image, ref_texts):
    return draw_bounding_boxes(image, ref_texts)
async def stream_generate(image=None, prompt=''):
    # The engine is built per call; hoist this out for repeated use.
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
        block_size=64,
        max_model_len=8192,
        enforce_eager=False,
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    # whitelist: <td>, </td>
    logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822})]
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=8192,
        logits_processors=logits_processors,
        skip_special_tokens=False,
    )
    request_id = f"request-{int(time.time())}"
    printed_length = 0
    final_output = ''
    if image and '<image>' in prompt:
        request = {
            "prompt": prompt,
            "multi_modal_data": {"image": image}
        }
    elif prompt:
        request = {"prompt": prompt}
    else:
        raise ValueError("prompt is empty")
    # Stream tokens as they arrive, printing only the newly generated tail.
    async for request_output in engine.generate(request, sampling_params, request_id):
        if request_output.outputs:
            full_text = request_output.outputs[0].text
            new_text = full_text[printed_length:]
            print(new_text, end='', flush=True)
            printed_length = len(full_text)
            final_output = full_text
    print('\n')
    return final_output
if __name__ == "__main__":
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)
    image = load_image(INPUT_PATH)
    if image is None:
        raise FileNotFoundError(f'cannot load image: {INPUT_PATH}')
    image = image.convert('RGB')
    if '<image>' in PROMPT:
        image_features = DeepseekOCRProcessor().tokenize_with_images(images=[image], bos=True, eos=True, cropping=CROP_MODE)
    else:
        image_features = ''
    prompt = PROMPT

    result_out = asyncio.run(stream_generate(image_features, prompt))

    save_results = 1
    if save_results and '<image>' in prompt:
        print('=' * 15 + 'save results:' + '=' * 15)
        image_draw = image.copy()
        outputs = result_out
        with open(f'{OUTPUT_PATH}/result_ori.mmd', 'w', encoding='utf-8') as afile:
            afile.write(outputs)
        matches_ref, matches_images, matches_other = re_match(outputs)
        result = process_image_with_refs(image_draw, matches_ref)
        for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
            outputs = outputs.replace(a_match_image, f'![](images/{idx}.jpg)\n')
        for a_match_other in tqdm(matches_other, desc="other"):
            outputs = outputs.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
        with open(f'{OUTPUT_PATH}/result.mmd', 'w', encoding='utf-8') as afile:
            afile.write(outputs)
        if 'line_type' in outputs:
            import matplotlib.pyplot as plt
            from matplotlib.patches import Circle
            # Parse the structured output once instead of re-evaluating the
            # raw string on every access.
            parsed = ast.literal_eval(outputs)
            lines = parsed['Line']['line']
            line_type = parsed['Line']['line_type']
            endpoints = parsed['Line']['line_endpoint']
            fig, ax = plt.subplots(figsize=(3, 3), dpi=200)
            ax.set_xlim(-15, 15)
            ax.set_ylim(-15, 15)
            for idx, line in enumerate(lines):
                try:
                    p0 = ast.literal_eval(line.split(' -- ')[0])
                    p1 = ast.literal_eval(line.split(' -- ')[-1])
                    # '--' marks a dashed segment; everything else is solid.
                    style = '--' if line_type[idx] == '--' else '-'
                    ax.plot([p0[0], p1[0]], [p0[1], p1[1]], style, linewidth=0.8, color='k')
                    ax.scatter(p0[0], p0[1], s=5, color='k')
                    ax.scatter(p1[0], p1[1], s=5, color='k')
                except Exception:
                    pass
            for endpoint in endpoints:
                label = endpoint.split(': ')[0]
                (x, y) = ast.literal_eval(endpoint.split(': ')[1])
                ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
                            fontsize=5, fontweight='light')
            try:
                if 'Circle' in parsed:
                    circle_centers = parsed['Circle']['circle_center']
                    radius = parsed['Circle']['radius']
                    for center, r in zip(circle_centers, radius):
                        center = ast.literal_eval(center.split(': ')[1])
                        circle = Circle(center, radius=r, fill=False, edgecolor='black', linewidth=0.8)
                        ax.add_patch(circle)
            except Exception:
                pass
            plt.savefig(f'{OUTPUT_PATH}/geo.jpg')
            plt.close()
        result.save(f'{OUTPUT_PATH}/result_with_boxes.jpg')
import os
import ast
import io
import re

import fitz
import img2pdf
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
from PIL import Image, ImageDraw, ImageFont
from deepseek_ocr import DeepseekOCRForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
from vllm import LLM, SamplingParams
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor

ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)

llm = LLM(
    model=MODEL_PATH,
    hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
    block_size=64,
    enforce_eager=False,
    trust_remote_code=True,
    max_model_len=8192,
    swap_space=0,
    max_num_seqs=MAX_CONCURRENCY,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
    disable_mm_preprocessor_cache=True
)

# A smaller window is faster; token IDs 128821/128822 (<td>, </td>) are
# whitelisted so table tokens are never banned.
logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=50, whitelist_token_ids={128821, 128822})]

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=8192,
    logits_processors=logits_processors,
    skip_special_tokens=False,
    # Keep the stop string in the output so pages that repeated without
    # emitting EOS can be detected (and optionally skipped) below.
    include_stop_str_in_output=True,
)

class Colors:
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    RESET = '\033[0m'
def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
    """Render each PDF page to a PIL image at the requested DPI."""
    images = []
    pdf_document = fitz.open(pdf_path)
    # PyMuPDF renders at 72 DPI by default; scale up accordingly.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        pixmap = page.get_pixmap(matrix=matrix, alpha=False)
        Image.MAX_IMAGE_PIXELS = None
        # Pages are decoded through PNG bytes regardless of image_format.
        img_data = pixmap.tobytes("png")
        img = Image.open(io.BytesIO(img_data))
        if img.mode in ('RGBA', 'LA'):
            # Flatten transparency onto a white background.
            background = Image.new('RGB', img.size, (255, 255, 255))
            background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
            img = background
        images.append(img)
    pdf_document.close()
    return images

def pil_to_pdf_img2pdf(pil_images, output_path):
    """Re-assemble a list of PIL images into a single PDF."""
    if not pil_images:
        return
    image_bytes_list = []
    for img in pil_images:
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img_buffer = io.BytesIO()
        img.save(img_buffer, format='JPEG', quality=95)
        image_bytes_list.append(img_buffer.getvalue())
    try:
        pdf_bytes = img2pdf.convert(image_bytes_list)
        with open(output_path, "wb") as f:
            f.write(pdf_bytes)
    except Exception as e:
        print(f"error: {e}")
def re_match(text):
    """Split grounding tags into image refs and everything else."""
    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    matches = re.findall(pattern, text, re.DOTALL)
    matches_image = []
    matches_other = []
    for a_match in matches:
        if '<|ref|>image<|/ref|>' in a_match[0]:
            matches_image.append(a_match[0])
        else:
            matches_other.append(a_match[0])
    return matches, matches_image, matches_other

def extract_coordinates_and_label(ref_text, image_width, image_height):
    try:
        label_type = ref_text[1]
        # literal_eval is safer than eval for parsing model output.
        cor_list = ast.literal_eval(ref_text[2])
    except Exception as e:
        print(e)
        return None
    return (label_type, cor_list)

def draw_bounding_boxes(image, refs, jdx):
    image_width, image_height = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    font = ImageFont.load_default()
    img_idx = 0
    for ref in refs:
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
                color_a = color + (20,)
                for points in points_list:
                    # Coordinates are normalized to a 0-999 grid; scale back
                    # to pixel space.
                    x1, y1, x2, y2 = points
                    x1 = int(x1 / 999 * image_width)
                    y1 = int(y1 / 999 * image_height)
                    x2 = int(x2 / 999 * image_width)
                    y2 = int(y2 / 999 * image_height)
                    if label_type == 'image':
                        try:
                            cropped = image.crop((x1, y1, x2, y2))
                            cropped.save(f"{OUTPUT_PATH}/images/{jdx}_{img_idx}.jpg")
                        except Exception as e:
                            print(e)
                        img_idx += 1
                    try:
                        # Titles get a heavier outline than other regions.
                        width = 4 if label_type == 'title' else 2
                        draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
                        draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
                        text_x = x1
                        text_y = max(0, y1 - 15)
                        text_bbox = draw.textbbox((0, 0), label_type, font=font)
                        text_width = text_bbox[2] - text_bbox[0]
                        text_height = text_bbox[3] - text_bbox[1]
                        draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                       fill=(255, 255, 255, 30))
                        draw.text((text_x, text_y), label_type, font=font, fill=color)
                    except Exception:
                        pass
        except Exception:
            continue
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw

def process_image_with_refs(image, ref_texts, jdx):
    return draw_bounding_boxes(image, ref_texts, jdx)

def process_single_image(image):
    """Build one vLLM request for a single page image."""
    cache_item = {
        "prompt": prompt,
        "multi_modal_data": {"image": DeepseekOCRProcessor().tokenize_with_images(images=[image], bos=True, eos=True, cropping=CROP_MODE)},
    }
    return cache_item
if __name__ == "__main__":
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)
    print(f'{Colors.RED}PDF loading .....{Colors.RESET}')
    images = pdf_to_images_high_quality(INPUT_PATH)
    prompt = PROMPT
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_inputs = list(tqdm(
            executor.map(process_single_image, images),
            total=len(images),
            desc="Pre-processed images"
        ))
    outputs_list = llm.generate(
        batch_inputs,
        sampling_params=sampling_params
    )
    output_path = OUTPUT_PATH
    os.makedirs(output_path, exist_ok=True)
    pdf_name = INPUT_PATH.split('/')[-1]
    mmd_det_path = output_path + '/' + pdf_name.replace('.pdf', '_det.mmd')
    mmd_path = output_path + '/' + pdf_name.replace('.pdf', '.mmd')
    pdf_out_path = output_path + '/' + pdf_name.replace('.pdf', '_layouts.pdf')
    contents_det = ''
    contents = ''
    draw_images = []
    page_split = '\n<--- Page Split --->'
    jdx = 0
    for output, img in zip(outputs_list, images):
        content = output.outputs[0].text
        if '<|end▁of▁sentence|>' in content:
            content = content.replace('<|end▁of▁sentence|>', '')
        else:
            # No EOS means the page hit max_tokens (likely a repetition
            # loop); optionally skip it.
            if SKIP_REPEAT:
                continue
        contents_det += content + f'\n{page_split}\n'
        image_draw = img.copy()
        matches_ref, matches_images, matches_other = re_match(content)
        result_image = process_image_with_refs(image_draw, matches_ref, jdx)
        draw_images.append(result_image)
        for idx, a_match_image in enumerate(matches_images):
            content = content.replace(a_match_image, f'![](images/{jdx}_{idx}.jpg)\n')
        for a_match_other in matches_other:
            content = content.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n')
        contents += content + f'\n{page_split}\n'
        jdx += 1
    with open(mmd_det_path, 'w', encoding='utf-8') as afile:
        afile.write(contents_det)
    with open(mmd_path, 'w', encoding='utf-8') as afile:
        afile.write(contents)
    pil_to_pdf_img2pdf(draw_images, pdf_out_path)
# DeepSeek OCR service configuration
# Adjust the settings below for your environment
# Model path (required)
MODEL_PATH=/home/lst/deepseek_ocr2
INPUT_PATH="./use.pdf"
# GPU configuration
GPU_ID=0
# Service port
PORT=8001
# Number of CPU worker threads
CPU_WORKERS=2
# Python environment configuration
PYTHON_PATH=/usr/bin/python3
# Python version (only needed when using a conda environment name)
PYTHON_VERSION=3.10
# Service configuration
HOST=0.0.0.0
# DeepSeek OCR service configuration example
# Copy this file to .env and edit the values below
# ============================================================================
# Core configuration (must be changed)
# ============================================================================
# Model path - change to your actual model path
MODEL_PATH=/path/to/your/DeepSeek-OCR
# ============================================================================
# GPU and service configuration (optional)
# ============================================================================
# GPU ID (defaults to GPU 3)
GPU_ID=3
# Service port (default 8708)
PORT=8708
# Listen address (default 0.0.0.0, allows external access)
HOST=0.0.0.0
# Number of CPU worker threads (default 2, used for image pre-processing)
CPU_WORKERS=2
# ============================================================================
# Python environment configuration (choose one)
# ============================================================================
# Option 1: use a conda environment name (requires the conda command)
# CONDA_ENV_NAME=deepseek-ocr-vllm
# Option 2: point directly at a miniconda environment (recommended for miniconda)
# PYTHON_PATH=/home/data/nongwa/miniconda3/envs/your_env/bin/python
# Python version (only needed when using a conda environment name)
PYTHON_VERSION=3.10
BASE_SIZE = 1024
IMAGE_SIZE = 768
CROP_MODE = True
MIN_CROPS = 2
MAX_CROPS = 6  # max: 6
MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64  # image pre-processing (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True
MODEL_PATH = '/home/lst/deepseek_ocr2'  # change to your model path

# TODO: change INPUT_PATH
# .pdf: run_dpsk_ocr_pdf.py;
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# OmniDocBench images path: run_dpsk_ocr_eval_batch.py
INPUT_PATH = './use.pdf'
OUTPUT_PATH = './output'

PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
# .......

from transformers import AutoTokenizer
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)