Merge pull request #2625 from opendatalab/release-2.0.0

Release 2.0.0

Merge pull request #2625 from opendatalab/release-2.0.0
Release 2.0.0
6ab12348 · Xiaomeng Zhao · GitHub · 9487d33d · 4fbec469 · 9487d33d
Unverified Commit 6ab12348 authored Jun 13, 2025 by Xiaomeng Zhao Committed by GitHub Jun 13, 2025
20 changed files
--- a/magic_pdf/pre_proc/__init__.py
+++ b/magic_pdf/pre_proc/__init__.py
--- a/magic_pdf/pre_proc/construct_page_dict.py
+++ b/magic_pdf/pre_proc/construct_page_dict.py
-
-def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                                    images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
-    """Pack the per-page preprocessing results into one page dictionary.
-
-    Every argument is stored as-is (nothing is copied or validated);
-    ``page_w`` and ``page_h`` are combined into the two-element
-    ``page_size`` list.
-
-    Returns:
-        dict: keys ``preproc_blocks``, ``layout_bboxes``, ``page_idx``,
-        ``page_size``, ``_layout_tree``, ``images``, ``tables``,
-        ``interline_equations``, ``discarded_blocks``, ``need_drop`` and
-        ``drop_reason``, inserted in that order.
-    """
-    page_info = {}
-    page_info['preproc_blocks'] = blocks
-    page_info['layout_bboxes'] = layout_bboxes
-    page_info['page_idx'] = page_id
-    page_info['page_size'] = [page_w, page_h]
-    page_info['_layout_tree'] = layout_tree
-    page_info['images'] = images
-    page_info['tables'] = tables
-    page_info['interline_equations'] = interline_equations
-    page_info['discarded_blocks'] = discarded_blocks
-    page_info['need_drop'] = need_drop
-    page_info['drop_reason'] = drop_reason
-    return page_info
--- a/magic_pdf/pre_proc/cut_image.py
+++ b/magic_pdf/pre_proc/cut_image.py
-from loguru import logger
-
-from magic_pdf.config.ocr_content_type import ContentType
-from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.pdf_image_tools import cut_image
-
-
-def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
-    """Crop every image/table span out of the page and record the crop path.
-
-    Image spans are written under ``<pdf_bytes_md5>/images`` and table spans
-    under ``<pdf_bytes_md5>/tables`` (paths built with ``join_path``). A span
-    is skipped when its bbox fails ``check_img_bbox`` or when no
-    ``imageWriter`` was supplied; spans of any other type are left untouched.
-
-    Returns:
-        The same ``spans`` list, with ``image_path`` set on each cropped span.
-    """
-    for span in spans:
-        span_type = span['type']
-        if span_type == ContentType.Image:
-            sub_dir = 'images'
-        elif span_type == ContentType.Table:
-            sub_dir = 'tables'
-        else:
-            continue
-
-        # The bbox check runs first so an invalid box is still logged even
-        # when there is no writer.
-        if not check_img_bbox(span['bbox']) or not imageWriter:
-            continue
-
-        span['image_path'] = cut_image(
-            span['bbox'],
-            page_id,
-            page,
-            return_path=join_path(pdf_bytes_md5, sub_dir),
-            imageWriter=imageWriter,
-        )
-
-    return spans
-
-
-def check_img_bbox(bbox) -> bool:
-    """Return True when ``bbox`` ([x0, y0, x1, y1]) has positive width and height.
-
-    A box whose left edge is not strictly left of its right edge, or whose
-    top edge is not strictly above its bottom edge, is logged as a warning
-    and rejected.
-    """
-    zero_or_negative_width = bbox[0] >= bbox[2]
-    zero_or_negative_height = bbox[1] >= bbox[3]
-    if zero_or_negative_width or zero_or_negative_height:
-        logger.warning(f'image_bboxes: 错误的box, {bbox}')
-        return False
-    return True
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
-
-from magic_pdf.config.drop_tag import DropTag
-from magic_pdf.config.ocr_content_type import BlockType
-from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
-
-
-def remove_overlaps_low_confidence_spans(spans):
-    """Drop the lower-scoring span of every pair whose bboxes nearly coincide.
-
-    Two spans are treated as duplicates when the IoU of their bboxes exceeds
-    0.9. The span with the lower ``score`` is dropped; on a tie the second
-    span of the pair is dropped. Spans are compared by equality, so two equal
-    span dicts are never compared against each other.
-
-    Returns:
-        tuple: ``(spans, dropped_spans)``. ``spans`` is the input list with
-        the dropped spans removed in place; each dropped span is tagged with
-        ``DropTag.SPAN_OVERLAP``.
-    """
-    dropped_spans = []
-    for span1 in spans:
-        for span2 in spans:
-            if span1 == span2:
-                continue
-            # A span that is already dropped must not take part in further comparisons.
-            if span1 in dropped_spans or span2 in dropped_spans:
-                continue
-            if not calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
-                continue
-            loser = span1 if span1['score'] < span2['score'] else span2
-            if loser is not None and loser not in dropped_spans:
-                dropped_spans.append(loser)
-
-    for loser in dropped_spans:
-        # Remove before tagging: list.remove() matches by equality and the tag changes the dict.
-        spans.remove(loser)
-        loser['tag'] = DropTag.SPAN_OVERLAP
-
-    return spans, dropped_spans
-
-
-def check_chars_is_overlap_in_span(chars):
-    """Return True if any two chars have bboxes whose IoU exceeds 0.35.
-
-    Each unordered pair is checked once, in list order, and the scan stops
-    at the first overlapping pair.
-    """
-    return any(
-        calculate_iou(earlier['bbox'], later['bbox']) > 0.35
-        for position, earlier in enumerate(chars)
-        for later in chars[position + 1:]
-    )
-
-
-def remove_x_overlapping_chars(span, median_width):
-    """
-    Remove adjacent characters of a span that overlap heavily on the x-axis.
-
-    Neighbouring characters are compared pairwise. When their horizontal
-    overlap is wider than 30% of ``median_width`` and they are either the
-    same character or one of them is a space, the narrower of the two is
-    removed (the second one on equal widths) and the new pair at the same
-    position is checked again.
-
-    Args:
-        span (dict): A span containing a list of chars, each with a ``c``
-                    character and bbox coordinates in the format
-                    [x0, y0, x1, y1]
-        median_width: Reference character width that the overlap threshold
-                    is derived from
-
-    Returns:
-        dict: The same span object, with overlapping characters removed in place
-    """
-    if 'chars' not in span or len(span['chars']) < 2:
-        return span
-
-    overlap_threshold = median_width * 0.3
-    chars = span['chars']
-
-    idx = 0
-    while idx < len(chars) - 1:
-        left, right = chars[idx], chars[idx + 1]
-
-        # Horizontal extent shared by both boxes (empty when x_right <= x_left).
-        x_left = max(left['bbox'][0], right['bbox'][0])
-        x_right = min(left['bbox'][2], right['bbox'][2])
-        overlaps_enough = x_right > x_left and (x_right - x_left) > overlap_threshold
-
-        if overlaps_enough and (left['c'] == right['c'] or left['c'] == ' ' or right['c'] == ' '):
-            left_width = left['bbox'][2] - left['bbox'][0]
-            right_width = right['bbox'][2] - right['bbox'][0]
-            # Drop the narrower character; idx stays put so the new pair is re-checked.
-            chars.pop(idx if left_width < right_width else idx + 1)
-        else:
-            idx += 1
-
-    return span
-
-
-def remove_overlaps_min_spans(spans):
-    """Drop the smaller span of every pair that overlaps beyond a 0.65 ratio.
-
-    For each pair, ``get_minbox_if_overlap_by_ratio`` is asked for a box
-    (ratio 0.65); when it returns one, the first span in ``spans`` whose bbox
-    equals that box is dropped. Spans are compared by equality, so two equal
-    span dicts are never compared against each other.
-
-    Returns:
-        tuple: ``(spans, dropped_spans)``. ``spans`` is the input list with
-        the dropped spans removed in place; each dropped span is tagged with
-        ``DropTag.SPAN_OVERLAP``.
-    """
-    dropped_spans = []
-    for span1 in spans:
-        for span2 in spans:
-            if span1 == span2:
-                continue
-            # A span that is already dropped must not take part in further comparisons.
-            if span1 in dropped_spans or span2 in dropped_spans:
-                continue
-            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
-            if overlap_box is None:
-                continue
-            smaller = next((candidate for candidate in spans if candidate['bbox'] == overlap_box), None)
-            if smaller is None or smaller in dropped_spans:
-                continue
-            dropped_spans.append(smaller)
-
-    for smaller in dropped_spans:
-        # Remove before tagging: list.remove() matches by equality and the tag changes the dict.
-        spans.remove(smaller)
-        smaller['tag'] = DropTag.SPAN_OVERLAP
-
-    return spans, dropped_spans
-
-
-def get_qa_need_list_v2(blocks):
-    """Split ``blocks`` into image, table and interline-equation blocks.
-
-    Blocks of any other type are ignored. The returned lists hold the block
-    objects themselves (not copies), in their original order.
-
-    Returns:
-        tuple: ``(images, tables, interline_equations)``.
-    """
-    images, tables, interline_equations = [], [], []
-
-    for block in blocks:
-        block_type = block['type']
-        if block_type == BlockType.Image:
-            bucket = images
-        elif block_type == BlockType.Table:
-            bucket = tables
-        elif block_type == BlockType.InterlineEquation:
-            bucket = interline_equations
-        else:
-            continue
-        bucket.append(block)
-
-    return images, tables, interline_equations
--- a/magic_pdf/pre_proc/remove_bbox_overlap.py
+++ b/magic_pdf/pre_proc/remove_bbox_overlap.py
-from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.boxbase import _is_in, _is_part_overlap
-
-
-def _remove_overlap_between_bbox(bbox1, bbox2):
-    """Shrink two partially overlapping boxes so they no longer overlap.
-
-    Both boxes are ``[x0, y0, x1, y1]``. Nothing changes unless
-    ``_is_part_overlap(bbox1, bbox2)`` is truthy. Otherwise the boxes are
-    separated along the axis on which the intersection is thinner: the facing
-    edges are moved to either side of a (floor-divided) midpoint, leaving a
-    0.5 gap between them.
-
-    Returns:
-        tuple: ``(bbox1, bbox2, drop_reason)``. ``drop_reason`` is ``None``
-        on success or when there was nothing to do, and
-        ``DropReason.NEGATIVE_BBOX_AREA`` (with the ORIGINAL boxes) when the
-        adjustment would leave either box without positive width and height.
-    """
-    if _is_part_overlap(bbox1, bbox2):
-        ix0, iy0, ix1, iy1 = bbox1
-        x0, y0, x1, y1 = bbox2
-
-        # Width and height of the intersection rectangle.
-        diff_x = min(x1, ix1) - max(x0, ix0)
-        diff_y = min(y1, iy1) - max(y0, iy0)
-
-        if diff_y > diff_x:
-            # Intersection is taller than wide: separate the boxes horizontally.
-            if x1 >= ix1:
-                # bbox2 reaches at least as far right as bbox1:
-                # pull bbox1's right edge in and push bbox2's left edge out.
-                mid = (x0 + ix1) // 2
-                ix1 = min(mid - 0.25, ix1)
-                x0 = max(mid + 0.25, x0)
-            else:
-                # Otherwise adjust bbox1's left edge and bbox2's right edge.
-                mid = (ix0 + x1) // 2
-                ix0 = max(mid + 0.25, ix0)
-                x1 = min(mid - 0.25, x1)
-        else:
-            # Intersection is at least as wide as tall: separate the boxes vertically.
-            if y1 >= iy1:
-                # bbox2's bottom is at or below bbox1's bottom:
-                # push bbox2's top down and pull bbox1's bottom up.
-                mid = (y0 + iy1) // 2
-                y0 = max(mid + 0.25, y0)
-                iy1 = min(iy1, mid - 0.25)
-            else:
-                # Otherwise adjust bbox2's bottom edge and bbox1's top edge.
-                mid = (iy0 + y1) // 2
-                y1 = min(y1, mid - 0.25)
-                iy0 = max(mid + 0.25, iy0)
-
-        # The min/max calls above only ever shrink a box, so it may collapse.
-        if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
-            bbox1 = [ix0, iy0, ix1, iy1]
-            bbox2 = [x0, y0, x1, y1]
-            return bbox1, bbox2, None
-        else:
-            # A box collapsed: hand back the untouched inputs and report why.
-            return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
-    else:
-        return bbox1, bbox2, None
-
-
-def _remove_overlap_between_bboxes(arr):
-    """Resolve overlaps among a list of ``{'bbox', 'score'}`` dicts.
-
-    The dicts in ``arr`` are mutated in place (their ``bbox`` may be shrunk).
-
-    Returns:
-        tuple: ``(res, drop_reasons)``. ``res`` has the same length as
-        ``arr``; position ``i`` holds the (possibly adjusted) entry, or
-        ``None`` if that entry was dropped. ``drop_reasons`` collects one
-        reason per overlap that could not be resolved.
-    """
-    drop_reasons = []
-    N = len(arr)
-    keeps = [True] * N
-    res = [None] * N
-    # Pass 1: discard entry i when _is_in(bbox_i, bbox_j) holds for some other j.
-    # NOTE(review): presumably _is_in means "bbox_i lies inside bbox_j" — confirm in boxbase.
-    for i in range(N):
-        for j in range(N):
-            if i == j:
-                continue
-            if _is_in(arr[i]['bbox'], arr[j]['bbox']):
-                keeps[i] = False
-
-    # Pass 2: compare each surviving entry with every entry accepted so far.
-    for idx, v in enumerate(arr):
-        if not keeps[idx]:
-            continue
-        for i in range(N):
-            if res[i] is None:
-                continue
-
-            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
-                v['bbox'], res[i]['bbox']
-            )
-            if drop_reason is None:
-                # Overlap resolved (or absent): store the adjusted boxes on both entries.
-                v['bbox'] = bbox1
-                res[i]['bbox'] = bbox2
-            else:
-                # Unresolvable overlap: keep the higher-scoring entry; a tie drops the current one.
-                if v['score'] > res[i]['score']:
-                    keeps[i] = False
-                    res[i] = None
-                else:
-                    keeps[idx] = False
-                drop_reasons.append(drop_reason)
-        # There is no break above, so a rejected entry is still compared with
-        # the remaining accepted entries before being skipped here.
-        if keeps[idx]:
-            res[idx] = v
-    return res, drop_reasons
-
-
-def remove_overlap_between_bbox_for_span(spans):
-    """Resolve bbox overlaps between spans.
-
-    Each span is reduced to its ``bbox`` and ``score`` (0.1 when the span has
-    no score) and passed to ``_remove_overlap_between_bboxes``. Surviving
-    spans receive their adjusted bbox in place.
-
-    Returns:
-        tuple: ``(kept_spans, drop_reasons)``.
-    """
-    candidates = []
-    for span in spans:
-        candidates.append({'bbox': span['bbox'], 'score': span.get('score', 0.1)})
-
-    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
-
-    kept_spans = []
-    for position, entry in enumerate(resolved):
-        if entry is None:
-            continue
-        span = spans[position]
-        span['bbox'] = entry['bbox']
-        kept_spans.append(span)
-    return kept_spans, drop_reasons
-
-
-def remove_overlap_between_bbox_for_block(all_bboxes):
-    """Resolve bbox overlaps between block records.
-
-    Each record is a mutable sequence whose first four items are the bbox and
-    whose last item is used as the score. Surviving records get their first
-    four items overwritten in place with the adjusted bbox.
-
-    Returns:
-        tuple: ``(kept_records, drop_reasons)``.
-    """
-    candidates = []
-    for record in all_bboxes:
-        candidates.append({'bbox': record[:4], 'score': record[-1]})
-
-    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
-
-    kept_records = []
-    for position, entry in enumerate(resolved):
-        if entry is None:
-            continue
-        record = all_bboxes[position]
-        record[:4] = entry['bbox']
-        kept_records.append(record)
-    return kept_records, drop_reasons
--- a/magic_pdf/resources/model_config/model_configs.yaml
+++ b/magic_pdf/resources/model_config/model_configs.yaml
-# Locations of the model weight files, keyed by model name.
-# NOTE(review): the paths look relative to a models root directory — confirm against the loader.
-weights:
-  layoutlmv3: Layout/LayoutLMv3/model_final.pth
-  doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
-  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
-  unimernet_small: MFR/unimernet_hf_small_2503
-  struct_eqtable: TabRec/StructEqTable
-  tablemaster: TabRec/TableMaster
-  rapid_table: TabRec/RapidTable
\ No newline at end of file
--- a/magic_pdf/resources/slanet_plus/slanet-plus.onnx
+++ b/magic_pdf/resources/slanet_plus/slanet-plus.onnx
--- a/magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt
+++ b/magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt
--- a/magic_pdf/spark/__init__.py
+++ b/magic_pdf/spark/__init__.py
--- a/magic_pdf/spark/spark_api.py
+++ b/magic_pdf/spark/spark_api.py
-from loguru import logger
-
-from magic_pdf.config.drop_reason import DropReason
-
-
-def get_data_source(jso: dict):
-    """Return ``jso['data_source']``, falling back to ``jso['file_source']``.
-
-    The fallback applies only when ``data_source`` is missing or ``None``;
-    other falsy values (such as ``''``) are returned unchanged. The result is
-    ``None`` when neither key yields a value.
-    """
-    primary = jso.get('data_source')
-    return jso.get('file_source') if primary is None else primary
-
-
-def get_data_type(jso: dict):
-    """Return ``jso['data_type']``, falling back to ``jso['file_type']``.
-
-    The fallback applies only when ``data_type`` is missing or ``None``;
-    other falsy values are returned unchanged. The result is ``None`` when
-    neither key yields a value.
-    """
-    primary = jso.get('data_type')
-    return jso.get('file_type') if primary is None else primary
-
-
-def get_bookid(jso: dict):
-    """Return ``jso['bookid']``, falling back to ``jso['original_file_id']``.
-
-    The fallback applies only when ``bookid`` is missing or ``None``; other
-    falsy values are returned unchanged. The result is ``None`` when neither
-    key yields a value.
-    """
-    primary = jso.get('bookid')
-    return jso.get('original_file_id') if primary is None else primary
-
-
-def exception_handler(jso: dict, e):
-    """Log ``e`` and flag the record ``jso`` as dropped because of it.
-
-    Sets ``_need_drop`` to True, ``_drop_reason`` to ``DropReason.Exception``
-    and ``_exception`` to the error text, then returns the same dict.
-    """
-    logger.exception(e)
-    jso.update({
-        '_need_drop': True,
-        '_drop_reason': DropReason.Exception,
-        '_exception': f'ERROR: {e}',
-    })
-    return jso
-
-
-def get_bookname(jso: dict):
-    """Return ``'<data source>/<file_id>'`` for the record ``jso``.
-
-    The data source comes from ``get_data_source``; a missing part is
-    rendered as the text ``None``.
-    """
-    source = get_data_source(jso)
-    return f'{source}/{jso.get("file_id")}'
-
-
-def spark_json_extractor(jso: dict) -> dict:
-    """Extract the PDF type and the model output from a record.
-
-    Returns:
-        dict: ``_pdf_type`` copied from ``jso['_pdf_type']`` and
-        ``model_list`` taken from ``jso['doc_layout_result']``.
-
-    Raises:
-        KeyError: if either source key is missing.
-    """
-    pdf_type = jso['_pdf_type']
-    model_list = jso['doc_layout_result']
-    return {'_pdf_type': pdf_type, 'model_list': model_list}
--- a/magic_pdf/tools/__init__.py
+++ b/magic_pdf/tools/__init__.py
--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
-import os
-import shutil
-import tempfile
-from pathlib import Path
-
-import click
-import fitz
-from loguru import logger
-
-import magic_pdf.model as model_config
-from magic_pdf.data.batch_build_dataset import batch_build_dataset
-from magic_pdf.data.data_reader_writer import FileBasedDataReader
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.libs.version import __version__
-from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
-from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
-
-pdf_suffixes = ['.pdf']
-ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
-image_suffixes = ['.png', '.jpeg', '.jpg']
-
-
-# Command-line entry point: parses a single file, or every supported file
-# directly inside a directory, and writes the results below ``output_dir``.
-# Documented with comments rather than a docstring because click would show a
-# docstring as the command's help text.
-@click.command()
-@click.version_option(__version__,
-                      '--version',
-                      '-v',
-                      help='display the version and exit')
-@click.option(
-    '-p',
-    '--path',
-    'path',
-    type=click.Path(exists=True),
-    required=True,
-    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
-)
-@click.option(
-    '-o',
-    '--output-dir',
-    'output_dir',
-    type=click.Path(),
-    required=True,
-    help='output local directory',
-)
-@click.option(
-    '-m',
-    '--method',
-    'method',
-    type=parse_pdf_methods,
-    help="""the method for parsing pdf.
-ocr: using ocr technique to extract information from pdf.
-txt: suitable for the text-based pdf only and outperform ocr.
-auto: automatically choose the best method for parsing pdf from ocr and txt.
-without method specified, auto will be used by default.""",
-    default='auto',
-)
-@click.option(
-    '-l',
-    '--lang',
-    'lang',
-    type=str,
-    help="""
-    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
-    You should input "Abbreviation" with language form url:
-    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
-    """,
-    default=None,
-)
-@click.option(
-    '-d',
-    '--debug',
-    'debug_able',
-    type=bool,
-    help='Enables detailed debugging information during the execution of the CLI commands.',
-    default=False,
-)
-@click.option(
-    '-s',
-    '--start',
-    'start_page_id',
-    type=int,
-    help='The starting page for PDF parsing, beginning from 0.',
-    default=0,
-)
-@click.option(
-    '-e',
-    '--end',
-    'end_page_id',
-    type=int,
-    help='The ending page for PDF parsing, beginning from 0.',
-    default=None,
-)
-def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
-    os.makedirs(output_dir, exist_ok=True)
-    # Scratch directory for the PDFs produced from office documents and images.
-    temp_dir = tempfile.mkdtemp()
-
-    def read_fn(path: Path):
-        # Return the PDF bytes for one input file, converting it to PDF first when needed.
-        if path.suffix in ms_office_suffixes:
-            convert_file_to_pdf(str(path), temp_dir)
-            fn = os.path.join(temp_dir, f'{path.stem}.pdf')
-        elif path.suffix in image_suffixes:
-            with open(str(path), 'rb') as f:
-                bits = f.read()
-            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-            fn = os.path.join(temp_dir, f'{path.stem}.pdf')
-            with open(fn, 'wb') as f:
-                f.write(pdf_bytes)
-        elif path.suffix in pdf_suffixes:
-            fn = str(path)
-        else:
-            raise Exception(f'Unknown file suffix: {path.suffix}')
-
-        disk_rw = FileBasedDataReader(os.path.dirname(fn))
-        return disk_rw.read(os.path.basename(fn))
-
-    def parse_doc(doc_path: Path, dataset: Dataset | None = None):
-        # Parse one document; failures are logged instead of propagated.
-        try:
-            file_name = str(Path(doc_path).stem)
-            if dataset is None:
-                pdf_data_or_dataset = read_fn(doc_path)
-            else:
-                pdf_data_or_dataset = dataset
-            do_parse(
-                output_dir,
-                file_name,
-                pdf_data_or_dataset,
-                [],
-                method,
-                debug_able,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-                lang=lang
-            )
-
-        except Exception as e:
-            logger.exception(e)
-
-    try:
-        if os.path.isdir(path):
-            # Directory mode: convert every supported file to PDF, then parse them as one batch.
-            # NOTE: start_page_id / end_page_id are not passed on in this mode.
-            doc_paths = []
-            for doc_path in Path(path).glob('*'):
-                if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
-                    if doc_path.suffix in ms_office_suffixes:
-                        convert_file_to_pdf(str(doc_path), temp_dir)
-                        doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
-                    elif doc_path.suffix in image_suffixes:
-                        with open(str(doc_path), 'rb') as f:
-                            bits = f.read()
-                            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-                        fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
-                        with open(fn, 'wb') as f:
-                            f.write(pdf_bytes)
-                        doc_path = Path(fn)
-                    doc_paths.append(doc_path)
-            datasets = batch_build_dataset(doc_paths, 4, lang)
-            batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
-        else:
-            parse_doc(Path(path))
-    finally:
-        # mkdtemp() leaves cleanup to the caller: always remove the scratch
-        # directory, even when a conversion or the batch parse raises.
-        shutil.rmtree(temp_dir)
-
-
-if __name__ == '__main__':
-    cli()
--- a/magic_pdf/tools/cli_dev.py
+++ b/magic_pdf/tools/cli_dev.py
-import json as json_parse
-import os
-from pathlib import Path
-
-import click
-
-import magic_pdf.model as model_config
-from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
-from magic_pdf.libs.config_reader import get_s3_config
-from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
-                                       remove_non_official_s3_args)
-from magic_pdf.libs.version import __version__
-from magic_pdf.tools.common import do_parse, parse_pdf_methods
-
-
-def read_s3_path(s3path):
-    """Read the object (or byte range) addressed by ``s3path`` and return it.
-
-    Credentials are looked up per bucket via ``get_s3_config``. When the path
-    carries exactly two range parameters they are used as the start and end
-    of the read; otherwise ``(0, -1)`` is passed to ``read_at``.
-    NOTE(review): presumably ``-1`` means "until the end of the object" —
-    confirm against ``S3DataReader.read_at``.
-    """
-    bucket, _ = parse_s3path(s3path)
-    access_key, secret_key, endpoint = get_s3_config(bucket)
-    reader = S3DataReader('', bucket, access_key, secret_key, endpoint, 'auto')
-
-    range_params = parse_s3_range_params(s3path)
-    if range_params is not None and len(range_params) == 2:
-        byte_start = int(range_params[0])
-        byte_end = int(range_params[1])
-    else:
-        byte_start, byte_end = 0, -1
-
-    return reader.read_at(remove_non_official_s3_args(s3path), byte_start, byte_end)
-
-
-# Root click command group of the developer CLI; subcommands are attached to
-# it through ``@cli.command()``. Documented with a comment rather than a
-# docstring because click would show a docstring as the group's help text.
-@click.group()
-@click.version_option(__version__, '--version', '-v', help='显示版本信息')
-def cli():
-    pass
-
-
-# ``jsonl`` subcommand: reads one JSON record (from a local file or from S3),
-# fetches the PDF that the record points to from S3 and parses it with the
-# model output stored in the record's ``doc_layout_result``.
-# Documented with comments rather than a docstring because click would show a
-# docstring as the command's help text.
-@cli.command()
-@click.option(
-    '-j',
-    '--jsonl',
-    'jsonl',
-    type=str,
-    help='输入 jsonl 路径，本地或者 s3 上的文件',
-    required=True,
-)
-@click.option(
-    '-m',
-    '--method',
-    'method',
-    type=parse_pdf_methods,
-    help='指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
-    default='auto',
-)
-@click.option(
-    '-o',
-    '--output-dir',
-    'output_dir',
-    type=click.Path(),
-    required=True,
-    help='输出到本地目录',
-)
-def jsonl(jsonl, method, output_dir):
-    # The layout result comes from the record, so the built-in models are switched off.
-    model_config.__use_inside_model__ = False
-    if jsonl.startswith('s3://'):
-        jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
-    else:
-        # Decode as UTF-8 explicitly, matching the S3 branch; the platform
-        # default encoding could mis-read non-ASCII records.
-        with open(jsonl, encoding='utf-8') as f:
-            # Only the first line (record) of the local file is used.
-            jso = json_parse.loads(f.readline())
-    os.makedirs(output_dir, exist_ok=True)
-    # The PDF location is stored under 'file_location', or under 'path' as a fallback.
-    s3_file_path = jso.get('file_location')
-    if s3_file_path is None:
-        s3_file_path = jso.get('path')
-    pdf_file_name = Path(s3_file_path).stem
-    pdf_data = read_s3_path(s3_file_path)
-
-    print(pdf_file_name, jso, method)
-    do_parse(
-        output_dir,
-        pdf_file_name,
-        pdf_data,
-        jso['doc_layout_result'],
-        method,
-        False,
-        f_dump_content_list=True,
-        f_draw_model_bbox=True,
-    )
-
-
-# ``pdf`` subcommand: parses a local PDF with a local JSON file holding the
-# model inference output. Documented with comments rather than a docstring
-# because click would show a docstring as the command's help text.
-@cli.command()
-@click.option(
-    '-p',
-    '--pdf',
-    'pdf',
-    type=click.Path(exists=True),
-    required=True,
-    help='本地 PDF 文件',
-)
-@click.option(
-    '-j',
-    '--json',
-    'json_data',
-    type=click.Path(exists=True),
-    required=True,
-    help='本地模型推理出的 json 数据',
-)
-@click.option('-o',
-              '--output-dir',
-              'output_dir',
-              type=click.Path(),
-              required=True,
-              help='本地输出目录')
-@click.option(
-    '-m',
-    '--method',
-    'method',
-    type=parse_pdf_methods,
-    help='指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
-    default='auto',
-)
-def pdf(pdf, json_data, output_dir, method):
-    # The layout result is supplied by the caller, so the built-in models are switched off.
-    model_config.__use_inside_model__ = False
-    full_pdf_path = os.path.realpath(pdf)
-    os.makedirs(output_dir, exist_ok=True)
-
-    def load_bytes(file_path):
-        # Read a local file through the project's file-based reader.
-        parent_dir, leaf_name = os.path.split(file_path)
-        return FileBasedDataReader(parent_dir).read(leaf_name)
-
-    raw_model_json = load_bytes(json_data)
-    model_json_list = json_parse.loads(raw_model_json.decode('utf-8'))
-
-    file_name = str(Path(full_pdf_path).stem)
-    pdf_data = load_bytes(full_pdf_path)
-    do_parse(
-        output_dir,
-        file_name,
-        pdf_data,
-        model_json_list,
-        method,
-        False,
-        f_dump_content_list=True,
-        f_draw_model_bbox=True,
-    )
-
-
-if __name__ == '__main__':
-    cli()
--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
-import os
-
-import click
-import fitz
-from loguru import logger
-
-import magic_pdf.model as model_config
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.data.dataset import Dataset, PymuDocDataset
-from magic_pdf.libs.draw_bbox import draw_char_bbox
-from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
-                                                         doc_analyze)
-
-# from io import BytesIO
-# from pypdf import PdfReader, PdfWriter
-
-
-def prepare_env(output_dir, pdf_file_name, method):
-    """Create the output folders for one document and return their paths.
-
-    The markdown folder is ``<output_dir>/<pdf_file_name>/<method>`` and the
-    image folder is its ``images`` subfolder; both are created if missing.
-
-    Returns:
-        tuple: ``(local_image_dir, local_md_dir)``.
-    """
-    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
-    local_image_dir = os.path.join(str(local_md_dir), 'images')
-    for folder in (local_image_dir, local_md_dir):
-        os.makedirs(folder, exist_ok=True)
-    return local_image_dir, local_md_dir
-
-
-# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
-#     # 将字节数据包装在 BytesIO 对象中
-#     pdf_file = BytesIO(pdf_bytes)
-#     # 读取 PDF 的字节数据
-#     reader = PdfReader(pdf_file)
-#     # 创建一个新的 PDF 写入器
-#     writer = PdfWriter()
-#     # 将所有页面添加到新的 PDF 写入器中
-#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
-#     if end_page_id > len(reader.pages) - 1:
-#         logger.warning("end_page_id is out of range, use pdf_docs length")
-#         end_page_id = len(reader.pages) - 1
-#     for i, page in enumerate(reader.pages):
-#         if start_page_id <= i <= end_page_id:
-#             writer.add_page(page)
-#     # 创建一个字节缓冲区来存储输出的 PDF 数据
-#     output_buffer = BytesIO()
-#     # 将 PDF 写入字节缓冲区
-#     writer.write(output_buffer)
-#     # 获取字节缓冲区的内容
-#     converted_pdf_bytes = output_buffer.getvalue()
-#     return converted_pdf_bytes
-
-
-def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
-    """Return a new PDF holding only the requested page range of ``pdf_bytes``.
-
-    Args:
-        pdf_bytes: Raw bytes of the source PDF.
-        start_page_id: Zero-based index of the first page to keep.
-        end_page_id: Zero-based index of the last page to keep. ``None`` or a
-            negative value selects the last page of the document; a value
-            past the end is clamped to the last page with a warning.
-
-    Returns:
-        bytes: The serialized PDF containing the selected pages.
-    """
-    # Both documents are closed on exit; previously they were left open.
-    with fitz.open('pdf', pdf_bytes) as document, fitz.open() as output_document:
-        last_page_id = len(document) - 1
-        if end_page_id is None or end_page_id < 0:
-            end_page_id = last_page_id
-        elif end_page_id > last_page_id:
-            logger.warning('end_page_id is out of range, use pdf_docs length')
-            end_page_id = last_page_id
-        output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
-        # Serialize before the context manager closes the output document.
-        return output_document.tobytes()
-
-
-def _do_parse(
-    output_dir,
-    pdf_file_name,
-    pdf_bytes_or_dataset,
-    model_list,
-    parse_method,
-    debug_able=False,
-    f_draw_span_bbox=True,
-    f_draw_layout_bbox=True,
-    f_dump_md=True,
-    f_dump_middle_json=True,
-    f_dump_model_json=True,
-    f_dump_orig_pdf=True,
-    f_dump_content_list=True,
-    f_make_md_mode=MakeMode.MM_MD,
-    f_draw_model_bbox=False,
-    f_draw_line_sort_bbox=False,
-    f_draw_char_bbox=False,
-    start_page_id=0,
-    end_page_id=None,
-    lang=None,
-    layout_model=None,
-    formula_enable=None,
-    table_enable=None,
-):
-    """Parse one document and write the requested artefacts to disk.
-
-    Args:
-        output_dir: Root output directory; files are written below
-            ``<output_dir>/<pdf_file_name>/<parse_method>``.
-        pdf_file_name: Base name used for the output folder and files.
-        pdf_bytes_or_dataset: Raw PDF bytes (trimmed to the requested page
-            range and wrapped in a ``PymuDocDataset``) or a ready dataset
-            (used as-is; the page range is not applied).
-        model_list: Precomputed model output. When empty, inference is run
-            with the built-in models.
-        parse_method: ``'ocr'``, ``'txt'``, or ``'auto'``. With a non-empty
-            ``model_list`` any other value is treated like ``'auto'``.
-        debug_able: Also draws the model and line-sort bbox PDFs.
-        f_draw_* / f_dump_*: Toggle the individual output files.
-        f_make_md_mode: Markdown flavour passed to ``dump_md``.
-        start_page_id, end_page_id: Page range applied to raw PDF bytes.
-        lang, layout_model, formula_enable, table_enable: Forwarded to the
-            dataset and to ``doc_analyze``.
-
-    Raises:
-        SystemExit: code 2 when ``model_list`` is empty and the built-in
-            models are disabled; code 1 when ``model_list`` is empty and
-            ``parse_method`` is not one of the three known methods.
-    """
-    from magic_pdf.operators.models import InferenceResult
-    if debug_able:
-        logger.warning('debug mode is on')
-        f_draw_model_bbox = True
-        f_draw_line_sort_bbox = True
-        # f_draw_char_bbox = True
-
-    if isinstance(pdf_bytes_or_dataset, bytes):
-        pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
-            pdf_bytes_or_dataset, start_page_id, end_page_id
-        )
-        ds = PymuDocDataset(pdf_bytes, lang=lang)
-    else:
-        ds = pdf_bytes_or_dataset
-    pdf_bytes = ds._raw_data
-    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
-    image_dir = str(os.path.basename(local_image_dir))
-
-    def needs_ocr():
-        # An explicit method wins; anything else defers to the dataset's own classification.
-        if parse_method == 'ocr':
-            return True
-        if parse_method == 'txt':
-            return False
-        return ds.classify() != SupportedPdfParseMethod.TXT
-
-    if len(model_list) == 0:
-        # No precomputed model output: run inference with the built-in models.
-        if not model_config.__use_inside_model__:
-            logger.error('need model list input')
-            # SystemExit instead of the site-provided exit() helper, which is
-            # not guaranteed to exist (e.g. under ``python -S``).
-            raise SystemExit(2)
-        if parse_method not in ('auto', 'txt', 'ocr'):
-            logger.error('unknown parse method')
-            raise SystemExit(1)
-        use_ocr = needs_ocr()
-        infer_result = ds.apply(
-            doc_analyze,
-            ocr=use_ocr,
-            lang=ds._lang,
-            layout_model=layout_model,
-            formula_enable=formula_enable,
-            table_enable=table_enable,
-        )
-    else:
-        infer_result = InferenceResult(model_list, ds)
-        use_ocr = needs_ocr()
-
-    run_pipeline = infer_result.pipe_ocr_mode if use_ocr else infer_result.pipe_txt_mode
-    pipe_result = run_pipeline(image_writer, debug_mode=True, lang=ds._lang)
-
-    if f_draw_model_bbox:
-        infer_result.draw_model(
-            os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
-        )
-
-    if f_draw_layout_bbox:
-        pipe_result.draw_layout(
-            os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
-        )
-    if f_draw_span_bbox:
-        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
-
-    if f_draw_line_sort_bbox:
-        pipe_result.draw_line_sort(
-            os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
-        )
-
-    if f_draw_char_bbox:
-        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
-
-    if f_dump_md:
-        pipe_result.dump_md(
-            md_writer,
-            f'{pdf_file_name}.md',
-            image_dir,
-            drop_mode=DropMode.NONE,
-            md_make_mode=f_make_md_mode,
-        )
-
-    if f_dump_middle_json:
-        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')
-
-    if f_dump_model_json:
-        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')
-
-    if f_dump_orig_pdf:
-        md_writer.write(
-            f'{pdf_file_name}_origin.pdf',
-            pdf_bytes,
-        )
-
-    if f_dump_content_list:
-        pipe_result.dump_content_list(
-            md_writer,
-            f'{pdf_file_name}_content_list.json',
-            image_dir
-        )
-
-    logger.info(f'local output dir is {local_md_dir}')
-
-
-def do_parse(
-    output_dir,
-    pdf_file_name,
-    pdf_bytes_or_dataset,
-    model_list,
-    parse_method,
-    debug_able=False,
-    f_draw_span_bbox=True,
-    f_draw_layout_bbox=True,
-    f_dump_md=True,
-    f_dump_middle_json=True,
-    f_dump_model_json=True,
-    f_dump_orig_pdf=True,
-    f_dump_content_list=True,
-    f_make_md_mode=MakeMode.MM_MD,
-    f_draw_model_bbox=False,
-    f_draw_line_sort_bbox=False,
-    f_draw_char_bbox=False,
-    start_page_id=0,
-    end_page_id=None,
-    lang=None,
-    layout_model=None,
-    formula_enable=None,
-    table_enable=None,
-):
-    """Parse one document, through the batch path when parallel inference is on.
-
-    When the ``MINERU_PARALLEL_INFERENCE_COUNT`` environment variable is set
-    to an integer greater than 1, the document is handed to
-    ``batch_do_parse`` as a one-element batch; in that mode ``model_list`` is
-    not used. Otherwise the call is forwarded unchanged to ``_do_parse``.
-    The parameters have the same meaning as in ``_do_parse``.
-
-    Raises:
-        ValueError: if the environment variable is set but is not an integer.
-    """
-    parallel_count = 1
-    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
-        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])
-
-    # Output switches shared by both code paths.
-    output_options = dict(
-        f_draw_span_bbox=f_draw_span_bbox,
-        f_draw_layout_bbox=f_draw_layout_bbox,
-        f_dump_md=f_dump_md,
-        f_dump_middle_json=f_dump_middle_json,
-        f_dump_model_json=f_dump_model_json,
-        f_dump_orig_pdf=f_dump_orig_pdf,
-        f_dump_content_list=f_dump_content_list,
-        f_make_md_mode=f_make_md_mode,
-        f_draw_model_bbox=f_draw_model_bbox,
-        f_draw_line_sort_bbox=f_draw_line_sort_bbox,
-        f_draw_char_bbox=f_draw_char_bbox,
-    )
-
-    if parallel_count > 1:
-        if isinstance(pdf_bytes_or_dataset, bytes):
-            # The page range has to be applied here: batch_do_parse takes no page arguments.
-            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
-                pdf_bytes_or_dataset, start_page_id, end_page_id
-            )
-            ds = PymuDocDataset(pdf_bytes, lang=lang)
-        else:
-            ds = pdf_bytes_or_dataset
-        batch_do_parse(
-            output_dir,
-            [pdf_file_name],
-            [ds],
-            parse_method,
-            debug_able,
-            lang=lang,
-            # Previously dropped on this path, so the caller's model options were ignored.
-            layout_model=layout_model,
-            formula_enable=formula_enable,
-            table_enable=table_enable,
-            **output_options,
-        )
-    else:
-        _do_parse(
-            output_dir,
-            pdf_file_name,
-            pdf_bytes_or_dataset,
-            model_list,
-            parse_method,
-            debug_able,
-            start_page_id=start_page_id,
-            end_page_id=end_page_id,
-            lang=lang,
-            layout_model=layout_model,
-            formula_enable=formula_enable,
-            table_enable=table_enable,
-            **output_options,
-        )
-
-
-
-def batch_do_parse(
-    output_dir,
-    pdf_file_names: list[str],
-    pdf_bytes_or_datasets: list[bytes | Dataset],
-    parse_method,
-    debug_able=False,
-    f_draw_span_bbox=True,
-    f_draw_layout_bbox=True,
-    f_dump_md=True,
-    f_dump_middle_json=True,
-    f_dump_model_json=True,
-    f_dump_orig_pdf=True,
-    f_dump_content_list=True,
-    f_make_md_mode=MakeMode.MM_MD,
-    f_draw_model_bbox=False,
-    f_draw_line_sort_bbox=False,
-    f_draw_char_bbox=False,
-    lang=None,
-    layout_model=None,
-    formula_enable=None,
-    table_enable=None,
-):
-    dss = []
-    for v in pdf_bytes_or_datasets:
-        if isinstance(v, bytes):
-            dss.append(PymuDocDataset(v, lang=lang))
-        else:
-            dss.append(v)
-
-    infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
-    for idx, infer_result in enumerate(infer_results):
-        _do_parse(
-            output_dir = output_dir,
-            pdf_file_name = pdf_file_names[idx],
-            pdf_bytes_or_dataset = dss[idx],
-            model_list = infer_result.get_infer_res(),
-            parse_method = parse_method,
-            debug_able = debug_able,
-            f_draw_span_bbox = f_draw_span_bbox,
-            f_draw_layout_bbox = f_draw_layout_bbox,
-            f_dump_md=f_dump_md,
-            f_dump_middle_json=f_dump_middle_json,
-            f_dump_model_json=f_dump_model_json,
-            f_dump_orig_pdf=f_dump_orig_pdf,
-            f_dump_content_list=f_dump_content_list,
-            f_make_md_mode=MakeMode.MM_MD,
-            f_draw_model_bbox=f_draw_model_bbox,
-            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
-            f_draw_char_bbox=f_draw_char_bbox,
-            lang=lang,
-        )
-
-
-parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])  # click parameter type accepting only 'ocr', 'txt' or 'auto'
--- a/magic_pdf/utils/__init__.py
+++ b/magic_pdf/utils/__init__.py
--- a/magic_pdf/utils/annotations.py
+++ b/magic_pdf/utils/annotations.py
-
-from loguru import logger
-
-
-def ImportPIL(f):
-    try:
-        import PIL  # noqa: F401
-    except ImportError:
-        logger.error('Pillow not installed, please install by pip.')
-        exit(1)
-    return f
--- a/magic_pdf/utils/office_to_pdf.py
+++ b/magic_pdf/utils/office_to_pdf.py
-import os
-import subprocess
-import platform
-from pathlib import Path
-import shutil
-
-from loguru import logger
-
-
-class ConvertToPdfError(Exception):
-    def __init__(self, msg):
-        self.msg = msg
-        super().__init__(self.msg)
-
-
-def check_fonts_installed():
-    """Check if required Chinese fonts are installed."""
-    system_type = platform.system()
-
-    if system_type in ['Windows', 'Darwin']:
-        pass
-    else:
-        # Linux: use fc-list
-        try:
-            output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
-            if output.strip():  # 只要有任何输出（非空）
-                return True
-            else:
-                logger.warning(
-                    f"No Chinese fonts were detected, the converted document may not display Chinese content properly."
-                )
-        except Exception:
-            pass
-
-
-def get_soffice_command():
-    """Return the path to LibreOffice's soffice executable depending on the platform."""
-    system_type = platform.system()
-
-    # First check if soffice is in PATH
-    soffice_path = shutil.which('soffice')
-    if soffice_path:
-        return soffice_path
-
-    if system_type == 'Windows':
-        # Check common installation paths
-        possible_paths = [
-            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
-            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
-            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
-            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe')
-        ]
-
-        # Check other drives for windows
-        for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']:
-            possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe"))
-
-        for path in possible_paths:
-            if path.exists():
-                return str(path)
-
-        raise ConvertToPdfError(
-            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
-            "or ensure soffice.exe is in your PATH environment variable."
-        )
-    else:
-        # For Linux/macOS, provide installation instructions if not found
-        try:
-            # Try to find soffice in standard locations
-            possible_paths = [
-                '/usr/bin/soffice',
-                '/usr/local/bin/soffice',
-                '/opt/libreoffice/program/soffice',
-                '/Applications/LibreOffice.app/Contents/MacOS/soffice'
-            ]
-            for path in possible_paths:
-                if os.path.exists(path):
-                    return path
-
-            raise ConvertToPdfError(
-                "LibreOffice not found. Please install it:\n"
-                "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
-                "  - CentOS/RHEL: sudo yum install libreoffice\n"
-                "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
-                "  - Or ensure soffice is in your PATH environment variable."
-            )
-        except Exception as e:
-            raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}")
-
-
-def convert_file_to_pdf(input_path, output_dir):
-    """Convert a single document (ppt, doc, etc.) to PDF."""
-    if not os.path.isfile(input_path):
-        raise FileNotFoundError(f"The input file {input_path} does not exist.")
-
-    os.makedirs(output_dir, exist_ok=True)
-
-    check_fonts_installed()
-
-    soffice_cmd = get_soffice_command()
-
-    cmd = [
-        soffice_cmd,
-        '--headless',
-        '--norestore',
-        '--invisible',
-        '--convert-to', 'pdf',
-        '--outdir', str(output_dir),
-        str(input_path)
-    ]
-
-    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-    if process.returncode != 0:
-        raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}")
--- a/magic-pdf.template.json
+++ b/magic-pdf.template.json
@@ -3,23 +3,6 @@
        "bucket-name-1":["ak", "sk", "endpoint"],
        "bucket-name-2":["ak", "sk", "endpoint"]
    },
-    "models-dir":"/tmp/models",
-    "layoutreader-model-dir":"/tmp/layoutreader",
-    "device-mode":"cpu",
-    "layout-config": {
-        "model": "doclayout_yolo"
-    },
-    "formula-config": {
-        "mfd_model": "yolo_v8_mfd",
-        "mfr_model": "unimernet_small",
-        "enable": true
-    },
-    "table-config": {
-        "model": "rapid_table",
-        "sub_model": "slanet_plus",
-        "enable": true,
-        "max_time": 400
-    },
    "latex-delimiter-config": {
        "display": {
            "left": "$$",
@@ -31,18 +14,6 @@
        }
    },
    "llm-aided-config": {
-        "formula_aided": {
-            "api_key": "your_api_key",
-            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-7b-instruct",
-            "enable": false
-        },
-        "text_aided": {
-            "api_key": "your_api_key",
-            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-7b-instruct",
-            "enable": false
-        },
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
@@ -50,5 +21,9 @@
            "enable": false
        }
    },
-    "config_version": "1.2.1"
+    "models-dir": {
+        "pipeline": "",
+        "vlm": ""
+    },
+    "config_version": "1.3.0"
 }
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/language_detection/__init__.py
+++ b/magic_pdf/model/sub_modules/language_detection/__init__.py
--- a/magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py
+++ b/magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py