refactor: rename init file and update app.py to enable parsing method

bd927919 · myhloli · f5016508 · f5016508 · f5016508 · f5016508
Commit bd927919 authored May 27, 2025 by myhloli
20 changed files
--- a/magic_pdf/libs/convert_utils.py
+++ b/magic_pdf/libs/convert_utils.py
def dict_to_list(input_dict):
    """Return the values of ``input_dict`` as a new list, in iteration order."""
    return list(input_dict.values())
--- a/magic_pdf/libs/coordinate_transform.py
+++ b/magic_pdf/libs/coordinate_transform.py
def get_scale_ratio(model_page_info, page):
    """Return ``(horizontal, vertical)`` scale ratios for one page.

    Each ratio is the page size recorded under ``model_page_info['page_info']``
    divided by the integer pixel size of ``page`` rendered at 72 dpi.
    """
    rendered = page.get_pixmap(dpi=72)
    rendered_width, rendered_height = int(rendered.w), int(rendered.h)
    json_size = model_page_info['page_info']
    return (
        json_size['width'] / rendered_width,
        json_size['height'] / rendered_height,
    )
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
-import fitz
-from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
-                                               ContentType)
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.model.magic_model import MagicModel
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every box of entry ``i`` of ``bbox_list`` onto ``page``, unlabelled.

    Args:
        i: index into ``bbox_list`` selecting the boxes to draw.
        bbox_list: sequence whose ``i``-th item is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: object exposing ``draw_rect`` (callers in this file pass fitz pages).
        rgb_config: colour components on a 0-255 scale.
        fill_config: truthy -> filled boxes at 30% opacity without border
            colour; falsy -> outlined boxes without fill.
    """
    colour = [float(channel) / 255 for channel in rgb_config]
    if fill_config:
        style = {'color': None, 'fill': colour, 'fill_opacity': 0.3}
    else:
        style = {'color': colour, 'fill': None, 'fill_opacity': 1}
    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(fitz.Rect(x0, y0, x1, y1), width=0.5, overlay=True, **style)
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
    """Label (and optionally draw) every box of entry ``i`` of ``bbox_list``.

    Args:
        i: index into ``bbox_list`` selecting the boxes to process.
        bbox_list: sequence whose ``i``-th item is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: object exposing ``draw_rect`` and ``insert_text``.
        rgb_config: colour components on a 0-255 scale, used for box and label.
        fill_config: truthy -> filled boxes at 30% opacity; falsy -> outlines.
        draw_bbox: when falsy only the 1-based numbers are written.
    """
    colour = [float(channel) / 255 for channel in rgb_config]
    if fill_config:
        style = {'color': None, 'fill': colour, 'fill_opacity': 0.3}
    else:
        style = {'color': colour, 'fill': None, 'fill_opacity': 1}
    for number, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        rect_coords = fitz.Rect(x0, y0, x1, y1)
        if draw_bbox:
            page.draw_rect(rect_coords, width=0.5, overlay=True, **style)
        # The label is anchored just right of the box's top-right corner.
        page.insert_text((x1 + 2, y0 + 10), str(number), fontsize=10, color=colour)
def _bucket_first_match(bbox, block_type, styles, buckets, offset=0):
    """Append ``bbox`` to the bucket of the first style equal to ``block_type``.

    ``styles`` is a list of ``(block_type, colour)`` pairs and ``buckets`` the
    list of bbox lists that runs parallel to it, shifted by ``offset``.
    Returns True when a style matched, False otherwise.
    """
    for position, (style_type, _colour) in enumerate(styles):
        if block_type == style_type:
            buckets[offset + position].append(bbox)
            return True
    return False


def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF with colour-coded layout blocks and order numbers.

    For every page, discarded blocks and the typed ``para_blocks`` are painted
    as translucent rectangles (one colour per block type); afterwards each
    block is numbered in list order, with table parts ordered caption, body,
    footnote.

    Args:
        pdf_info: per-page dicts providing ``'discarded_blocks'`` and
            ``'para_blocks'``; assumed to line up with the pages of
            ``pdf_bytes`` (a shorter list raises IndexError while drawing).
        pdf_bytes: raw bytes of the PDF to annotate.
        out_path: output directory.
        filename: name of the annotated PDF inside ``out_path``.
    """
    # (block type, RGB colour) pairs, listed in painting order.
    table_styles = [
        (BlockType.TableBody, [204, 204, 0]),
        (BlockType.TableCaption, [255, 255, 102]),
        (BlockType.TableFootnote, [229, 255, 204]),
    ]
    image_styles = [
        (BlockType.ImageBody, [153, 255, 51]),
        (BlockType.ImageCaption, [102, 178, 255]),
        (BlockType.ImageFootnote, [255, 178, 102]),
    ]
    flat_styles = [
        (BlockType.Title, [102, 102, 255]),
        (BlockType.Text, [153, 0, 76]),
        (BlockType.InterlineEquation, [0, 255, 0]),
        (BlockType.List, [40, 169, 92]),
        (BlockType.Index, [40, 169, 92]),
    ]
    all_styles = table_styles + image_styles + flat_styles
    image_offset = len(table_styles)
    flat_offset = image_offset + len(image_styles)

    # Numbering order of the parts of one table.
    table_type_order = {
        'table_caption': 1,
        'table_body': 2,
        'table_footnote': 3,
    }
    # Unknown sub-block types are numbered last instead of raising KeyError.
    unknown_rank = len(table_type_order) + 1

    dropped_per_page = []
    styled_per_page = []   # per page: one bbox list per entry of all_styles
    ordered_per_page = []  # per page: bboxes in numbering order
    for page in pdf_info:
        dropped_per_page.append(
            [dropped['bbox'] for dropped in page['discarded_blocks']]
        )
        buckets = [[] for _ in all_styles]
        ordered = []
        for block in page['para_blocks']:
            block_type = block['type']
            if block_type == BlockType.Table:
                for nested in block['blocks']:
                    _bucket_first_match(
                        nested['bbox'], nested['type'], table_styles, buckets
                    )
                by_role = sorted(
                    block['blocks'],
                    key=lambda sub: table_type_order.get(sub['type'], unknown_rank),
                )
                ordered.extend(sub['bbox'] for sub in by_role)
            elif block_type == BlockType.Image:
                for nested in block['blocks']:
                    _bucket_first_match(
                        nested['bbox'], nested['type'], image_styles,
                        buckets, image_offset,
                    )
                    ordered.append(nested['bbox'])
            elif _bucket_first_match(
                block['bbox'], block_type, flat_styles, buckets, flat_offset
            ):
                ordered.append(block['bbox'])
        styled_per_page.append(buckets)
        ordered_per_page.append(ordered)

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            draw_bbox_without_number(i, dropped_per_page, page, [158, 158, 158], True)
            page_buckets = styled_per_page[i]
            for position, (_block_type, colour) in enumerate(all_styles):
                # page_buckets is indexed by style, so the style position is
                # what selects the bbox list inside the drawing helper.
                draw_bbox_without_number(position, page_buckets, page, colour, True)
            draw_bbox_with_number(
                i, ordered_per_page, page, [255, 0, 0], False, draw_bbox=False
            )
        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}')
    finally:
        # Release the document handle even when drawing or saving fails.
        pdf_docs.close()
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF outlining every span, colour-coded by span type.

    Spans are taken from ``preproc_blocks`` (the state before paragraph
    merging); spans of discarded blocks are outlined in grey. Text and
    inline-equation spans flagged with ``CROSS_PAGE`` are drawn on the page
    that follows the one they were found on.

    Args:
        pdf_info: per-page dicts providing ``'discarded_blocks'`` and
            ``'preproc_blocks'``; assumed to line up with the pages of
            ``pdf_bytes``.
        pdf_bytes: raw bytes of the PDF to annotate.
        out_path: output directory.
        filename: name of the annotated PDF inside ``out_path``.
    """
    # (span kind, RGB colour) pairs, listed in painting order.
    palette = [
        ('text', [255, 0, 0]),
        ('inline_equation', [0, 255, 0]),
        ('interline_equation', [0, 0, 255]),
        ('image', [255, 204, 0]),
        ('table', [204, 0, 255]),
        ('dropped', [158, 158, 158]),
    ]
    pages_by_kind = {kind: [] for kind, _ in palette}
    # Cross-page spans waiting to be attached to the next page.
    carried = {'text': [], 'inline_equation': []}
    line_block_types = [
        BlockType.Text,
        BlockType.Title,
        BlockType.InterlineEquation,
        BlockType.List,
        BlockType.Index,
    ]

    def file_span(span, current):
        """Put the span's bbox into ``current`` or, if cross-page, ``carried``."""
        span_type = span['type']
        if span_type == ContentType.Text:
            kind = 'text'
        elif span_type == ContentType.InlineEquation:
            kind = 'inline_equation'
        elif span_type == ContentType.InterlineEquation:
            kind = 'interline_equation'
        elif span_type == ContentType.Image:
            kind = 'image'
        elif span_type == ContentType.Table:
            kind = 'table'
        else:
            return
        crosses_page = kind in carried and span.get(CROSS_PAGE, False)
        target = carried if crosses_page else current
        target[kind].append(span['bbox'])

    for page in pdf_info:
        current = {kind: [] for kind, _ in palette}
        # Spans carried over from the previous page belong to this one.
        for kind, waiting in carried.items():
            current[kind].extend(waiting)
            waiting.clear()

        for block in page['discarded_blocks']:
            if block['type'] == BlockType.Discarded:
                for line in block['lines']:
                    current['dropped'].extend(span['bbox'] for span in line['spans'])

        for block in page['preproc_blocks']:
            if block['type'] in line_block_types:
                line_groups = [block['lines']]
            elif block['type'] in [BlockType.Image, BlockType.Table]:
                line_groups = [sub_block['lines'] for sub_block in block['blocks']]
            else:
                line_groups = []
            for lines in line_groups:
                for line in lines:
                    for span in line['spans']:
                        file_span(span, current)

        for kind, _ in palette:
            pages_by_kind[kind].append(current[kind])

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for kind, colour in palette:
                draw_bbox_without_number(i, pages_by_kind[kind], page, colour, False)
        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}')
    finally:
        # Release the document handle even when drawing or saving fails.
        pdf_docs.close()
def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
    """Draw the layout detections of ``model_list`` onto ``dataset`` and dump it.

    Detections are grouped per page by ``category_id``; every group is drawn
    as filled, numbered boxes in its own colour, then the dataset is written
    to ``{out_path}/{filename}``.
    """
    # (category id, bucket name), checked in this order for every detection.
    category_to_bucket = [
        (CategoryId.Text, 'text'),
        (CategoryId.Title, 'title'),
        (CategoryId.TableBody, 'table_body'),
        (CategoryId.TableCaption, 'table_caption'),
        (CategoryId.TableFootnote, 'table_footnote'),
        (CategoryId.ImageBody, 'image_body'),
        (CategoryId.ImageCaption, 'image_caption'),
        (CategoryId.InterlineEquation_YOLO, 'interline_equation'),
        (CategoryId.Abandon, 'dropped'),
        (CategoryId.ImageFootnote, 'image_footnote'),
    ]
    # (bucket name, RGB colour), listed in painting order.
    draw_plan = [
        ('dropped', [158, 158, 158]),
        ('table_body', [204, 204, 0]),
        ('table_caption', [255, 255, 102]),
        ('table_footnote', [229, 255, 204]),
        ('image_body', [153, 255, 51]),
        ('image_caption', [102, 178, 255]),
        ('image_footnote', [255, 178, 102]),
        ('title', [102, 102, 255]),
        ('text', [153, 0, 76]),
        ('interline_equation', [0, 255, 0]),
    ]
    pages_by_bucket = {name: [] for name, _ in draw_plan}

    magic_model = MagicModel(model_list, dataset)
    for page_idx in range(len(model_list)):
        page_buckets = {name: [] for name in pages_by_bucket}
        page_info = magic_model.get_model_list(page_idx)
        for layout_det in page_info['layout_dets']:
            bbox = layout_det['bbox']
            for category, name in category_to_bucket:
                if layout_det['category_id'] == category:
                    page_buckets[name].append(bbox)
                    break
        for name, pages in pages_by_bucket.items():
            pages.append(page_buckets[name])

    for page_idx in range(len(dataset)):
        page = dataset.get_page(page_idx)
        for name, colour in draw_plan:
            draw_bbox_with_number(page_idx, pages_by_bucket[name], page, colour, True)

    # Save the PDF
    dataset.dump_to_file(f'{out_path}/{filename}')
def _line_entries(lines):
    """Return one ``{'index', 'bbox'}`` record per entry of ``lines``."""
    return [{'index': line['index'], 'bbox': line['bbox']} for line in lines]


def _virtual_lines_are_indexed(virtual_lines):
    """True when ``virtual_lines`` is non-empty and its first entry has an index."""
    return len(virtual_lines) > 0 and virtual_lines[0].get('index', None) is not None


def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF whose lines are outlined and numbered by 'index'.

    Lines are collected from ``preproc_blocks``; for titles, interline
    equations and image/table bodies the ``virtual_lines`` are used when they
    carry an index.

    Args:
        pdf_info: per-page dicts providing ``'preproc_blocks'``; assumed to
            line up with the pages of ``pdf_bytes``.
        pdf_bytes: raw bytes of the PDF to annotate.
        out_path: output directory.
        filename: name of the annotated PDF inside ``out_path``.
    """
    body_types = [BlockType.ImageBody, BlockType.TableBody]
    caption_like_types = [
        BlockType.ImageCaption,
        BlockType.TableCaption,
        BlockType.ImageFootnote,
        BlockType.TableFootnote,
    ]

    layout_bbox_list = []
    for page in pdf_info:
        page_line_list = []
        for block in page['preproc_blocks']:
            block_type = block['type']
            if block_type in [BlockType.Text]:
                page_line_list.extend(_line_entries(block['lines']))
            elif block_type in [BlockType.Title, BlockType.InterlineEquation]:
                if 'virtual_lines' not in block:
                    page_line_list.extend(_line_entries(block['lines']))
                elif _virtual_lines_are_indexed(block['virtual_lines']):
                    page_line_list.extend(_line_entries(block['virtual_lines']))
                # virtual_lines present but empty/unindexed: nothing is
                # collected for this block (unchanged behaviour).
            elif block_type in [BlockType.Image, BlockType.Table]:
                for sub_block in block['blocks']:
                    sub_type = sub_block['type']
                    if sub_type in body_types:
                        if _virtual_lines_are_indexed(sub_block['virtual_lines']):
                            source_lines = sub_block['virtual_lines']
                        else:
                            source_lines = sub_block['lines']
                        page_line_list.extend(_line_entries(source_lines))
                    elif sub_type in caption_like_types:
                        page_line_list.extend(_line_entries(sub_block['lines']))
        page_line_list.sort(key=lambda entry: entry['index'])
        # A real list (not a one-shot generator) so the page's boxes can be
        # indexed or iterated more than once.
        layout_bbox_list.append([entry['bbox'] for entry in page_line_list])

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
        pdf_docs.save(f'{out_path}/{filename}')
    finally:
        # Release the document handle even when drawing or saving fails.
        pdf_docs.close()
def draw_char_bbox(pdf_bytes, out_path, filename):
    """Write a debug PDF with a thin red outline around every extracted char.

    Args:
        pdf_bytes: raw bytes of the PDF to annotate.
        out_path: output directory.
        filename: name of the annotated PDF inside ``out_path``.
    """
    text_flags = (
        fitz.TEXT_PRESERVE_LIGATURES
        | fitz.TEXT_PRESERVE_WHITESPACE
        | fitz.TEXT_MEDIABOX_CLIP
    )
    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for page in pdf_docs:
            for block in page.get_text('rawdict', flags=text_flags)['blocks']:
                for line in block['lines']:
                    for span in line['spans']:
                        for char in span['chars']:
                            page.draw_rect(
                                char['bbox'],
                                color=[1, 0, 0],
                                fill=None,
                                fill_opacity=1,
                                width=0.3,
                                overlay=True,
                            )
        pdf_docs.save(f'{out_path}/{filename}')
    finally:
        # Release the document handle even when drawing or saving fails.
        pdf_docs.close()
--- a/magic_pdf/libs/hash_utils.py
+++ b/magic_pdf/libs/hash_utils.py
-import hashlib
def compute_md5(file_bytes):
    """Return the MD5 digest of ``file_bytes`` as an upper-case hex string."""
    return hashlib.md5(file_bytes).hexdigest().upper()
def compute_sha256(input_string):
    """Return the SHA-256 hex digest of ``input_string`` encoded as UTF-8."""
    # Hash functions operate on bytes, so the text is encoded first.
    return hashlib.sha256(input_string.encode('utf-8')).hexdigest()
--- a/magic_pdf/libs/json_compressor.py
+++ b/magic_pdf/libs/json_compressor.py
-import json
-import brotli
-import base64
class JsonCompressor:
    """Helpers that pack JSON-serialisable data into a base64 text blob."""

    @staticmethod
    def compress_json(data):
        """Serialise ``data`` to JSON, brotli-compress it (quality 6) and
        return the result as a base64 string.
        """
        raw = json.dumps(data).encode('utf-8')
        packed = brotli.compress(raw, quality=6)
        return base64.b64encode(packed).decode('utf-8')

    @staticmethod
    def decompress_json(compressed_str):
        """Reverse ``compress_json``: base64-decode, decompress, parse JSON."""
        packed = base64.b64decode(compressed_str.encode('utf-8'))
        return json.loads(brotli.decompress(packed).decode('utf-8'))
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
-import os
-import unicodedata
if not os.getenv("FTLANG_CACHE"):
    # No cache location configured: default to the
    # resources/fasttext-langdetect directory one level above this file's
    # directory. This runs before fast_langdetect is imported below.
    current_file_path = os.path.abspath(__file__)
    current_dir = os.path.dirname(current_file_path)
    root_dir = os.path.dirname(current_dir)
    ftlang_cache_dir = os.path.join(
        root_dir,
        'resources',
        'fasttext-langdetect',
    )
    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
-from fast_langdetect import detect_language
def remove_invalid_surrogates(text):
    """Return ``text`` without code points in the surrogate range U+D800-U+DFFF."""
    kept = [ch for ch in text if ord(ch) < 0xD800 or ord(ch) > 0xDFFF]
    return ''.join(kept)
def detect_lang(text: str) -> str:
    """Return the lower-cased language code detected for ``text``.

    Newlines and surrogate code points are stripped before detection. If
    detection fails, it is retried once on the text without Unicode control
    characters (category ``C*``); an error in the retry propagates.

    Returns:
        The detector's result lower-cased, or ``""`` when ``text`` is empty or
        the detector returned something without ``lower()`` (e.g. ``None``).
    """
    if len(text) == 0:
        return ""

    text = text.replace("\n", "")
    text = remove_invalid_surrogates(text)

    try:
        lang_upper = detect_language(text)
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must not be
        # swallowed by the retry.
        html_no_ctrl_chars = ''.join(
            ch for ch in text if unicodedata.category(ch)[0] != 'C'
        )
        lang_upper = detect_language(html_no_ctrl_chars)

    try:
        return lang_upper.lower()
    except AttributeError:
        # The detector returned a non-string value.
        return ""
if __name__ == '__main__':
    # Manual smoke test: show the cache location, then detect a few samples.
    print(os.getenv("FTLANG_CACHE"))
    for sample in (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
        "〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试",
    ):
        print(detect_lang(sample))
\ No newline at end of file
--- a/magic_pdf/libs/local_math.py
+++ b/magic_pdf/libs/local_math.py
def float_gt(a, b):
    """Return ``a > b``, treating values within 0.0001 of each other as equal."""
    gap = abs(a - b)
    return a > b if not 0.0001 >= gap else False
def float_equal(a, b):
    """Return True when ``a`` and ``b`` differ by at most 0.0001."""
    return True if 0.0001 >= abs(a - b) else False
\ No newline at end of file
--- a/magic_pdf/libs/markdown_utils.py
+++ b/magic_pdf/libs/markdown_utils.py
def ocr_escape_special_markdown_char(content):
    """Backslash-escape the characters ``*``, `````, ``~`` and ``$`` in ``content``.

    These characters carry special meaning in Markdown syntax.
    """
    escapes = str.maketrans({char: "\\" + char for char in "*`~$"})
    return content.translate(escapes)
--- a/magic_pdf/libs/path_utils.py
+++ b/magic_pdf/libs/path_utils.py
def remove_non_official_s3_args(s3path):
    """Return ``s3path`` without anything from the first ``?`` onwards.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    return s3path.split("?", 1)[0]


def parse_s3path(s3path: str):
    """Split an ``s3://`` or ``s3a://`` path into ``(bucket_name, key)``.

    Query-style arguments (``?...``) and surrounding whitespace are removed
    before parsing.

    Raises:
        ValueError: the path starts with ``/``, uses another scheme, or has
            no ``/`` separating the bucket from the key.
    """
    s3path = remove_non_official_s3_args(s3path).strip()
    if s3path.startswith(('s3://', 's3a://')):
        _scheme, path = s3path.split('://', 1)
        bucket_name, separator, key = path.partition('/')
        if not separator:
            # Previously this surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
        return bucket_name, key
    elif s3path.startswith('/'):
        raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
    else:
        raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
def parse_s3_range_params(s3path: str):
    """Return the comma-separated ``?bytes=`` values of ``s3path`` as strings.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ['0', '81350']

    Returns None when the path carries no ``?bytes=`` argument.
    """
    if "?bytes=" not in s3path:
        return None
    range_part = s3path.split("?bytes=")[1]
    return range_part.split(",")
--- a/magic_pdf/libs/pdf_check.py
+++ b/magic_pdf/libs/pdf_check.py
-import fitz
-import numpy as np
-from loguru import logger
-import re
-from io import BytesIO
-from pdfminer.high_level import extract_text
-from pdfminer.layout import LAParams
def calculate_sample_count(total_page: int):
    """Return how many pages to sample: ``total_page``, capped at 10."""
    return min(10, total_page)
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
    """Return a new document holding a random sample of the source pages.

    The sample size comes from ``calculate_sample_count`` and pages are drawn
    without replacement. An empty source yields an empty document. Errors
    while copying pages are logged and the pages copied so far are returned.
    """
    source = fitz.open("pdf", src_pdf_bytes)
    page_total = len(source)
    if page_total == 0:
        # Nothing to sample from.
        logger.warning("PDF is empty, return empty document")
        return fitz.Document()

    sample_size = calculate_sample_count(page_total)
    chosen = np.random.choice(page_total, sample_size, replace=False)

    sampled = fitz.Document()
    try:
        for page_no in chosen:
            page_no = int(page_no)
            sampled.insert_pdf(source, from_page=page_no, to_page=page_no)
    except Exception as e:
        logger.exception(e)
    return sampled
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Check whether the PDF's extracted text is free of ``(cid:N)`` garbage.

    Returns:
        True when at most 5% of the extracted characters are ``(cid:N)``
        placeholders (treated as a normal document), False otherwise
        (treated as a garbled document).
    """
    # pdfminer is slow, so only the page sample built by extract_pages is read.
    sample_stream = BytesIO(extract_pages(src_pdf_bytes).tobytes())
    layout_params = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=None,
        detect_vertical=False,
        all_texts=False,
    )
    text = extract_text(pdf_file=sample_stream, laparams=layout_params).replace("\n", "")

    # Garbled text comes out of pdfminer as "(cid:xxx)" tokens.
    matches = re.findall(r'\(cid:\d+\)', text)
    cid_count = len(matches)
    cid_len = sum(map(len, matches))
    text_len = len(text)
    if text_len == 0:
        cid_chars_radio = 0
    else:
        # Each "(cid:xxx)" token counts as a single character in the total.
        cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")

    # More than 5% garbled characters marks the whole document as garbled.
    return not cid_chars_radio > 0.05
def count_replacement_characters(text: str) -> int:
    """Return how many U+FFFD replacement characters ``text`` contains."""
    return sum(1 for ch in text if ch == '\ufffd')
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
    """Check whether the PDF's extracted text is free of U+FFFD characters.

    The text of the page sample built by ``extract_pages`` is inspected.

    Returns:
        True when at most 1% of the extracted characters are U+FFFD (treated
        as a normal document), False otherwise (treated as garbled).
    """
    extract_flags = fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP
    doc_text = "".join(
        page.get_text('text', flags=extract_flags)
        for page in extract_pages(src_pdf_bytes)
    )
    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    uffd_chars_radio = 0 if text_len == 0 else uffd_count / text_len
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")

    # More than 1% replacement characters marks the document as garbled.
    return not uffd_chars_radio > 0.01
\ No newline at end of file
--- a/magic_pdf/libs/pdf_image_tools.py
+++ b/magic_pdf/libs/pdf_image_tools.py
-from io import BytesIO
-import cv2
-import fitz
-import numpy as np
-from PIL import Image
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: DataWriter):
    """Crop ``bbox`` from ``page`` as a JPEG, store it and return its name.

    The crop is rendered at 3x zoom with JPEG quality 95 and written through
    ``imageWriter`` under a flat name of the form ``<sha256>.jpg``. The hash
    is computed from ``return_path`` joined with
    ``{page_num}_{x0}_{y0}_{x1}_{y1}`` (bbox values truncated to int).

    Args:
        bbox: ``(x0, y0, x1, y1)`` region to crop.
        page_num: page number used in the hashed name.
        page: page to render from.
        return_path: prefix joined with the base name before hashing; may be
            None, in which case only the base name is hashed.
        imageWriter: writer whose ``write(path, data)`` receives the JPEG bytes.

    Returns:
        The ``<sha256>.jpg`` name the image was written under.
    """
    # Base name built from the page number and the truncated bbox.
    filename = f'{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}'

    # Without a return_path there is nothing to join; hashing None used to
    # raise AttributeError, so fall back to the bare base name.
    img_path = join_path(return_path, filename) if return_path is not None else filename

    # Flat output name derived from the hash.
    img_hash256_path = f'{compute_sha256(img_path)}.jpg'

    rect = fitz.Rect(*bbox)
    zoom = fitz.Matrix(3, 3)  # render at 3x
    pix = page.get_pixmap(clip=rect, matrix=zoom)

    byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
    imageWriter.write(img_hash256_path, byte_data)

    return img_hash256_path
def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
    """Render ``bbox`` of ``page`` at 3x zoom and return it as an image.

    Args:
        bbox: ``(x0, y0, x1, y1)`` region to crop.
        page: page to render from.
        mode: ``"pillow"`` returns a Pillow image decoded from PNG bytes;
            ``"cv2"`` returns a numpy array, passed through an RGB->BGR
            conversion when the pixmap has 3 or 4 components per pixel.

    Raises:
        ValueError: ``mode`` is neither ``"pillow"`` nor ``"cv2"``.
    """
    clip_rect = fitz.Rect(*bbox)
    pix = page.get_pixmap(clip=clip_rect, matrix=fitz.Matrix(3, 3))

    if mode == "cv2":
        # Wrap the raw samples as a (height, width, components) array.
        pixels = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        if pix.n in (3, 4):
            # The original notes PyMuPDF delivers RGB while cv2 expects BGR.
            return cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
        return pixels

    if mode == "pillow":
        # Go through an in-memory PNG so Pillow can open it like a file.
        return Image.open(BytesIO(pix.tobytes(output='png')))

    raise ValueError(f"mode: {mode} is not supported.")
\ No newline at end of file
--- a/magic_pdf/libs/performance_stats.py
+++ b/magic_pdf/libs/performance_stats.py
-import time
-import functools
-from collections import defaultdict
-from typing import Dict, List
class PerformanceStats:
    """Class-level registry that collects and reports execution times."""

    # Shared by all callers: function name -> list of recorded durations.
    _stats: Dict[str, List[float]] = defaultdict(list)

    @classmethod
    def add_execution_time(cls, func_name: str, execution_time: float):
        """Record one execution time for ``func_name``."""
        cls._stats[func_name].append(execution_time)

    @classmethod
    def get_stats(cls) -> Dict[str, dict]:
        """Return count, total, average, min and max time per function name."""
        summary = {}
        for name, samples in cls._stats.items():
            total = sum(samples)
            summary[name] = {
                'count': len(samples),
                'total_time': total,
                'avg_time': total / len(samples),
                'min_time': min(samples),
                'max_time': max(samples),
            }
        return summary

    @classmethod
    def print_stats(cls):
        """Print the collected statistics as a fixed-width table."""
        separator = "-" * 80
        print("\n性能统计结果:")
        print(separator)
        print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
        print(separator)
        for func_name, data in cls.get_stats().items():
            print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
def measure_time(func):
    """Decorator that records each call's duration in ``PerformanceStats``.

    The duration is stored under ``ClassName.name`` for callables that have
    ``__self__``, under ``__qualname__`` for other callables that have one,
    and under ``module.name`` otherwise. Nothing is recorded when ``func``
    raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, unlike time.time(), so system clock
        # adjustments cannot distort or negate the measured duration.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time

        if hasattr(func, "__self__"):
            class_name = func.__self__.__class__.__name__
            full_name = f"{class_name}.{func.__name__}"
        elif hasattr(func, "__qualname__"):
            full_name = func.__qualname__
        else:
            # Callables without __qualname__ (e.g. callable instances)
            # usually lack __name__ too; fall back to the type name rather
            # than raising after the call has already completed.
            module_name = getattr(func, "__module__", "<unknown>")
            short_name = getattr(func, "__name__", type(func).__name__)
            full_name = f"{module_name}.{short_name}"

        PerformanceStats.add_execution_time(full_name, execution_time)
        return result

    return wrapper
\ No newline at end of file
--- a/magic_pdf/libs/safe_filename.py
+++ b/magic_pdf/libs/safe_filename.py
-import os
def sanitize_filename(filename, replacement="_"):
    """On Windows (``os.name == 'nt'``), replace each of ``<>:"|?*`` in
    ``filename`` with ``replacement``; elsewhere return ``filename`` unchanged.
    """
    if os.name != 'nt':
        return filename
    for forbidden in '<>:"|?*':
        filename = filename.replace(forbidden, replacement)
    return filename
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
-__version__ = "1.3.12"
--- a/magic_pdf/model/__init__.py
+++ b/magic_pdf/model/__init__.py
-__use_inside_model__ = True
-__model_mode__ = 'full'
\ No newline at end of file
--- a/magic_pdf/model/batch_analyze.py
+++ b/magic_pdf/model/batch_analyze.py
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
--- a/magic_pdf/model/model_list.py
+++ b/magic_pdf/model/model_list.py
class MODEL:
    """String identifiers for the selectable model back-ends."""

    Paddle = "pp_structure_v2"
    PEK = "pdf_extract_kit"
class AtomicModel:
    """String identifiers for the individual atomic models."""

    Layout = "layout"
    MFD = "mfd"
    MFR = "mfr"
    OCR = "ocr"
    Table = "table"
    LangDetect = "langdetect"
--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py