Merge pull request #13 from myhloli/refactor-mineru2

refactor: refactor-mineru2

Merge pull request #13 from myhloli/refactor-mineru2
refactor: refactor-mineru2
88495c32 · Xiaomeng Zhao · GitHub · ddf5a878 · d96d9161 · ddf5a878
Unverified Commit 88495c32 authored Jun 11, 2025 by Xiaomeng Zhao Committed by GitHub Jun 11, 2025
20 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
-repos:
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.11.5
-    hooks:
-      - id: isort
-  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.32.0
-    hooks:
-      - id: yapf
-        args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
-  - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.1
-    hooks:
-      - id: codespell
-        args: ['--skip', '*.json']
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
-    hooks:
-      - id: trailing-whitespace
-      - id: check-yaml
-      - id: end-of-file-fixer
-      - id: requirements-txt-fixer
-      - id: double-quote-string-fixer
-      - id: check-merge-conflict
-      - id: fix-encoding-pragma
-        args: ["--remove"]
-      - id: mixed-line-ending
-        args: ["--fix=lf"]
-  - repo: https://github.com/executablebooks/mdformat
-    rev: 0.7.9
-    hooks:
-      - id: mdformat
-        args: ["--number", "--table-width", "200"]
-        additional_dependencies:
-          - mdformat-openmmlab
-          - mdformat_frontmatter
-          - linkify-it-py
-  - repo: https://github.com/myint/docformatter
-    rev: v1.3.1
-    hooks:
-      - id: docformatter
-        args: ["--in-place", "--wrap-descriptions", "119"]
--- a/magic_pdf/config/constants.py
+++ b/magic_pdf/config/constants.py
-"""span维度自定义字段."""
-# span是否是跨页合并的
-CROSS_PAGE = 'cross_page'
-
-"""
-block维度自定义字段
-"""
-# block中lines是否被删除
-LINES_DELETED = 'lines_deleted'
-
-# table recognition max time default value
-TABLE_MAX_TIME_VALUE = 400
-
-# pp_table_result_max_length
-TABLE_MAX_LEN = 480
-
-# table master structure dict
-TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
-
-# table master dir
-TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
-
-# pp detect model dir
-DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
-
-# pp rec model dir
-REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
-
-# pp rec char dict path
-REC_CHAR_DICT = 'ppocr_keys_v1.txt'
-
-# pp rec copy rec directory
-PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
-
-# pp rec copy det directory
-PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
-
-
-class MODEL_NAME:
-    # pp table structure algorithm
-    TABLE_MASTER = 'tablemaster'
-    # struct eqtable
-    STRUCT_EQTABLE = 'struct_eqtable'
-
-    DocLayout_YOLO = 'doclayout_yolo'
-
-    LAYOUTLMv3 = 'layoutlmv3'
-
-    YOLO_V8_MFD = 'yolo_v8_mfd'
-
-    UniMerNet_v2_Small = 'unimernet_small'
-
-    RAPID_TABLE = 'rapid_table'
-
-    YOLO_V11_LangDetect = 'yolo_v11n_langdetect'
-
-
-PARSE_TYPE_TXT = 'txt'
-PARSE_TYPE_OCR = 'ocr'
-
--- a/magic_pdf/config/drop_reason.py
+++ b/magic_pdf/config/drop_reason.py
-class DropReason:
-    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # 文字块有水平互相覆盖，导致无法准确定位文字顺序
-    USEFUL_BLOCK_HOR_OVERLAP = (
-        'useful_block_horizontal_overlap'  # 需保留的block水平覆盖
-    )
-    COMPLICATED_LAYOUT = 'complicated_layout'  # 复杂的布局，暂时不支持
-    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # 目前不支持分栏超过2列的
-    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # 含有带色块的PDF，色块会改变阅读顺序，目前不支持带底色文字块的PDF。
-    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
-        'high_computational_load_by_imgs'  # 含特殊图片，计算量太大，从而丢弃
-    )
-    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
-        'high_computational_load_by_svgs'  # 特殊的SVG图，计算量太大，从而丢弃
-    )
-    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # 计算量超过负荷，当前方法下计算量消耗过大
-    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # 版面分析失败
-    Exception = '_exception'  # 解析中发生异常
-    ENCRYPTED = 'encrypted'  # PDF是加密的
-    EMPTY_PDF = 'total_page=0'  # PDF页面总数为0
-    NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # 不是文字版PDF，无法直接解析
-    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # 无法清晰的分段
-    TITLE_DETECTION_FAILED = 'title_detection_failed'  # 探测标题失败
-    TITLE_LEVEL_FAILED = (
-        'title_level_failed'  # 分析标题级别失败（例如一级、二级、三级标题）
-    )
-    PARA_SPLIT_FAILED = 'para_split_failed'  # 识别段落失败
-    PARA_MERGE_FAILED = 'para_merge_failed'  # 段落合并失败
-    NOT_ALLOW_LANGUAGE = 'not_allow_language'  # 不支持的语种
-    SPECIAL_PDF = 'special_pdf'
-    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # 无法精确判断文字分栏
-    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # 无法分析页面的版面
-    NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # 缩放导致 bbox 面积为负
-    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
-        'overlap_blocks_can_t_separation'  # 无法分离重叠的block
-    )
--- a/magic_pdf/config/drop_tag.py
+++ b/magic_pdf/config/drop_tag.py
-
-COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
-PAGE_NO = 'page-no'  # 页码
-CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # 页眉页脚内的文本
-VERTICAL_TEXT = 'vertical-text'  # 垂直文本
-ROTATE_TEXT = 'rotate-text'  # 旋转文本
-EMPTY_SIDE_BLOCK = 'empty-side-block'  # 边缘上的空白没有任何内容的block
-ON_IMAGE_TEXT = 'on-image-text'  # 文本在图片上
-ON_TABLE_TEXT = 'on-table-text'  # 文本在表格上
-
-
-class DropTag:
-    PAGE_NUMBER = 'page_no'
-    HEADER = 'header'
-    FOOTER = 'footer'
-    FOOTNOTE = 'footnote'
-    NOT_IN_LAYOUT = 'not_in_layout'
-    SPAN_OVERLAP = 'span_overlap'
-    BLOCK_OVERLAP = 'block_overlap'
--- a/magic_pdf/config/enums.py
+++ b/magic_pdf/config/enums.py
-
-import enum
-
-
-class SupportedPdfParseMethod(enum.Enum):
-    OCR = 'ocr'
-    TXT = 'txt'
--- a/magic_pdf/config/make_content_config.py
+++ b/magic_pdf/config/make_content_config.py
-class MakeMode:
-    MM_MD = 'mm_markdown'
-    NLP_MD = 'nlp_markdown'
-    STANDARD_FORMAT = 'standard_format'
-
-
-class DropMode:
-    WHOLE_PDF = 'whole_pdf'
-    SINGLE_PAGE = 'single_page'
-    NONE = 'none'
-    NONE_WITH_REASON = 'none_with_reason'
--- a/magic_pdf/config/model_block_type.py
+++ b/magic_pdf/config/model_block_type.py
-from enum import Enum
-
-
-class ModelBlockTypeEnum(Enum):
-    TITLE = 0
-    PLAIN_TEXT = 1
-    ABANDON = 2
-    ISOLATE_FORMULA = 8
-    EMBEDDING = 13
-    ISOLATED = 14
--- a/magic_pdf/config/ocr_content_type.py
+++ b/magic_pdf/config/ocr_content_type.py
-class ContentType:
-    Image = 'image'
-    Table = 'table'
-    Text = 'text'
-    InlineEquation = 'inline_equation'
-    InterlineEquation = 'interline_equation'
-
-
-class BlockType:
-    Image = 'image'
-    ImageBody = 'image_body'
-    ImageCaption = 'image_caption'
-    ImageFootnote = 'image_footnote'
-    Table = 'table'
-    TableBody = 'table_body'
-    TableCaption = 'table_caption'
-    TableFootnote = 'table_footnote'
-    Text = 'text'
-    Title = 'title'
-    InterlineEquation = 'interline_equation'
-    Footnote = 'footnote'
-    Discarded = 'discarded'
-    List = 'list'
-    Index = 'index'
-
-
-class CategoryId:
-    Title = 0
-    Text = 1
-    Abandon = 2
-    ImageBody = 3
-    ImageCaption = 4
-    TableBody = 5
-    TableCaption = 6
-    TableFootnote = 7
-    InterlineEquation_Layout = 8
-    InlineEquation = 13
-    InterlineEquation_YOLO = 14
-    OcrText = 15
-    ImageFootnote = 101
--- a/magic_pdf/data/batch_build_dataset.py
+++ b/magic_pdf/data/batch_build_dataset.py
-import concurrent.futures
-
-import fitz
-
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF
-
-
-def partition_array_greedy(arr, k):
-    """Partition an array into k parts using a simple greedy approach.
-
-    Parameters:
-    -----------
-    arr : list
-        The input array of integers
-    k : int
-        Number of partitions to create
-
-    Returns:
-    --------
-    partitions : list of lists
-        The k partitions of the array
-    """
-    # Handle edge cases
-    if k <= 0:
-        raise ValueError('k must be a positive integer')
-    if k > len(arr):
-        k = len(arr)  # Adjust k if it's too large
-    if k == 1:
-        return [list(range(len(arr)))]
-    if k == len(arr):
-        return [[i] for i in range(len(arr))]
-
-    # Sort the array in descending order
-    sorted_indices = sorted(range(len(arr)), key=lambda i: arr[i][1], reverse=True)
-
-    # Initialize k empty partitions
-    partitions = [[] for _ in range(k)]
-    partition_sums = [0] * k
-
-    # Assign each element to the partition with the smallest current sum
-    for idx in sorted_indices:
-        # Find the partition with the smallest sum
-        min_sum_idx = partition_sums.index(min(partition_sums))
-
-        # Add the element to this partition
-        partitions[min_sum_idx].append(idx)  # Store the original index
-        partition_sums[min_sum_idx] += arr[idx][1]
-
-    return partitions
-
-
-def process_pdf_batch(pdf_jobs, idx):
-    """Process a batch of PDF pages using multiple threads.
-
-    Parameters:
-    -----------
-    pdf_jobs : list of tuples
-        List of (pdf_path, page_num) tuples
-    output_dir : str or None
-        Directory to save images to
-    num_threads : int
-        Number of threads to use
-    **kwargs :
-        Additional arguments for process_pdf_page
-
-    Returns:
-    --------
-    images : list
-        List of processed images
-    """
-    images = []
-
-    for pdf_path, _ in pdf_jobs:
-        doc = fitz.open(pdf_path)
-        tmp = []
-        for page_num in range(len(doc)):
-            page = doc[page_num]
-            tmp.append(fitz_doc_to_image(page))
-        images.append(tmp)
-    return (idx, images)
-
-
-def batch_build_dataset(pdf_paths, k, lang=None):
-    """Process multiple PDFs by partitioning them into k balanced parts and
-    processing each part in parallel.
-
-    Parameters:
-    -----------
-    pdf_paths : list
-        List of paths to PDF files
-    k : int
-        Number of partitions to create
-    output_dir : str or None
-        Directory to save images to
-    threads_per_worker : int
-        Number of threads to use per worker
-    **kwargs :
-        Additional arguments for process_pdf_page
-
-    Returns:
-    --------
-    all_images : list
-        List of all processed images
-    """
-
-    results = []
-    for pdf_path in pdf_paths:
-        with open(pdf_path, 'rb') as f:
-            pdf_bytes = f.read()
-        dataset = PymuDocDataset(pdf_bytes, lang=lang)
-        results.append(dataset)
-    return results
-
-
-    #
-    # # Get page counts for each PDF
-    # pdf_info = []
-    # total_pages = 0
-    #
-    # for pdf_path in pdf_paths:
-    #     try:
-    #         doc = fitz.open(pdf_path)
-    #         num_pages = len(doc)
-    #         pdf_info.append((pdf_path, num_pages))
-    #         total_pages += num_pages
-    #         doc.close()
-    #     except Exception as e:
-    #         print(f'Error opening {pdf_path}: {e}')
-    #
-    # # Partition the jobs based on page countEach job has 1 page
-    # partitions = partition_array_greedy(pdf_info, k)
-    #
-    # # Process each partition in parallel
-    # all_images_h = {}
-    #
-    # with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
-    #     # Submit one task per partition
-    #     futures = []
-    #     for sn, partition in enumerate(partitions):
-    #         # Get the jobs for this partition
-    #         partition_jobs = [pdf_info[idx] for idx in partition]
-    #
-    #         # Submit the task
-    #         future = executor.submit(
-    #             process_pdf_batch,
-    #             partition_jobs,
-    #             sn
-    #         )
-    #         futures.append(future)
-    #     # Process results as they complete
-    #     for i, future in enumerate(concurrent.futures.as_completed(futures)):
-    #         try:
-    #             idx, images = future.result()
-    #             all_images_h[idx] = images
-    #         except Exception as e:
-    #             print(f'Error processing partition: {e}')
-    # results = [None] * len(pdf_paths)
-    # for i in range(len(partitions)):
-    #     partition = partitions[i]
-    #     for j in range(len(partition)):
-    #         with open(pdf_info[partition[j]][0], 'rb') as f:
-    #             pdf_bytes = f.read()
-    #         dataset = PymuDocDataset(pdf_bytes, lang=lang)
-    #         dataset.set_images(all_images_h[i][j])
-    #         results[partition[j]] = dataset
-    # return results
\ No newline at end of file
--- a/magic_pdf/data/data_reader_writer/__init__.py
+++ b/magic_pdf/data/data_reader_writer/__init__.py
-from magic_pdf.data.data_reader_writer.filebase import \
-    FileBasedDataReader  # noqa: F401
-from magic_pdf.data.data_reader_writer.filebase import \
-    FileBasedDataWriter  # noqa: F401
-from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
-    MultiBucketS3DataReader  # noqa: F401
-from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
-    MultiBucketS3DataWriter  # noqa: F401
-from magic_pdf.data.data_reader_writer.s3 import S3DataReader  # noqa: F401
-from magic_pdf.data.data_reader_writer.s3 import S3DataWriter  # noqa: F401
-from magic_pdf.data.data_reader_writer.base import DataReader  # noqa: F401
-from magic_pdf.data.data_reader_writer.base import DataWriter  # noqa: F401
\ No newline at end of file
--- a/magic_pdf/data/dataset.py
+++ b/magic_pdf/data/dataset.py
-import os
-from abc import ABC, abstractmethod
-from typing import Callable, Iterator
-
-import fitz
-from loguru import logger
-
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.data.schemas import PageInfo
-from magic_pdf.data.utils import fitz_doc_to_image
-from magic_pdf.filter import classify
-
-
-class PageableData(ABC):
-    @abstractmethod
-    def get_image(self) -> dict:
-        """Transform data to image."""
-        pass
-
-    @abstractmethod
-    def get_doc(self) -> fitz.Page:
-        """Get the pymudoc page."""
-        pass
-
-    @abstractmethod
-    def get_page_info(self) -> PageInfo:
-        """Get the page info of the page.
-
-        Returns:
-            PageInfo: the page info of this page
-        """
-        pass
-
-    @abstractmethod
-    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
-        """draw rectangle.
-
-        Args:
-            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
-            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
-            fill (list[float] | None): fill the board with RGB, None means will not fill with color
-            fill_opacity (float): opacity of the fill, range from [0, 1]
-            width (float): the width of board
-            overlay (bool): fill the color in foreground or background. True means fill in background.
-        """
-        pass
-
-    @abstractmethod
-    def insert_text(self, coord, content, fontsize, color):
-        """insert text.
-
-        Args:
-            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
-            content (str): the text content
-            fontsize (int): font size of the text
-            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
-        """
-        pass
-
-
-class Dataset(ABC):
-    @abstractmethod
-    def __len__(self) -> int:
-        """The length of the dataset."""
-        pass
-
-    @abstractmethod
-    def __iter__(self) -> Iterator[PageableData]:
-        """Yield the page data."""
-        pass
-
-    @abstractmethod
-    def supported_methods(self) -> list[SupportedPdfParseMethod]:
-        """The methods that this dataset support.
-
-        Returns:
-            list[SupportedPdfParseMethod]: The supported methods, Valid methods are: OCR, TXT
-        """
-        pass
-
-    @abstractmethod
-    def data_bits(self) -> bytes:
-        """The bits used to create this dataset."""
-        pass
-
-    @abstractmethod
-    def get_page(self, page_id: int) -> PageableData:
-        """Get the page indexed by page_id.
-
-        Args:
-            page_id (int): the index of the page
-
-        Returns:
-            PageableData: the page doc object
-        """
-        pass
-
-    @abstractmethod
-    def dump_to_file(self, file_path: str):
-        """Dump the file.
-
-        Args:
-            file_path (str): the file path
-        """
-        pass
-
-    @abstractmethod
-    def apply(self, proc: Callable, *args, **kwargs):
-        """Apply callable method which.
-
-        Args:
-            proc (Callable): invoke proc as follows:
-                proc(self, *args, **kwargs)
-
-        Returns:
-            Any: return the result generated by proc
-        """
-        pass
-
-    @abstractmethod
-    def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset.
-
-        Returns:
-            SupportedPdfParseMethod: _description_
-        """
-        pass
-
-    @abstractmethod
-    def clone(self):
-        """clone this dataset."""
-        pass
-
-
-class PymuDocDataset(Dataset):
-    def __init__(self, bits: bytes, lang=None):
-        """Initialize the dataset, which wraps the pymudoc documents.
-
-        Args:
-            bits (bytes): the bytes of the pdf
-        """
-        self._raw_fitz = fitz.open('pdf', bits)
-        self._records = [Doc(v) for v in self._raw_fitz]
-        self._data_bits = bits
-        self._raw_data = bits
-        self._classify_result = None
-
-        if lang == '':
-            self._lang = None
-        elif lang == 'auto':
-            from magic_pdf.model.sub_modules.language_detection.utils import \
-                auto_detect_lang
-            self._lang = auto_detect_lang(self._data_bits)
-            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
-        else:
-            self._lang = lang
-            logger.info(f'lang: {lang}')
-
-    def __len__(self) -> int:
-        """The page number of the pdf."""
-        return len(self._records)
-
-    def __iter__(self) -> Iterator[PageableData]:
-        """Yield the page doc object."""
-        return iter(self._records)
-
-    def supported_methods(self) -> list[SupportedPdfParseMethod]:
-        """The method supported by this dataset.
-
-        Returns:
-            list[SupportedPdfParseMethod]: the supported methods
-        """
-        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]
-
-    def data_bits(self) -> bytes:
-        """The pdf bits used to create this dataset."""
-        return self._data_bits
-
-    def get_page(self, page_id: int) -> PageableData:
-        """The page doc object.
-
-        Args:
-            page_id (int): the page doc index
-
-        Returns:
-            PageableData: the page doc object
-        """
-        return self._records[page_id]
-
-    def dump_to_file(self, file_path: str):
-        """Dump the file.
-
-        Args:
-            file_path (str): the file path
-        """
-
-        dir_name = os.path.dirname(file_path)
-        if dir_name not in ('', '.', '..'):
-            os.makedirs(dir_name, exist_ok=True)
-        self._raw_fitz.save(file_path)
-
-    def apply(self, proc: Callable, *args, **kwargs):
-        """Apply callable method which.
-
-        Args:
-            proc (Callable): invoke proc as follows:
-                proc(dataset, *args, **kwargs)
-
-        Returns:
-            Any: return the result generated by proc
-        """
-        if 'lang' in kwargs and self._lang is not None:
-            kwargs['lang'] = self._lang
-        return proc(self, *args, **kwargs)
-
-    def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset.
-
-        Returns:
-            SupportedPdfParseMethod: _description_
-        """
-        if self._classify_result is None:
-            self._classify_result = classify(self._data_bits)
-        return self._classify_result
-
-    def clone(self):
-        """clone this dataset."""
-        return PymuDocDataset(self._raw_data)
-
-    def set_images(self, images):
-        for i in range(len(self._records)):
-            self._records[i].set_image(images[i])
-
-class ImageDataset(Dataset):
-    def __init__(self, bits: bytes, lang=None):
-        """Initialize the dataset, which wraps the pymudoc documents.
-
-        Args:
-            bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
-        """
-        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-        self._raw_fitz = fitz.open('pdf', pdf_bytes)
-        self._records = [Doc(v) for v in self._raw_fitz]
-        self._raw_data = bits
-        self._data_bits = pdf_bytes
-
-        if lang == '':
-            self._lang = None
-        elif lang == 'auto':
-            from magic_pdf.model.sub_modules.language_detection.utils import \
-                auto_detect_lang
-            self._lang = auto_detect_lang(self._data_bits)
-            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
-        else:
-            self._lang = lang
-            logger.info(f'lang: {lang}')
-
-    def __len__(self) -> int:
-        """The length of the dataset."""
-        return len(self._records)
-
-    def __iter__(self) -> Iterator[PageableData]:
-        """Yield the page object."""
-        return iter(self._records)
-
-    def supported_methods(self):
-        """The method supported by this dataset.
-
-        Returns:
-            list[SupportedPdfParseMethod]: the supported methods
-        """
-        return [SupportedPdfParseMethod.OCR]
-
-    def data_bits(self) -> bytes:
-        """The pdf bits used to create this dataset."""
-        return self._data_bits
-
-    def get_page(self, page_id: int) -> PageableData:
-        """The page doc object.
-
-        Args:
-            page_id (int): the page doc index
-
-        Returns:
-            PageableData: the page doc object
-        """
-        return self._records[page_id]
-
-    def dump_to_file(self, file_path: str):
-        """Dump the file.
-
-        Args:
-            file_path (str): the file path
-        """
-        dir_name = os.path.dirname(file_path)
-        if dir_name not in ('', '.', '..'):
-            os.makedirs(dir_name, exist_ok=True)
-        self._raw_fitz.save(file_path)
-
-    def apply(self, proc: Callable, *args, **kwargs):
-        """Apply callable method which.
-
-        Args:
-            proc (Callable): invoke proc as follows:
-                proc(dataset, *args, **kwargs)
-
-        Returns:
-            Any: return the result generated by proc
-        """
-        return proc(self, *args, **kwargs)
-
-    def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset.
-
-        Returns:
-            SupportedPdfParseMethod: _description_
-        """
-        return SupportedPdfParseMethod.OCR
-
-    def clone(self):
-        """clone this dataset."""
-        return ImageDataset(self._raw_data)
-
-    def set_images(self, images):
-        for i in range(len(self._records)):
-            self._records[i].set_image(images[i])
-
-class Doc(PageableData):
-    """Initialized with pymudoc object."""
-
-    def __init__(self, doc: fitz.Page):
-        self._doc = doc
-        self._img = None
-
-    def get_image(self):
-        """Return the image info.
-
-        Returns:
-            dict: {
-                img: np.ndarray,
-                width: int,
-                height: int
-            }
-        """
-        if self._img is None:
-            self._img = fitz_doc_to_image(self._doc)
-        return self._img
-
-    def set_image(self, img):
-        """
-        Args:
-            img (np.ndarray): the image
-        """
-        if self._img is None:
-            self._img = img
-
-    def get_doc(self) -> fitz.Page:
-        """Get the pymudoc object.
-
-        Returns:
-            fitz.Page: the pymudoc object
-        """
-        return self._doc
-
-    def get_page_info(self) -> PageInfo:
-        """Get the page info of the page.
-
-        Returns:
-            PageInfo: the page info of this page
-        """
-        page_w = self._doc.rect.width
-        page_h = self._doc.rect.height
-        return PageInfo(w=page_w, h=page_h)
-
-    def __getattr__(self, name):
-        if hasattr(self._doc, name):
-            return getattr(self._doc, name)
-
-    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
-        """draw rectangle.
-
-        Args:
-            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
-            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
-            fill (list[float] | None): fill the board with RGB, None means will not fill with color
-            fill_opacity (float): opacity of the fill, range from [0, 1]
-            width (float): the width of board
-            overlay (bool): fill the color in foreground or background. True means fill in background.
-        """
-        self._doc.draw_rect(
-            rect_coords,
-            color=color,
-            fill=fill,
-            fill_opacity=fill_opacity,
-            width=width,
-            overlay=overlay,
-        )
-
-    def insert_text(self, coord, content, fontsize, color):
-        """insert text.
-
-        Args:
-            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
-            content (str): the text content
-            fontsize (int): font size of the text
-            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
-        """
-        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
\ No newline at end of file
--- a/magic_pdf/data/io/__init__.py
+++ b/magic_pdf/data/io/__init__.py
-
-from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
-from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
-from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401
-
-__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
--- a/magic_pdf/data/read_api.py
+++ b/magic_pdf/data/read_api.py
-import json
-import os
-import tempfile
-import shutil
-from pathlib import Path
-
-from magic_pdf.config.exceptions import EmptyData, InvalidParams
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               MultiBucketS3DataReader)
-from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
-from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
-
-def read_jsonl(
-    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
-) -> list[PymuDocDataset]:
-    """Read the jsonl file and return the list of PymuDocDataset.
-
-    Args:
-        s3_path_or_local (str): local file or s3 path
-        s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.
-
-    Raises:
-        InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
-        EmptyData: if no pdf file location is provided in some line of jsonl file.
-        InvalidParams: if the file location is s3 path but s3_client is not provided
-
-    Returns:
-        list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
-    """
-    bits_arr = []
-    if s3_path_or_local.startswith('s3://'):
-        if s3_client is None:
-            raise InvalidParams('s3_client is required when s3_path is provided')
-        jsonl_bits = s3_client.read(s3_path_or_local)
-    else:
-        jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)
-    jsonl_d = [
-        json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
-    ]
-    for d in jsonl_d:
-        pdf_path = d.get('file_location', '') or d.get('path', '')
-        if len(pdf_path) == 0:
-            raise EmptyData('pdf file location is empty')
-        if pdf_path.startswith('s3://'):
-            if s3_client is None:
-                raise InvalidParams('s3_client is required when s3_path is provided')
-            bits_arr.append(s3_client.read(pdf_path))
-        else:
-            bits_arr.append(FileBasedDataReader('').read(pdf_path))
-    return [PymuDocDataset(bits) for bits in bits_arr]
-
-
-def read_local_pdfs(path: str) -> list[PymuDocDataset]:
-    """Read pdf from path or directory.
-
-    Args:
-        path (str): pdf file path or directory that contains pdf files
-
-    Returns:
-        list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
-    """
-    if os.path.isdir(path):
-        reader = FileBasedDataReader()
-        ret = []
-        for root, _, files in os.walk(path):
-            for file in files:
-                suffix = file.split('.')
-                if suffix[-1] == 'pdf':
-                    ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
-        return ret
-    else:
-        reader = FileBasedDataReader()
-        bits = reader.read(path)
-        return [PymuDocDataset(bits)]
-
-def read_local_office(path: str) -> list[PymuDocDataset]:
-    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
-
-    Args:
-        path (str): ms-office file or directory that contains ms-office files
-
-    Returns:
-        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
-        
-    Raises:
-        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
-        FileNotFoundError: File not Found
-        Exception: Unknown Exception raised
-    """
-    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
-    fns = []
-    ret = []
-    if os.path.isdir(path):
-        for root, _, files in os.walk(path):
-            for file in files:
-                suffix = Path(file).suffix
-                if suffix in suffixes:
-                    fns.append((os.path.join(root, file)))
-    else:
-        fns.append(path)
-        
-    reader = FileBasedDataReader()
-    temp_dir = tempfile.mkdtemp()
-    for fn in fns:
-        try:
-            convert_file_to_pdf(fn, temp_dir)
-        except ConvertToPdfError as e:
-            raise e
-        except FileNotFoundError as e:
-            raise e
-        except Exception as e:
-            raise e
-        fn_path = Path(fn)
-        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
-        ret.append(PymuDocDataset(reader.read(pdf_fn)))
-    shutil.rmtree(temp_dir)
-    return ret
-
-def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg', '.jpeg']) -> list[ImageDataset]:
-    """Load a single image file, or every matching image below a directory.
-
-    Args:
-        path (str): an image file, or a directory that is walked recursively
-        suffixes (list[str]): suffixes accepted when ``path`` is a directory; compared
-            exactly (case-sensitively) with ``Path(file).suffix``. Not consulted when
-            ``path`` is a single file. Example: ['.jpg', '.png']
-
-    Returns:
-        list[ImageDataset]: one ImageDataset per image file that was read
-    """
-    if not os.path.isdir(path):
-        # A non-directory path is read as-is, without any suffix filtering.
-        return [ImageDataset(FileBasedDataReader().read(path))]
-
-    wanted = set(suffixes)
-    reader = FileBasedDataReader()
-    # Read every matching file during the walk, then wrap the raw bytes.
-    payloads = [
-        reader.read(os.path.join(root, name))
-        for root, _, names in os.walk(path)
-        for name in names
-        if Path(name).suffix in wanted
-    ]
-    return [ImageDataset(payload) for payload in payloads]
--- a/magic_pdf/data/utils.py
+++ b/magic_pdf/data/utils.py
-
-import multiprocessing as mp
-import threading
-from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
-                                as_completed)
-
-import fitz
-import numpy as np
-from loguru import logger
-
-
-
-def fitz_doc_to_image(page, dpi=200) -> dict:
-    """Render a page to a pixmap and expose its pixels as a numpy array.
-
-    Args:
-        page: object providing ``get_pixmap`` (presumably a PyMuPDF page)
-        dpi (int, optional): target resolution; the page is scaled by dpi / 72.
-            Defaults to 200.
-
-    Returns:
-        dict: {'img': uint8 array reshaped to (height, width, 3), 'width': width, 'height': height}
-    """
-    zoom = dpi / 72
-    pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
-
-    # A render with either side above 4500 px is redone with the identity matrix.
-    if max(pixmap.width, pixmap.height) > 4500:
-        pixmap = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
-
-    width, height = pixmap.width, pixmap.height
-    # Reinterpret the raw sample bytes as rows of 3-byte pixels.
-    pixels = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(height, width, 3)
-
-    return {'img': pixels, 'width': width, 'height': height}
-
-def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
-    """Render a page range of an in-memory PDF to image dicts.
-
-    The result always has one entry per page of the document. Pages outside
-    [start_page_id, end_page_id] get an empty placeholder, so list indices
-    keep matching page indices.
-
-    Args:
-        pdf_bytes (bytes): the serialized PDF
-        dpi (int, optional): resolution passed to fitz_doc_to_image. Defaults to 200.
-        start_page_id (int, optional): first page index to render. Defaults to 0.
-        end_page_id (int, optional): last page index to render (inclusive). None or a
-            negative value means "up to the last page"; a value past the last page is
-            clamped with a warning.
-
-    Returns:
-        list: dicts shaped like {'img': ..., 'width': ..., 'height': ...}; skipped pages
-        hold {'img': [], 'width': 0, 'height': 0}
-    """
-    with fitz.open('pdf', pdf_bytes) as doc:
-        last_page = doc.page_count - 1
-        if end_page_id is None or end_page_id < 0:
-            end_page_id = last_page
-        elif end_page_id > last_page:
-            logger.warning('end_page_id is out of range, use images length')
-            end_page_id = last_page
-
-        # Rendering is delegated to fitz_doc_to_image so the scaling and the
-        # 4500 px fallback live in exactly one place.
-        return [
-            fitz_doc_to_image(doc[index], dpi=dpi)
-            if start_page_id <= index <= end_page_id
-            else {'img': [], 'width': 0, 'height': 0}
-            for index in range(doc.page_count)
-        ]
-
-
-def convert_page(bytes_page):
-    """Render the first page of a serialized PDF to an image dict.
-
-    Args:
-        bytes_page (bytes): a serialized PDF; only its first page is rendered
-
-    Returns:
-        dict: the result of fitz_doc_to_image for that page
-    """
-    # Close the document once the page is rendered instead of leaking the handle;
-    # load_images_from_pdf likewise uses its image arrays after the document is closed.
-    with fitz.open('pdf', bytes_page) as pdfs:
-        return fitz_doc_to_image(pdfs[0])
-
-def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
-    """Convert serialized single-page PDFs to images in a process pool.
-
-    Args:
-        pages: iterable of serialized PDFs, each handed to convert_page
-        num_workers (int, optional): pool size; defaults to the CPU count
-        **kwargs: accepted but not used
-
-    Returns:
-        list: convert_page results, in the order of ``pages``
-    """
-    worker_count = mp.cpu_count() if num_workers is None else num_workers
-
-    # Only picklable bytes cross the process boundary; each worker reopens its page.
-    with ProcessPoolExecutor(max_workers=worker_count) as pool:
-        return list(pool.map(convert_page, pages))
-
-
-def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
-    """Process all pages of a PDF using multiple threads.
-
-    Parameters:
-    -----------
-    pdf_path : str
-        Path to the PDF file
-    num_threads : int
-        Number of threads to use
-    **kwargs :
-        Additional arguments for fitz_doc_to_image
-
-    Returns:
-    --------
-    images : list
-        List of processed images, in page order; a page whose conversion
-        failed is represented by None
-    """
-    # NOTE(review): every worker shares one document object; confirm that the
-    # installed PyMuPDF version tolerates concurrent access from several threads.
-    # The context manager closes the document even if submitting a task fails.
-    with fitz.open(pdf_path) as doc:
-        num_pages = len(doc)
-
-        # Pre-sized so each result lands at its page index regardless of
-        # completion order.
-        results = [None] * num_pages
-
-        with ThreadPoolExecutor(max_workers=num_threads) as executor:
-            futures = {
-                executor.submit(fitz_doc_to_image, doc[page_num], **kwargs): page_num
-                for page_num in range(num_pages)
-            }
-            for future in as_completed(futures):
-                page_num = futures[future]
-                try:
-                    results[page_num] = future.result()
-                except Exception as e:
-                    # Best effort: a failed page is reported and left as None.
-                    logger.error(f'Error processing page {page_num}: {e}')
-                    results[page_num] = None
-
-    # The original built this list but never returned it.
-    return results
-
-if __name__ == '__main__':
-    # Ad-hoc benchmark: split a local PDF into single-page documents and
-    # convert them to images in a process pool.
-    pdf = fitz.open('/tmp/[MS-DOC].pdf')
-
-    # One empty document per source page, each then filled with exactly that page.
-    pdf_page = [fitz.open() for _ in range(pdf.page_count)]
-    for i, single_page_doc in enumerate(pdf_page):
-        single_page_doc.insert_pdf(pdf, from_page=i, to_page=i)
-
-    # Serialize so the pages can be shipped to worker processes.
-    pdf_page = [v.tobytes() for v in pdf_page]
-    results = parallel_process_pdf_safe(pdf_page, num_workers=16)
-
-    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)
-
-    """ benchmark results of multi-threaded processing (fitz page to image)
-    total page nums: 578
-    thread nums,    time cost
-    1               7.351 sec
-    2               6.334 sec
-    4               5.968 sec
-    8               6.728 sec
-    16              8.085 sec
-    """
-
-    """ benchmark results of multi-processor processing (fitz page to image)
-    total page nums: 578
-    processor nums,    time cost
-    1                  17.170 sec
-    2                  10.170 sec
-    4                  7.841 sec
-    8                  7.900 sec
-    16                 7.984 sec
-    """
--- a/magic_pdf/filter/__init__.py
+++ b/magic_pdf/filter/__init__.py
-
-from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
-from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
-
-
-def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
-    """Decide from a PDF's scanned metadata whether it is a text PDF or needs OCR.
-
-    Args:
-        pdf_bytes (bytes): the serialized PDF
-
-    Returns:
-        SupportedPdfParseMethod: TXT when the classifier reports a text PDF, otherwise OCR
-
-    Raises:
-        Exception: the meta scan flagged the PDF to be dropped, or the PDF is
-            encrypted or needs a password
-    """
-    pdf_meta = pdf_meta_scan(pdf_bytes)
-
-    # The meta scan can flag a PDF as not processable at all.
-    if pdf_meta.get('_need_drop', False):
-        raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
-
-    # Encrypted or password-protected PDFs are rejected.
-    is_encrypted = pdf_meta['is_encrypted']
-    is_needs_password = pdf_meta['is_needs_password']
-    if is_encrypted or is_needs_password:
-        raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
-
-    # The per-rule breakdown returned alongside the verdict is not needed here.
-    is_text_pdf, _ = do_classify(
-        pdf_meta['total_page'],
-        pdf_meta['page_width_pts'],
-        pdf_meta['page_height_pts'],
-        pdf_meta['image_info_per_page'],
-        pdf_meta['text_len_per_page'],
-        pdf_meta['imgs_per_page'],
-        # pdf_meta['text_layout_per_page'],
-        pdf_meta['invalid_chars'],
-    )
-    return SupportedPdfParseMethod.TXT if is_text_pdf else SupportedPdfParseMethod.OCR
--- a/magic_pdf/filter/pdf_classify_by_type.py
+++ b/magic_pdf/filter/pdf_classify_by_type.py
-"""
-根据利用meta_scan得到的结果，对pdf是否为文字版进行分类。
-定义标准：
-一、什么pdf会是文字pdf，只要满足以下任意一条
-  1. 随机抽取N页，如果有任何一页文字数目大于100
-  2. 只要存在一个页面，图片的数量为0
-二、什么是扫描版pdf，只要满足以下任意一条
-  1. ~~80%页面上的最大图大小一样并且面积超过页面面积0.6~~
-  2. 大部分页面上文字的长度都是相等的。
-
-"""
-import json
-import sys
-from collections import Counter
-
-import click
-import numpy as np
-from loguru import logger
-
-from magic_pdf.libs.commons import mymax, get_top_percent_list
-from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
-
-TEXT_LEN_THRESHOLD = 100
-AVG_TEXT_LEN_THRESHOLD = 100
-TEXT_LEN_SAMPLE_RATIO = 0.1  # 抽取0.1的页面进行文字长度统计
-
-
-# 一个拼接图片的方案，将某些特殊扫描版本的拆图拼成一张整图
-def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
-    """Stitch adjacent image slices on each page into larger bounding boxes.
-
-    Some scanned PDFs store one page image as several slices. After dropping
-    images that repeat a bounding box, each page's images are ordered top to
-    bottom and left to right, and every image is compared with the most
-    recently kept box: a slice spanning at least 90% of the page width may be
-    stacked below it, and a slice spanning at least 90% of the page height may
-    be placed to its right.
-
-    Args:
-        image_list: per-page lists of images, each unpacked as (x0, y0, x1, y1, objid)
-        page_width: page width, in the same unit as the image coordinates
-        page_height: page height, in the same unit as the image coordinates
-        max_offset: tolerance for the two edges that must line up
-        max_gap: tolerance between the two edges that must touch
-
-    Returns:
-        list: one list of [x0, y0, x1, y1, objid] boxes per page that has images;
-        pages without images are omitted. A merged box carries the objid of the
-        last slice merged into it.
-    """
-    def near(value, target, tolerance):
-        return (target - tolerance) <= value <= (target + tolerance)
-
-    # Pass 1: per page, keep only the first image for each distinct bbox.
-    deduped_pages = []
-    for page_images in image_list:
-        seen_boxes = set()
-        unique = []
-        for x0, y0, x1, y1, objid in page_images:
-            box = (x0, y0, x1, y1)
-            if box in seen_boxes:
-                continue
-            seen_boxes.add(box)
-            unique.append([x0, y0, x1, y1, objid])
-        deduped_pages.append(unique)
-
-    # Pass 2: merge slices that continue the previously kept box.
-    merged_images = []
-    for boxes in deduped_pages:
-        if not boxes:
-            continue
-
-        ordered = sorted(boxes, key=lambda b: (b[1], b[0]))
-        merged = [ordered[0]]
-
-        for candidate in ordered[1:]:
-            x0, y0, x1, y1, imgid = candidate
-            prev_x0, prev_y0, prev_x1, prev_y1, _ = merged[-1]
-
-            spans_width = abs(x1 - x0) >= page_width * 0.9
-            spans_height = abs(y1 - y0) >= page_height * 0.9
-
-            # Vertical stacking: left/right edges aligned, top touches previous bottom.
-            stacks_below = (
-                spans_width
-                and near(x0, prev_x0, max_offset)
-                and near(x1, prev_x1, max_offset)
-                and near(y0, prev_y1, max_gap)
-            )
-            # Horizontal stacking: top/bottom edges aligned, left touches previous right.
-            sits_beside = (
-                spans_height
-                and near(y0, prev_y0, max_offset)
-                and near(y1, prev_y1, max_offset)
-                and near(x0, prev_x1, max_gap)
-            )
-
-            if stacks_below or sits_beside:
-                merged[-1] = [min(x0, prev_x0), min(y0, prev_y0),
-                              max(x1, prev_x1), max(y1, prev_y1), imgid]
-            else:
-                merged.append(candidate)
-
-        merged_images.append(merged)
-
-    return merged_images
-
-
-def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list):
-    """Vote "text PDF" unless large images dominate at least half of the pages.
-
-    Images whose object id occurs more than once in the whole list are discarded
-    first, and sliced page images are stitched together before areas are measured.
-    Several earlier heuristics (pages without images, oversized transparent
-    overlays) are no longer applied.
-
-    Args:
-        total_page (int): page count; capped at scan_max_page
-        page_width: page width
-        page_height: page height
-        img_sz_list: per-page lists of images, each unpacked as (x0, y0, x1, y1, objid)
-        text_len_list (list): not used by this rule
-
-    Returns:
-        bool: False when the number of pages whose largest image covers more than
-        half of the page area reaches 0.5 * total_page, otherwise True
-    """
-    # Count how often each image object id occurs across all pages.
-    objid_cnt = Counter()
-    for page_img_sz in img_sz_list:
-        for _, _, _, _, objid in page_img_sz:
-            objid_cnt[objid] += 1
-
-    # Pages beyond scan_max_page are not represented in the lists, so cap the total.
-    total_page = scan_max_page if total_page >= scan_max_page else total_page
-
-    # Any object id seen at least twice is treated as a repeated (bad) image.
-    repeat_threshold = 2
-    bad_image_objid = {objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold}
-    unique_images = [
-        [img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid]
-        for page_img_sz in img_sz_list
-    ]
-
-    # Some scans split one page image into many slices; stitch them before measuring.
-    stitched_pages = merge_images(unique_images, page_width, page_height)
-
-    # Largest image area per page, expressed as a fraction of the page area.
-    largest_areas = [
-        mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz])
-        for page_img_sz in stitched_pages
-    ]
-    page_area = page_width * page_height
-    dominant_ratios = [area / page_area for area in largest_areas]
-    dominant_ratios = [ratio for ratio in dominant_ratios if ratio > 0.5]
-
-    # The 0.5 threshold also covers "2 of 3 pages" and "1 of 2 pages".
-    return not (len(dominant_ratios) >= 0.5 * total_page)
-
-
-def classify_by_text_len(text_len_list: list, total_page: int):
-    """Vote "text PDF" when any randomly sampled page has enough text.
-
-    TEXT_LEN_SAMPLE_RATIO of the pages are sampled without replacement; when
-    that would be fewer than 5 pages, every page is checked instead.
-
-    Args:
-        text_len_list (list): text length of each scanned page
-        total_page (int): page count of the document
-
-    Returns:
-        bool: True if a sampled page has more than TEXT_LEN_THRESHOLD characters;
-        False otherwise, including when there are no pages to sample
-    """
-    # Only pages that actually have an entry can be sampled. Without this bound
-    # a total_page larger than the list produced out-of-range indices.
-    population = min(total_page, len(text_len_list))
-    if population <= 0:
-        return False
-
-    select_page_cnt = int(population * TEXT_LEN_SAMPLE_RATIO)
-    if select_page_cnt < 5:
-        select_page_cnt = population
-
-    page_num = np.random.choice(population, select_page_cnt, replace=False)
-    return any(text_len_list[i] > TEXT_LEN_THRESHOLD for i in page_num)
-
-
-def classify_by_avg_words(text_len_list: list):
-    """Vote "text PDF" only when the average text length per page is high enough.
-
-    Supplementary rule, aimed mainly at picture collections.
-
-    Args:
-        text_len_list (list): text length of each page
-
-    Returns:
-        bool: True if the rounded average exceeds AVG_TEXT_LEN_THRESHOLD;
-        False otherwise, including for an empty list
-    """
-    total_chars = sum(text_len_list)
-    page_count = len(text_len_list)
-    if page_count == 0:
-        return False
-
-    return round(total_chars / page_count) > AVG_TEXT_LEN_THRESHOLD
-
-
-def classify_by_img_num(img_sz_list: list, img_num_list: list):
-    """Vote against "text PDF" for scans that embed every scanned page on each page.
-
-    Supplementary rule. Such PDFs show up in the meta scan with (almost) no
-    entries left in img_sz_list after deduplication, while img_num_list holds
-    the same large count for most pages.
-
-    Args:
-        img_sz_list (list): per-page image lists
-        img_num_list (list): per-page image counts
-
-    Returns:
-        bool: False when all three conditions below hold (not a text PDF);
-        True otherwise, leaving the decision to the other rules
-    """
-    # Number of pages that still have at least one image entry.
-    non_empty_pages = len([item for item in img_sz_list if item])
-    # Subset of the counts returned by get_top_percent_list for a ratio of 0.8.
-    top_eighty_percent = get_top_percent_list(img_num_list, 0.8)
-
-    looks_like_repeated_scan = (
-        non_empty_pages <= 1
-        and len(set(top_eighty_percent)) == 1
-        and max(img_num_list) >= junk_limit_min
-    )
-    return not looks_like_repeated_scan
-
-
-def classify_by_text_layout(text_layout_per_page: list):
-    """Vote on "text PDF" from the share of vertically laid-out pages.
-
-    Args:
-        text_layout_per_page (list): one entry per page; 'vertical' and
-            'horizontal' are counted, any other value is ignored.
-
-    Returns:
-        bool: False if at least half of the pages with a known layout are
-        vertical, or if no page has a known layout; True otherwise.
-    """
-    vertical_pages = len([item for item in text_layout_per_page if item == 'vertical'])
-    horizontal_pages = len([item for item in text_layout_per_page if item == 'horizontal'])
-    known_layout_cnt = vertical_pages + horizontal_pages
-
-    # Unknown layout everywhere: default to "not a text PDF".
-    if known_layout_cnt == 0:
-        return False
-
-    # The 0.5 threshold also covers "2 of 3 pages" and "1 of 2 pages".
-    return not (vertical_pages / known_layout_cnt >= 0.5)
-
-
-def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
-    """Vote against "text PDF" when pages are mostly built from narrow image strips.
-
-    An image is a narrow strip when it spans at least 90% of the page width and
-    is at least 4 times wider than tall, or spans at least 90% of the page
-    height and is at least 4 times taller than wide. A page counts as a strip
-    page when it has at least 5 strips and they make up at least 80% of its images.
-
-    Args:
-        page_width (float): page width
-        page_height (float): page height
-        img_sz_list (list): per-page lists of images, each unpacked as
-            (x0, y0, x1, y1, last); only the four coordinates are used
-
-    Returns:
-        bool: True if fewer than half of all pages are strip pages (pages without
-        images count in the denominator), or if there are no pages at all;
-        False otherwise
-    """
-    # Without any page there is no evidence of strips; this also avoids the
-    # division by zero below.
-    if len(img_sz_list) == 0:
-        return True
-
-    def is_narrow_strip(img):
-        x0, y0, x1, y1, _ = img
-        width, height = x1 - x0, y1 - y0
-        return (
-            (width >= page_width * 0.9 and width >= height * 4)
-            or (height >= page_height * 0.9 and height >= width * 4)
-        )
-
-    narrow_strip_pages_count = 0
-    for page_img_list in img_sz_list:
-        # Pages without images can never be strip pages.
-        if not page_img_list:
-            continue
-
-        narrow_strip_images_count = sum(1 for img in page_img_list if is_narrow_strip(img))
-        if narrow_strip_images_count >= 5 and narrow_strip_images_count / len(page_img_list) >= 0.8:
-            narrow_strip_pages_count += 1
-
-    return narrow_strip_pages_count / len(img_sz_list) < 0.5
-
-
-def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             # text_layout_list: list,
-             invalid_chars: bool):
-    """Combine all classification rules into one text-PDF verdict.
-
-    Image and page dimensions are in pts.
-
-    Args:
-        total_page (int): page count
-        page_width: page width
-        page_height: page height
-        img_sz_list (list): per-page image lists
-        text_len_list (list): per-page text lengths
-        img_num_list (list): per-page image counts
-        invalid_chars (bool): used directly as the 'by_invalid_chars' vote
-
-    Returns:
-        tuple: (is_text_pdf, results). is_text_pdf is True only when every rule
-        votes True; results maps each rule name to its vote.
-    """
-    results = {
-        'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list),
-        'by_text_len': classify_by_text_len(text_len_list, total_page),
-        'by_avg_words': classify_by_avg_words(text_len_list),
-        'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
-        # 'by_text_layout': classify_by_text_layout(text_layout_list),
-        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
-        'by_invalid_chars': invalid_chars,
-    }
-
-    if all(results.values()):
-        return True, results
-
-    if any(results.values()):
-        # The rules disagree. Logging the breakdown makes unusual PDFs easy to
-        # find when tuning the rules. The former ``file=sys.stderr`` argument is
-        # dropped: it belongs to print(), and the logger would treat it as a
-        # formatting argument for the message.
-        logger.warning(
-            f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
-            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
-            # f" by_text_layout: {results['by_text_layout']},"
-            f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
-            f" by_invalid_chars: {results['by_invalid_chars']}")
-
-    return False, results
-
-
-@click.command()
-@click.option("--json-file", type=str, help="pdf信息")
-def main(json_file):
-    """Classify every PDF metadata record of a JSON-lines file.
-
-    Each non-blank line is parsed as one JSON record. Records that are
-    encrypted, need a password or have no pages are skipped; every other record
-    is printed to stdout with an added boolean 'is_text_pdf' field.
-
-    Args:
-        json_file: path of the JSON-lines file; when missing, a message is
-            written to stderr and the program exits with status 0
-    """
-    if json_file is None:
-        print("json_file is None", file=sys.stderr)
-        sys.exit(0)
-    try:
-        with open(json_file, "r") as f:
-            for line in f:
-                if line.strip() == "":
-                    continue
-                o = json.loads(line)
-                total_page = o["total_page"]
-                # Encrypted, password-protected and empty PDFs are not classified.
-                if o['is_encrypted'] or total_page == 0 or o['is_needs_password']:
-                    continue
-                # classify() takes the per-page image counts and the invalid-chars
-                # flag; the old call still passed the removed text-layout list and
-                # therefore always failed with a TypeError.
-                # NOTE(review): assumes records carry 'imgs_per_page' and
-                # 'invalid_chars', the keys magic_pdf/filter/__init__.py reads
-                # from the meta scan result - confirm against the producer.
-                is_text_pdf, _ = classify(
-                    total_page,
-                    o["page_width_pts"],
-                    o["page_height_pts"],
-                    o["image_info_per_page"],
-                    o['text_len_per_page'],
-                    o['imgs_per_page'],
-                    o['invalid_chars'],
-                )
-                # Store only the verdict, not the (verdict, per-rule results) tuple.
-                o['is_text_pdf'] = is_text_pdf
-                print(json.dumps(o, ensure_ascii=False))
-    except Exception as e:
-        # Top-level boundary of the CLI: report the failure and end normally.
-        print("ERROR: ", e, file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
-    # false = False
-    # true = True
-    # null = None
-    # o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]]
,[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]],[[0,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]]
,[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]]
,[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,51
3,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,286
4]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[
[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,513,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,
368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],[[0,0,368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,2085]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,5
13,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,0,368,513,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,24
33]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],
[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,5
4,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}}
-    # o = json.loads(json.dumps(o))
-    # total_page = o["total_page"]
-    # page_width = o["page_width_pts"]
-    # page_height = o["page_height_pts"]
-    # img_sz_list = o["image_info_per_page"]
-    # text_len_list = o['text_len_per_page']
-    # pdf_path = o['pdf_path']
-    # is_encrypted = o['is_encrypted']
-    # is_needs_password = o['is_needs_password']
-    # if is_encrypted or total_page == 0 or is_needs_password:  # 加密的，需要密码的，没有页面的，都不处理
-    #     print("加密的")
-    #     exit(0)
-    # tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list)
-    # o['is_text_pdf'] = tag
-    # print(json.dumps(o, ensure_ascii=False))
--- a/magic_pdf/filter/pdf_meta_scan.py
+++ b/magic_pdf/filter/pdf_meta_scan.py
-"""输入： s3路径，每行一个 输出： pdf文件元信息，包括每一页上的所有图片的长宽高，bbox位置."""
-
-from collections import Counter
-
-import fitz
-from loguru import logger
-
-from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.commons import get_top_percent_list, mymax
-from magic_pdf.libs.language import detect_lang
-from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
-
-# Page cap used by get_image_info, get_pdf_text_layout_per_page and
-# get_language: pages at index >= scan_max_page are not visited by them.
-scan_max_page = 50
-# Lower bound for the junk-image repeat threshold in get_image_info; it is
-# also compared against the largest per-page image count there.
-junk_limit_min = 10
-
-
-def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
-    """Return the coverage ratios of pages dominated by one large image.
-
-    :param result: per-page lists of ``[x0, y0, x1, y1, img_id]`` entries
-    :param page_width_pts: page width (truncated with ``int``)
-    :param page_height_pts: page height (truncated with ``int``)
-    :return: for every page whose largest image covers more than 60% of the
-        page area, that largest-image / page-area ratio; other pages are
-        left out, so the list may be shorter than ``result``
-    """
-    # Largest image area found on each page.
-    largest_areas = []
-    for page_images in result:
-        areas = [
-            (right - left) * (bottom - top)
-            for left, top, right, bottom, _ in page_images
-        ]
-        largest_areas.append(mymax(areas))
-
-    page_area = int(page_width_pts) * int(page_height_pts)
-
-    # Keep only the ratios above the 0.6 "big image" threshold.
-    big_image_ratios = []
-    for largest in largest_areas:
-        ratio = largest / page_area
-        if ratio > 0.6:
-            big_image_ratios.append(ratio)
-    return big_image_ratios
-
-
-def process_image(page, junk_img_bojids=None):
-    """Collect the visible, de-duplicated image boxes shown on one page.
-
-    :param page: page object providing ``get_images()`` and
-        ``get_image_rects()``
-    :param junk_img_bojids: image ids to skip; ``None`` (the default) skips
-        nothing
-    :return: list of ``[x0, y0, x1, y1, img_bojid]`` entries with integer
-        coordinates, in the order the images are reported by the page
-    """
-    # A ``None`` default replaces the former shared mutable ``[]`` default.
-    junk_ids = () if junk_img_bojids is None else junk_img_bojids
-
-    page_result = []  # image entries found on this page
-    seen = set()
-    for img in page.get_images():
-        # The first field is the image id. It is unique within the PDF, so an
-        # id that keeps reappearing is probably junk such as a watermark or a
-        # header/footer.
-        img_bojid = img[0]
-        if img_bojid in junk_ids:  # junk image: skip it
-            continue
-        # The rectangle is the size the image is actually displayed at on the
-        # page. Only the first reported placement is used.
-        recs = page.get_image_rects(img, transform=True)
-        if not recs:
-            continue
-        x0, y0, x1, y1 = map(int, recs[0][0])
-        entry = (x0, y0, x1, y1, img_bojid)
-        if entry in seen:  # the same bbox can be reported more than once
-            continue
-        # A zero width or height means the image is not visible at all.
-        if x1 - x0 == 0 or y1 - y0 == 0:
-            continue
-        seen.add(entry)
-        page_result.append(list(entry))
-    return page_result
-
-
-def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> tuple:
-    """Collect per-page image boxes and decide which image ids are junk.
-
-    The first ``special_limit_pages`` (10) pages are sampled without any junk
-    filter to decide whether the junk candidates should be kept; the first
-    ``scan_max_page`` pages are then processed with the final junk list.
-
-    :param doc: the opened PDF document
-    :param page_width_pts: page width, used for the image/page area ratio
-    :param page_height_pts: page height, used for the image/page area ratio
-    :return: ``(result, junk_img_bojids)`` where ``result`` holds one list of
-        ``[x0, y0, x1, y1, img_bojid]`` entries per processed page and
-        ``junk_img_bojids`` is the list of image ids that were skipped
-    """
-    # Count how often each image id is referenced across all pages.
-    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
-
-    # An id is a junk candidate when it appears at least len(doc) / 2 times;
-    # the junk_limit_min floor exempts documents with few pages.
-    junk_limit = max(len(doc) * 0.5, junk_limit_min)
-    junk_img_bojids = [
-        img_bojid
-        for img_bojid, count in img_bojid_counter.items()
-        if count >= junk_limit
-    ]
-
-    # TODO: junk images should satisfy more than a repeat count: they should
-    # also cover a large share of the page and be of similar size.
-    # Known layouts that can be misjudged here:
-    #   scanned type 1: every page stores all scanned images; images are
-    #     large and one is displayed per page
-    #   scanned type 2: the number of stored images grows page by page;
-    #     images are large and one is displayed per page -> the junk list has
-    #     to be cleared and the first 50 pages processed in full
-    #   text type 1: every page stores all images; images are small and a
-    #     page may display zero or several -> sample the first 10 pages and
-    #     clear the junk list when the pattern matches
-    imgs_len_list = [len(page.get_images()) for page in doc]
-
-    special_limit_pages = 10
-
-    # Sample the first ten pages with no junk filter applied.
-    result = []
-    found_page_without_images = False
-    for i, page in enumerate(doc):
-        if i >= special_limit_pages:
-            break
-        page_result = process_image(page)
-        result.append(page_result)
-        # Earlier pages were already checked on earlier iterations, so only
-        # the page just processed needs to be looked at.
-        if not page_result:
-            # A page without displayed images points to a text PDF. When every
-            # page stores the same, large number of images it is the special
-            # text variant and the junk list is cleared; otherwise it is an
-            # ordinary text PDF with junk images and the list is kept.
-            if (
-                max(imgs_len_list) == min(imgs_len_list)
-                and max(imgs_len_list) >= junk_limit_min
-            ):
-                junk_img_bojids = []
-            found_page_without_images = True
-            break
-
-    if not found_page_without_images:
-        # Every sampled page displays at least one image.
-        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
-        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
-            # The top 80% of per-page image counts are identical and large:
-            # use the share of big images to tell scanned from text PDFs.
-            max_image_area_per_page = calculate_max_image_area_per_page(
-                result, page_width_pts, page_height_pts
-            )
-            if len(max_image_area_per_page) < 0.8 * special_limit_pages:
-                # Fewer than 80% of the sampled pages are dominated by a big
-                # image: probably a text PDF, so clear the junk list.
-                junk_img_bojids = []
-            # Otherwise this is scanned type 1 and the junk list is kept.
-        else:
-            # Per-page image counts differ: clear the junk list and process
-            # the first 50 pages in full.
-            junk_img_bojids = []
-
-    # Final pass over the first scan_max_page pages with the junk filter.
-    result = []
-    for i, page in enumerate(doc):
-        if i >= scan_max_page:
-            break
-        result.append(process_image(page, junk_img_bojids))
-
-    return result, junk_img_bojids
-
-
-def get_pdf_page_size_pts(doc: fitz.Document):
-    """Return the median page width and height over the first 50 pages.
-
-    Width and height medians are taken independently, which keeps a few
-    landscape pages inside a portrait document from swapping the result.
-
-    :param doc: the opened PDF document
-    :return: ``(median_width, median_height)``; with an even number of
-        sampled pages the upper of the two middle values is used
-    """
-    sample_count = min(len(doc), 50)
-    rects = [doc[index].rect for index in range(sample_count)]
-
-    widths = sorted(rect.width for rect in rects)
-    heights = sorted(rect.height for rect in rects)
-
-    return widths[len(widths) // 2], heights[len(heights) // 2]
-
-
-def get_pdf_textlen_per_page(doc: fitz.Document):
-    """Return the character count of the plain text of every page.
-
-    :param doc: the opened PDF document (any iterable of pages)
-    :return: one ``len(page.get_text('text'))`` value per page, in page order
-    """
-    # The plain-text string is measured rather than the "blocks" or "words"
-    # extraction modes.
-    return [len(page.get_text('text')) for page in doc]
-
-
-def get_pdf_text_layout_per_page(doc: fitz.Document):
-    """Classify the dominant text direction of each inspected page.
-
-    Only the first ``scan_max_page`` pages are inspected.
-
-    Args:
-        doc (fitz.Document): the PDF document to inspect.
-
-    Returns:
-        List[str]: one entry per inspected page: 'vertical', 'horizontal',
-        or 'unknow' when no line on the page could be counted.
-    """
-    layouts = []
-    for page_no, page in enumerate(doc):
-        if page_no >= scan_max_page:
-            break
-        # Per-page counters of vertical and horizontal text lines.
-        n_vertical = 0
-        n_horizontal = 0
-        for block in page.get_text('dict').get('blocks', []):
-            for line in block.get('lines', []):
-                left, top, right, bottom = line['bbox']
-                bbox_area = (right - left) * (bottom - top)
-                sizes = [span['size'] for span in line['spans'] if 'size' in span]
-                # Fall back to a font size of 10 when no span reports one.
-                mean_size = sum(sizes) / len(sizes) if sizes else 10
-                # A box no larger than one glyph (mean font size squared) is
-                # a single character, whose direction cannot be determined.
-                if bbox_area <= mean_size**2:
-                    continue
-                # The direction is read from the line's writing mode; lines
-                # without a 'wmode' key are not counted.
-                wmode = line.get('wmode')
-                if wmode == 1:  # vertical text
-                    n_vertical += 1
-                elif wmode == 0:  # horizontal text
-                    n_horizontal += 1
-
-        if n_vertical == 0 and n_horizontal == 0:
-            # No countable text on this page, so the layout is undecidable.
-            layouts.append('unknow')
-        elif n_vertical > n_horizontal:
-            layouts.append('vertical')
-        else:
-            # Ties count as horizontal.
-            layouts.append('horizontal')
-    return layouts
-
-
-"""定义一个自定义异常用来抛出单页svg太多的pdf"""
-
-
-class PageSvgsTooManyError(Exception):
-    """Raised for PDFs in which a single page holds too many SVG drawings."""
-
-    def __init__(self, message='Page SVGs are too many'):
-        super().__init__(message)
-        # Also kept as an attribute for direct access.
-        self.message = message
-
-
-def get_svgs_per_page(doc: fitz.Document):
-    """Return the number of vector drawings on every page.
-
-    :param doc: the opened PDF document (any iterable of pages)
-    :return: one drawing count per page, in page order
-    :raises PageSvgsTooManyError: as soon as a page reports 3000 or more
-        drawings
-    """
-    counts = []
-    for page in doc:
-        # get_cdrawings() is used instead of get_drawings() because it is
-        # more efficient.
-        drawing_count = len(page.get_cdrawings())
-        if drawing_count >= 3000:
-            raise PageSvgsTooManyError()
-        counts.append(drawing_count)
-    return counts
-
-
-def get_imgs_per_page(doc: fitz.Document):
-    """Return the number of images referenced by every page.
-
-    :param doc: the opened PDF document (any iterable of pages)
-    :return: one ``len(page.get_images())`` value per page, in page order
-    """
-    return [len(page.get_images()) for page in doc]
-
-
-def get_language(doc: fitz.Document):
-    """
-    Return the language detected most often across the scanned pages.
-
-    Only the first ``scan_max_page`` pages are examined.
-    Args:
-        doc (fitz.Document): the PDF document to inspect.
-    Returns:
-        str: the document language, e.g. "en-US".
-    """
-    page_languages = []
-    for page_no, page in enumerate(doc):
-        if page_no >= scan_max_page:
-            break
-        # Detect on the page's full plain text.
-        page_languages.append(detect_lang(page.get_text('text')))
-
-    # Tally the per-page results and pick the most frequent language.
-    tally = Counter(page_languages)
-    return max(tally, key=tally.get)
-
-
-def check_invalid_chars(pdf_bytes):
-    """Garbled-text detection: return the result of ``detect_invalid_chars``."""
-    # Alternative PyMuPDF-based detector, currently disabled:
-    # return detect_invalid_chars_by_pymupdf(pdf_bytes)
-    return detect_invalid_chars(pdf_bytes)
-
-
-def pdf_meta_scan(pdf_bytes: bytes):
-    """Scan an in-memory PDF and summarise its metadata.
-
-    The document is assessed on: whether it is encrypted, whether it needs a
-    password, its page size, its page count, and how much text and how many
-    images each page carries.
-
-    :param pdf_bytes: raw bytes of the PDF file
-    :return: ``{'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}``
-        for a document without pages; otherwise a dict with the keys
-        ``is_needs_password``, ``is_encrypted``, ``total_page``,
-        ``page_width_pts``, ``page_height_pts``, ``image_info_per_page``,
-        ``text_len_per_page``, ``imgs_per_page``, ``junk_img_bojids``,
-        ``invalid_chars`` and ``metadata``
-    """
-    # The context manager closes the document on every exit path; it was
-    # previously left open.
-    with fitz.open('pdf', pdf_bytes) as doc:
-        is_needs_password = doc.needs_pass
-        is_encrypted = doc.is_encrypted
-        total_page = len(doc)
-        if total_page == 0:
-            logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}')
-            return {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}
-
-        page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
-        imgs_per_page = get_imgs_per_page(doc)
-        image_info_per_page, junk_img_bojids = get_image_info(
-            doc, page_width_pts, page_height_pts
-        )
-        text_len_per_page = get_pdf_textlen_per_page(doc)
-        invalid_chars = check_invalid_chars(pdf_bytes)
-
-        # Per-page SVG counts, text layout and text language are not part of
-        # the result: get_svgs_per_page, get_pdf_text_layout_per_page and
-        # get_language are not called here.
-        return {
-            'is_needs_password': is_needs_password,
-            'is_encrypted': is_encrypted,
-            'total_page': total_page,
-            'page_width_pts': int(page_width_pts),
-            'page_height_pts': int(page_height_pts),
-            'image_info_per_page': image_info_per_page,
-            'text_len_per_page': text_len_per_page,
-            'imgs_per_page': imgs_per_page,  # number of images on each page
-            'junk_img_bojids': junk_img_bojids,  # ids of the junk images
-            'invalid_chars': invalid_chars,
-            'metadata': doc.metadata,
-        }
-
-
-if __name__ == '__main__':
-    # Nothing runs here. The comments below are leftover manual-debugging
-    # notes: local sample paths and a disabled text-layout check.
-    pass
-    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
-    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
-    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
-    # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
-    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")  # noqa: E501
-    # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
-    # doc = fitz.open("pdf", file_content)
-    # text_layout_lst = get_pdf_text_layout_per_page(doc)
-    # print(text_layout_lst)
--- a/magic_pdf/integrations/__init__.py
+++ b/magic_pdf/integrations/__init__.py
--- a/magic_pdf/integrations/rag/__init__.py
+++ b/magic_pdf/integrations/rag/__init__.py
--- a/magic_pdf/integrations/rag/api.py
+++ b/magic_pdf/integrations/rag/api.py
-import os
-from pathlib import Path
-
-from loguru import logger
-
-from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
-                                             Node)
-from magic_pdf.integrations.rag.utils import inference
-
-
-class RagPageReader:
-    """Iterable over one page's layout detections, converted to ``Node`` objects."""
-
-    def __init__(self, pagedata: LayoutElements):
-        self.pagedata = pagedata
-
-        # Build one Node per layout detection, copying the fields over.
-        nodes = []
-        for det in pagedata.layout_dets:
-            nodes.append(
-                Node(
-                    category_type=det.category_type,
-                    text=det.text,
-                    image_path=det.image_path,
-                    anno_id=det.anno_id,
-                    latex=det.latex,
-                    html=det.html,
-                )
-            )
-        self.o = nodes
-
-    def __iter__(self):
-        """Iterate over the page's nodes."""
-        return iter(self.o)
-
-    def get_rel_map(self) -> list[ElementRelation]:
-        """Return the element relations stored in the page's extra data."""
-        return self.pagedata.extra.element_relation
-
-
-class RagDocumentReader:
-    """Iterable over the pages of one document, each wrapped in a ``RagPageReader``."""
-
-    def __init__(self, ragdata: list[LayoutElements]):
-        page_readers = []
-        for page_layout in ragdata:
-            page_readers.append(RagPageReader(page_layout))
-        self.o = page_readers
-
-    def __iter__(self):
-        """Iterate over the per-page readers."""
-        return iter(self.o)
-
-
-class DataReader:
-    """Run inference on one PDF file or on every ``*.pdf`` directly inside a directory."""
-
-    def __init__(self, path_or_directory: str, method: str, output_dir: str):
-        """
-        Args:
-            path_or_directory (str): a directory holding PDF files, or the
-                path of a single ``.pdf`` file
-            method (str): passed through to ``inference``
-            output_dir (str): passed through to ``inference``
-
-        Raises:
-            AssertionError: if ``path_or_directory`` is not a directory and
-                does not end with ``.pdf``
-        """
-        self.path_or_directory = path_or_directory
-        self.method = method
-        self.output_dir = output_dir
-        if os.path.isdir(path_or_directory):
-            # glob() yields entries in an unspecified order; sorting keeps
-            # document indices stable between runs.
-            self.pdfs = sorted(Path(path_or_directory).glob('*.pdf'))
-        else:
-            # An explicit check is used instead of ``assert`` so it still runs
-            # under ``python -O``. AssertionError is kept so callers that
-            # catch it keep working.
-            if not path_or_directory.endswith('.pdf'):
-                raise AssertionError(
-                    f'expected a directory or a .pdf file, got: {path_or_directory}'
-                )
-            self.pdfs = [Path(path_or_directory)]
-
-    def get_documents_count(self) -> int:
-        """Returns the number of documents in the directory."""
-        return len(self.pdfs)
-
-    def get_document_result(self, idx: int) -> RagDocumentReader | None:
-        """
-        Args:
-            idx (int): the index of documents under the
-                directory path_or_directory
-
-        Returns:
-            RagDocumentReader | None: RagDocumentReader is an iterable object,
-            more details @RagDocumentReader. None is returned when idx is
-            out of range or when inference returns None.
-        """
-        if not 0 <= idx < self.get_documents_count():
-            logger.error(f'invalid idx: {idx}')
-            return None
-        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
-        if res is None:
-            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
-            return None
-        return RagDocumentReader(res)
-
-    def get_document_filename(self, idx: int) -> Path:
-        """get the filename of the document."""
-        return self.pdfs[idx]