wangsen / MinerU · Commit bd927919

Commit bd927919, authored May 27, 2025 by myhloli

refactor: rename init file and update app.py to enable parsing method

Parent: f5016508
Showing 20 changed files with 0 additions and 2786 deletions:

+0 -37    magic_pdf/data/io/http.py
+0 -114   magic_pdf/data/io/s3.py
+0 -142   magic_pdf/data/read_api.py
+0 -19    magic_pdf/data/schemas.py
+0 -166   magic_pdf/data/utils.py
+0 -0     magic_pdf/dict2md/__init__.py
+0 -352   magic_pdf/dict2md/ocr_mkcontent.py
+0 -32    magic_pdf/filter/__init__.py
+0 -395   magic_pdf/filter/pdf_classify_by_type.py
+0 -397   magic_pdf/filter/pdf_meta_scan.py
+0 -0     magic_pdf/integrations/__init__.py
+0 -0     magic_pdf/integrations/rag/__init__.py
+0 -82    magic_pdf/integrations/rag/api.py
+0 -82    magic_pdf/integrations/rag/type.py
+0 -284   magic_pdf/integrations/rag/utils.py
+0 -0     magic_pdf/libs/__init__.py
+0 -485   magic_pdf/libs/boxbase.py
+0 -17    magic_pdf/libs/clean_memory.py
+0 -43    magic_pdf/libs/commons.py
+0 -139   magic_pdf/libs/config_reader.py
magic_pdf/data/io/http.py (deleted, 100644 → 0)
```python
import io

import requests

from magic_pdf.data.io.base import IOReader, IOWriter


class HttpReader(IOReader):

    def read(self, url: str) -> bytes:
        """Read the file at the given URL.

        Args:
            url (str): URL to read

        Returns:
            bytes: the content of the file
        """
        return requests.get(url).content

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Not Implemented."""
        raise NotImplementedError


class HttpWriter(IOWriter):

    def write(self, url: str, data: bytes) -> None:
        """Write data to the given URL.

        Args:
            url (str): URL to post the data to
            data (bytes): the data to write
        """
        files = {'file': io.BytesIO(data)}
        response = requests.post(url, files=files)
        assert 199 < response.status_code < 300  # require a 2xx response
```
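
For context, a minimal usage sketch of the removed HTTP I/O pair; the URLs are placeholders and the import path is the pre-removal one:

```python
# Hypothetical usage of the removed classes; the URLs are placeholders.
from magic_pdf.data.io.http import HttpReader, HttpWriter

reader = HttpReader()
pdf_bytes = reader.read('https://example.com/sample.pdf')  # plain GET, returns the body bytes

writer = HttpWriter()
writer.write('https://example.com/upload', pdf_bytes)  # multipart POST under the 'file' field
```

Note that `HttpReader.read_at` deliberately raises `NotImplementedError`: HTTP here is a whole-file transport, while ranged reads are only supported by the S3 backend below.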
magic_pdf/data/io/s3.py (deleted, 100644 → 0)
```python
import boto3
from botocore.config import Config

from magic_pdf.data.io.base import IOReader, IOWriter


class S3Reader(IOReader):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """S3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options are 'path' and 'virtual'.
                Refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def read(self, key: str) -> bytes:
        """Read the file.

        Args:
            key (str): key of the object to read

        Returns:
            bytes: the content of the file
        """
        return self.read_at(key)

    def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read the object at the given offset and limit.

        Args:
            key (str): key of the object to read
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the number of bytes to read. Defaults to -1 (read to the end).

        Returns:
            bytes: the content of the file
        """
        if limit > -1:
            range_header = f'bytes={offset}-{offset + limit - 1}'
            res = self._s3_client.get_object(
                Bucket=self._bucket, Key=key, Range=range_header)
        else:
            res = self._s3_client.get_object(
                Bucket=self._bucket, Key=key, Range=f'bytes={offset}-')
        return res['Body'].read()


class S3Writer(IOWriter):

    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """S3 writer client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options are 'path' and 'virtual'.
                Refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def write(self, key: str, data: bytes):
        """Write data to the object at the given key.

        Args:
            key (str): key of the object to write
            data (bytes): the data to write
        """
        self._s3_client.put_object(Bucket=self._bucket, Key=key, Body=data)
```
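
A minimal usage sketch of the removed S3 classes, with placeholder bucket, keys, and credentials:

```python
# Hypothetical usage of the removed classes; bucket, keys, and credentials are placeholders.
from magic_pdf.data.io.s3 import S3Reader, S3Writer

reader = S3Reader(bucket='my-bucket', ak='AKIA...', sk='secret...',
                  endpoint_url='https://s3.example.com')
head = reader.read_at('papers/sample.pdf', offset=0, limit=1024)  # first 1 KiB via an HTTP Range header
full = reader.read('papers/sample.pdf')                           # the whole object

writer = S3Writer(bucket='my-bucket', ak='AKIA...', sk='secret...',
                  endpoint_url='https://s3.example.com')
writer.write('papers/copy.pdf', full)
```

The range header `bytes={offset}-{offset + limit - 1}` follows HTTP's inclusive byte-range convention, which is why 1 is subtracted from the end offset.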
magic_pdf/data/read_api.py (deleted, 100644 → 0)
```python
import json
import os
import shutil
import tempfile
from pathlib import Path

from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.utils.office_to_pdf import (ConvertToPdfError,
                                           convert_file_to_pdf)


def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Read the jsonl file and return the list of PymuDocDataset.

    Args:
        s3_path_or_local (str): local file or s3 path
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that supports
            multiple buckets. Defaults to None.

    Raises:
        InvalidParams: if s3_path_or_local is an s3 path but s3_client is not provided.
        EmptyData: if some line of the jsonl file provides no pdf file location.
        InvalidParams: if a file location inside the jsonl is an s3 path but s3_client is not provided.

    Returns:
        list[PymuDocDataset]: each line in the jsonl file is converted to a PymuDocDataset
    """
    bits_arr = []
    if s3_path_or_local.startswith('s3://'):
        if s3_client is None:
            raise InvalidParams('s3_client is required when s3_path is provided')
        jsonl_bits = s3_client.read(s3_path_or_local)
    else:
        jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)

    jsonl_d = [
        json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
    ]
    for d in jsonl_d:
        pdf_path = d.get('file_location', '') or d.get('path', '')
        if len(pdf_path) == 0:
            raise EmptyData('pdf file location is empty')
        if pdf_path.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            bits_arr.append(s3_client.read(pdf_path))
        else:
            bits_arr.append(FileBasedDataReader('').read(pdf_path))
    return [PymuDocDataset(bits) for bits in bits_arr]


def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Read pdfs from a path or directory.

    Args:
        path (str): pdf file path or directory that contains pdf files

    Returns:
        list[PymuDocDataset]: each pdf file is converted to a PymuDocDataset
    """
    if os.path.isdir(path):
        reader = FileBasedDataReader()
        ret = []
        for root, _, files in os.walk(path):
            for file in files:
                suffix = file.split('.')
                if suffix[-1] == 'pdf':
                    ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
        return ret
    else:
        reader = FileBasedDataReader()
        bits = reader.read(path)
        return [PymuDocDataset(bits)]


def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read MS Office files (ppt, pptx, doc, docx) from a path or directory.

    Args:
        path (str): MS Office file or directory that contains MS Office files

    Returns:
        list[PymuDocDataset]: each MS Office file is converted to a PymuDocDataset

    Raises:
        ConvertToPdfError: failed to convert the MS Office file to pdf via libreoffice
        FileNotFoundError: file not found
        Exception: unknown exception
    """
    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
    fns = []
    ret = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
                suffix = Path(file).suffix
                if suffix in suffixes:
                    fns.append(os.path.join(root, file))
    else:
        fns.append(path)

    reader = FileBasedDataReader()
    temp_dir = tempfile.mkdtemp()
    for fn in fns:
        try:
            convert_file_to_pdf(fn, temp_dir)
        except ConvertToPdfError as e:
            raise e
        except FileNotFoundError as e:
            raise e
        except Exception as e:
            raise e
        fn_path = Path(fn)
        pdf_fn = f'{temp_dir}/{fn_path.stem}.pdf'
        ret.append(PymuDocDataset(reader.read(pdf_fn)))
    shutil.rmtree(temp_dir)
    return ret


def read_local_images(
    path: str, suffixes: list[str] = ['.png', '.jpg', '.jpeg']
) -> list[ImageDataset]:
    """Read images from a path or directory.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str]): image-file suffixes used to filter the files,
            e.g. ['.jpg', '.png']

    Returns:
        list[ImageDataset]: each image file is converted to an ImageDataset
    """
    if os.path.isdir(path):
        imgs_bits = []
        s_suffixes = set(suffixes)
        reader = FileBasedDataReader()
        for root, _, files in os.walk(path):
            for file in files:
                suffix = Path(file).suffix
                if suffix in s_suffixes:
                    imgs_bits.append(reader.read(os.path.join(root, file)))
        return [ImageDataset(bits) for bits in imgs_bits]
    else:
        reader = FileBasedDataReader()
        bits = reader.read(path)
        return [ImageDataset(bits)]
```
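
A sketch of how these readers were typically driven; the paths are placeholders:

```python
# Hypothetical usage of the removed read API; the paths are placeholders.
from magic_pdf.data.read_api import (read_local_images, read_local_office,
                                     read_local_pdfs)

pdf_datasets = read_local_pdfs('/data/pdfs')              # one PymuDocDataset per pdf found
office_datasets = read_local_office('/data/report.docx')  # converted to pdf via libreoffice first
image_datasets = read_local_images('/data/scans', suffixes=['.png', '.jpg'])
```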
magic_pdf/data/schemas.py (deleted, 100644 → 0)
```python
from pydantic import BaseModel, Field


class S3Config(BaseModel):
    """S3 config."""

    bucket_name: str = Field(description='s3 bucket name', min_length=1)
    access_key: str = Field(description='s3 access key', min_length=1)
    secret_key: str = Field(description='s3 secret key', min_length=1)
    endpoint_url: str = Field(description='s3 endpoint url', min_length=1)
    addressing_style: str = Field(description='s3 addressing style',
                                  default='auto', min_length=1)


class PageInfo(BaseModel):
    """The width and height of a page."""

    w: float = Field(description='the width of the page')
    h: float = Field(description='the height of the page')
```
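
Because these are pydantic models, the `min_length=1` constraints reject empty strings at construction time. A minimal sketch, with placeholder values:

```python
# Hypothetical usage of the removed S3Config model; all values are placeholders.
from pydantic import ValidationError

from magic_pdf.data.schemas import S3Config

cfg = S3Config(bucket_name='my-bucket', access_key='AKIA...',
               secret_key='secret...', endpoint_url='https://s3.example.com')
print(cfg.addressing_style)  # 'auto' (the default)

try:
    S3Config(bucket_name='', access_key='AKIA...',
             secret_key='secret...', endpoint_url='https://s3.example.com')
except ValidationError as e:
    print(e)  # bucket_name violates min_length=1
```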
magic_pdf/data/utils.py (deleted, 100644 → 0)
```python
import multiprocessing as mp
from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
                                as_completed)

import fitz
import numpy as np
from loguru import logger


def fitz_doc_to_image(page, dpi=200) -> dict:
    """Convert a fitz page to an image, then convert the image to a numpy array.

    Args:
        page: pymupdf page
        dpi (int, optional): rendering resolution. Defaults to 200.

    Returns:
        dict: {'img': numpy array, 'width': width, 'height': height}
    """
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pm = page.get_pixmap(matrix=mat, alpha=False)

    # If the width or height exceeds 4500 after scaling, do not scale further.
    if pm.width > 4500 or pm.height > 4500:
        pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    # Convert pixmap samples directly to a numpy array
    img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
    img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
    return img_dict


def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
    images = []
    with fitz.open('pdf', pdf_bytes) as doc:
        pdf_page_num = doc.page_count
        end_page_id = (
            end_page_id
            if end_page_id is not None and end_page_id >= 0
            else pdf_page_num - 1
        )
        if end_page_id > pdf_page_num - 1:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = pdf_page_num - 1

        for index in range(0, doc.page_count):
            if start_page_id <= index <= end_page_id:
                page = doc[index]
                mat = fitz.Matrix(dpi / 72, dpi / 72)
                pm = page.get_pixmap(matrix=mat, alpha=False)

                # If the width or height exceeds 4500 after scaling, do not scale further.
                if pm.width > 4500 or pm.height > 4500:
                    pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

                # Convert pixmap samples directly to a numpy array
                img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
                img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
            else:
                img_dict = {'img': [], 'width': 0, 'height': 0}
            images.append(img_dict)
    return images


def convert_page(bytes_page):
    pdfs = fitz.open('pdf', bytes_page)
    page = pdfs[0]
    return fitz_doc_to_image(page)


def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Process PDF pages in parallel with a serialization-safe approach."""
    if num_workers is None:
        num_workers = mp.cpu_count()

    # Process the extracted page data in parallel
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Process the page data
        results = list(executor.map(convert_page, pages))

    return results


def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order
    """
    # Open the PDF
    doc = fitz.open(pdf_path)
    num_pages = len(doc)

    # Create a list to store results in the correct order
    results = [None] * num_pages

    # Create a thread pool
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        futures = {}
        for page_num in range(num_pages):
            page = doc[page_num]
            future = executor.submit(fitz_doc_to_image, page, **kwargs)
            futures[future] = page_num

        # Collect results as they complete
        for future in as_completed(futures):
            page_num = futures[future]
            try:
                results[page_num] = future.result()
            except Exception as e:
                print(f'Error processing page {page_num}: {e}')
                results[page_num] = None

    # Close the document
    doc.close()

    return results


if __name__ == '__main__':
    pdf = fitz.open('/tmp/[MS-DOC].pdf')
    pdf_page = [fitz.open() for i in range(pdf.page_count)]
    [pdf_page[i].insert_pdf(pdf, from_page=i, to_page=i) for i in range(pdf.page_count)]
    pdf_page = [v.tobytes() for v in pdf_page]
    results = parallel_process_pdf_safe(pdf_page, num_workers=16)
    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)


"""benchmark results of multi-threaded processing (fitz page to image)
total page nums: 578
thread nums, time cost
1,  7.351 sec
2,  6.334 sec
4,  5.968 sec
8,  6.728 sec
16, 8.085 sec
"""

"""benchmark results of multi-process processing (fitz page to image)
total page nums: 578
processor nums, time cost
1,  17.170 sec
2,  10.170 sec
4,   7.841 sec
8,   7.900 sec
16,  7.984 sec
"""
```
magic_pdf/dict2md/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/dict2md/ocr_mkcontent.py (deleted, 100644 → 0)
```python
import re

from loguru import logger

from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.config_reader import get_latex_delimiter_config
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.post_proc.para_split_v3 import ListLineTag


def __is_hyphen_at_line_end(line):
    """Check if a line ends with one or more letters followed by a hyphen.

    Args:
        line (str): The line of text to check.

    Returns:
        bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
    """
    # Use regex to check if the line ends with one or more letters followed by a hyphen
    return bool(re.search(r'[A-Za-z]+-\s*$', line))


def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
    markdown_with_para_and_pagination = []
    page_no = 0
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            markdown_with_para_and_pagination.append({
                'page_no': page_no,
                'md_content': '',
            })
            page_no += 1
            continue
        page_markdown = ocr_mk_markdown_with_para_core_v2(
            paras_of_layout, 'mm', img_buket_path)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown)
        })
        page_no += 1
    return markdown_with_para_and_pagination


def ocr_mk_markdown_with_para_core_v2(
    paras_of_layout,
    mode,
    img_buket_path='',
):
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']
        if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            title_level = get_title_level(para_block)
            para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                # Check whether the image has a footnote
                has_image_footnote = any(
                    block['type'] == BlockType.ImageFootnote
                    for block in para_block['blocks'])
                # If an image footnote exists, append it after the image body
                if has_image_footnote:
                    for block in para_block['blocks']:  # 1st: image_caption
                        if block['type'] == BlockType.ImageCaption:
                            para_text += merge_para_with_text(block) + '\n'
                    for block in para_block['blocks']:  # 2nd: image_body
                        if block['type'] == BlockType.ImageBody:
                            for line in block['lines']:
                                for span in line['spans']:
                                    if span['type'] == ContentType.Image:
                                        if span.get('image_path', ''):
                                            para_text += f"![]({join_path(img_buket_path, span['image_path'])})"
                    for block in para_block['blocks']:  # 3rd: image_footnote
                        if block['type'] == BlockType.ImageFootnote:
                            para_text += '\n' + merge_para_with_text(block)
                else:
                    for block in para_block['blocks']:  # 1st: image_body
                        if block['type'] == BlockType.ImageBody:
                            for line in block['lines']:
                                for span in line['spans']:
                                    if span['type'] == ContentType.Image:
                                        if span.get('image_path', ''):
                                            para_text += f"![]({join_path(img_buket_path, span['image_path'])})"
                    for block in para_block['blocks']:  # 2nd: image_caption
                        if block['type'] == BlockType.ImageCaption:
                            para_text += '\n' + merge_para_with_text(block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: table_caption
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block) + '\n'
                for block in para_block['blocks']:  # 2nd: table_body
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    # if processed by the table model
                                    if span.get('html', ''):
                                        para_text += f"\n{span['html']}\n"
                                    elif span.get('image_path', ''):
                                        para_text += f"![]({join_path(img_buket_path, span['image_path'])})"
                for block in para_block['blocks']:  # 3rd: table_footnote
                    if block['type'] == BlockType.TableFootnote:
                        para_text += '\n' + merge_para_with_text(block) + ' '

        if para_text.strip() == '':
            continue
        else:
            # page_markdown.append(para_text.strip() + ' ')
            page_markdown.append(para_text.strip())

    return page_markdown


def detect_language(text):
    en_pattern = r'[a-zA-Z]+'
    en_matches = re.findall(en_pattern, text)
    en_length = sum(len(match) for match in en_matches)
    if len(text) > 0:
        if en_length / len(text) >= 0.5:
            return 'en'
        else:
            return 'unknown'
    else:
        return 'empty'


def full_to_half(text: str) -> str:
    """Convert full-width characters to half-width characters using code point manipulation.

    Args:
        text: String containing full-width characters

    Returns:
        String with full-width characters converted to half-width
    """
    result = []
    for char in text:
        code = ord(char)
        # Full-width letters and digits (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
        if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
            result.append(chr(code - 0xFEE0))  # shift into the ASCII range
        else:
            result.append(char)
    return ''.join(result)


latex_delimiters_config = get_latex_delimiter_config()

default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'}
}

delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters

display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']


def merge_para_with_text(para_block):
    block_text = ''
    for line in para_block['lines']:
        for span in line['spans']:
            if span['type'] in [ContentType.Text]:
                span['content'] = full_to_half(span['content'])
                block_text += span['content']
    block_lang = detect_lang(block_text)

    para_text = ''
    for i, line in enumerate(para_block['lines']):
        if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
            para_text += '\n'
        for j, span in enumerate(line['spans']):
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = ocr_escape_special_markdown_char(span['content'])
            elif span_type == ContentType.InlineEquation:
                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"

            content = content.strip()
            if content:
                langs = ['zh', 'ja', 'ko']
                # logger.info(f'block_lang: {block_lang}, content: {content}')
                if block_lang in langs:
                    # In Chinese/Japanese/Korean text, line breaks need no space separator,
                    # but a space is still added after an inline equation at the end of a line
                    if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
                        para_text += content
                    else:
                        para_text += f'{content} '
                else:
                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
                        # If the span is the last one in the line and ends with a hyphen,
                        # do not append a space and drop the hyphen
                        if (j == len(line['spans']) - 1
                                and span_type == ContentType.Text
                                and __is_hyphen_at_line_end(content)):
                            para_text += content[:-1]
                        else:
                            # In Western text, contents are separated by spaces
                            para_text += f'{content} '
                    elif span_type == ContentType.InterlineEquation:
                        para_text += content
            else:
                continue
    # Split ligature characters
    # para_text = __replace_ligatures(para_text)

    return para_text


def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
    para_type = para_block['type']
    para_content = {}
    if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
        title_level = get_title_level(para_block)
        if title_level != 0:
            para_content['text_level'] = title_level
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': 'latex',
        }
    elif para_type == BlockType.Image:
        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.Image:
                            if span.get('image_path', ''):
                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'].append(merge_para_with_text(block))
            if block['type'] == BlockType.ImageFootnote:
                para_content['img_footnote'].append(merge_para_with_text(block))
    elif para_type == BlockType.Table:
        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.Table:
                            if span.get('latex', ''):
                                para_content['table_body'] = f"{span['latex']}"
                            elif span.get('html', ''):
                                para_content['table_body'] = f"{span['html']}"
                            if span.get('image_path', ''):
                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'].append(merge_para_with_text(block))
            if block['type'] == BlockType.TableFootnote:
                para_content['table_footnote'].append(merge_para_with_text(block))

    para_content['page_idx'] = page_idx

    if drop_reason is not None:
        para_content['drop_reason'] = drop_reason

    return para_content


def union_make(
    pdf_info_dict: list,
    make_mode: str,
    drop_mode: str,
    img_buket_path: str = '',
):
    output_content = []
    for page_info in pdf_info_dict:
        drop_reason_flag = False
        drop_reason = None
        if page_info.get('need_drop', False):
            drop_reason = page_info.get('drop_reason')
            if drop_mode == DropMode.NONE:
                pass
            elif drop_mode == DropMode.NONE_WITH_REASON:
                drop_reason_flag = True
            elif drop_mode == DropMode.WHOLE_PDF:
                raise Exception((f'drop_mode is {DropMode.WHOLE_PDF},'
                                 f'drop_reason is {drop_reason}'))
            elif drop_mode == DropMode.SINGLE_PAGE:
                logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE},'
                                f'drop_reason is {drop_reason}'))
                continue
            else:
                raise Exception('drop_mode can not be null')

        paras_of_layout = page_info.get('para_blocks')
        page_idx = page_info.get('page_idx')
        if not paras_of_layout:
            continue
        if make_mode == MakeMode.MM_MD:
            page_markdown = ocr_mk_markdown_with_para_core_v2(
                paras_of_layout, 'mm', img_buket_path)
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.NLP_MD:
            page_markdown = ocr_mk_markdown_with_para_core_v2(
                paras_of_layout, 'nlp')
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.STANDARD_FORMAT:
            for para_block in paras_of_layout:
                # both branches are identical here: drop_reason is not forwarded
                if drop_reason_flag:
                    para_content = para_to_standard_format_v2(
                        para_block, img_buket_path, page_idx)
                else:
                    para_content = para_to_standard_format_v2(
                        para_block, img_buket_path, page_idx)
                output_content.append(para_content)
    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
        return '\n\n'.join(output_content)
    elif make_mode == MakeMode.STANDARD_FORMAT:
        return output_content


def get_title_level(block):
    title_level = block.get('level', 1)
    if title_level > 4:
        title_level = 4
    elif title_level < 1:
        title_level = 0
    return title_level
```
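
A sketch of how the two output modes of `union_make` were selected; `pdf_info` here is a placeholder for the middle-json page list that the MinerU pipeline produces:

```python
# Hypothetical driver for the removed union_make; pdf_info is a placeholder.
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.dict2md.ocr_mkcontent import union_make

pdf_info = []  # page dicts with 'para_blocks', 'page_idx', etc. from the pipeline

md_str = union_make(pdf_info, MakeMode.MM_MD, DropMode.NONE, 'images')        # one markdown string
content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, DropMode.NONE)  # list of content dicts
```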
magic_pdf/filter/__init__.py (deleted, 100644 → 0)
```python
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan


def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
    """Decide from the pdf metadata whether this is a text pdf or an ocr pdf."""
    pdf_meta = pdf_meta_scan(pdf_bytes)
    if pdf_meta.get('_need_drop', False):
        # If the scan returned the drop flag, raise an exception
        raise Exception(f"pdf meta_scan need_drop, reason is {pdf_meta['_drop_reason']}")
    else:
        is_encrypted = pdf_meta['is_encrypted']
        is_needs_password = pdf_meta['is_needs_password']
        if is_encrypted or is_needs_password:
            # Do not process pdfs that are encrypted, need a password, or have no pages
            raise Exception(f'pdf meta_scan need_drop, reason is {DropReason.ENCRYPTED}')
        else:
            is_text_pdf, results = do_classify(
                pdf_meta['total_page'],
                pdf_meta['page_width_pts'],
                pdf_meta['page_height_pts'],
                pdf_meta['image_info_per_page'],
                pdf_meta['text_len_per_page'],
                pdf_meta['imgs_per_page'],
                # pdf_meta['text_layout_per_page'],
                pdf_meta['invalid_chars'],
            )
            if is_text_pdf:
                return SupportedPdfParseMethod.TXT
            else:
                return SupportedPdfParseMethod.OCR
```
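
A minimal sketch of routing a pdf through this classifier; the path is a placeholder:

```python
# Hypothetical usage of the removed classifier; the path is a placeholder.
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.filter import classify

with open('/data/sample.pdf', 'rb') as f:
    pdf_bytes = f.read()

method = classify(pdf_bytes)
if method == SupportedPdfParseMethod.TXT:
    print('text pdf: parse the embedded text layer')
else:
    print('scanned pdf: route to OCR')
```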
magic_pdf/filter/pdf_classify_by_type.py (deleted, 100644 → 0)
"""
根据利用meta_scan得到的结果,对pdf是否为文字版进行分类。
定义标准:
一、什么pdf会是文字pdf,只要满足以下任意一条
1. 随机抽取N页,如果有任何一页文字数目大于100
2. 只要存在一个页面,图片的数量为0
二、什么是扫描版pdf,只要满足以下任意一条
1. ~~80%页面上的最大图大小一样并且面积超过页面面积0.6~~
2. 大部分页面上文字的长度都是相等的。
"""
import
json
import
sys
from
collections
import
Counter
import
click
import
numpy
as
np
from
loguru
import
logger
from
magic_pdf.libs.commons
import
mymax
,
get_top_percent_list
from
magic_pdf.filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
TEXT_LEN_THRESHOLD
=
100
AVG_TEXT_LEN_THRESHOLD
=
100
TEXT_LEN_SAMPLE_RATIO
=
0.1
# 抽取0.1的页面进行文字长度统计
# 一个拼接图片的方案,将某些特殊扫描版本的拆图拼成一张整图
def
merge_images
(
image_list
,
page_width
,
page_height
,
max_offset
=
5
,
max_gap
=
2
):
# 先通过set去除所有bbox重叠的图片数据
image_list_result
=
[]
for
page_images
in
image_list
:
page_result
=
[]
dedup
=
set
()
for
img
in
page_images
:
x0
,
y0
,
x1
,
y1
,
img_bojid
=
img
if
(
x0
,
y0
,
x1
,
y1
)
in
dedup
:
# 这里面会出现一些重复的bbox,无需重复出现,需要去掉
continue
else
:
dedup
.
add
((
x0
,
y0
,
x1
,
y1
))
page_result
.
append
([
x0
,
y0
,
x1
,
y1
,
img_bojid
])
image_list_result
.
append
(
page_result
)
# 接下来,将同一页可拼接的图片进行合并
merged_images
=
[]
for
page_images
in
image_list_result
:
if
not
page_images
:
continue
# 先将同一页的图片从上到下,从左到右进行排序
page_images
.
sort
(
key
=
lambda
img
:
(
img
[
1
],
img
[
0
]))
merged
=
[
page_images
[
0
]]
for
img
in
page_images
[
1
:]:
x0
,
y0
,
x1
,
y1
,
imgid
=
img
last_img
=
merged
[
-
1
]
last_x0
,
last_y0
,
last_x1
,
last_y1
,
last_imgid
=
last_img
# 单张图片宽或者高覆盖页面宽高的9成以上是拼图的一个前置条件
full_width
=
abs
(
x1
-
x0
)
>=
page_width
*
0.9
full_height
=
abs
(
y1
-
y0
)
>=
page_height
*
0.9
# 如果宽达标,检测是否能竖着拼
if
full_width
:
# 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
close1
=
(
last_x0
-
max_offset
)
<=
x0
<=
(
last_x0
+
max_offset
)
and
(
last_x1
-
max_offset
)
<=
x1
<=
(
last_x1
+
max_offset
)
and
(
last_y1
-
max_gap
)
<=
y0
<=
(
last_y1
+
max_gap
)
# 如果高达标,检测是否可以横着拼
if
full_height
:
# 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
close2
=
(
last_y0
-
max_offset
)
<=
y0
<=
(
last_y0
+
max_offset
)
and
(
last_y1
-
max_offset
)
<=
y1
<=
(
last_y1
+
max_offset
)
and
(
last_x1
-
max_gap
)
<=
x0
<=
(
last_x1
+
max_gap
)
# Check if the image can be merged with the last image
if
(
full_width
and
close1
)
or
(
full_height
and
close2
):
# Merge the image with the last image
merged
[
-
1
]
=
[
min
(
x0
,
last_x0
),
min
(
y0
,
last_y0
),
max
(
x1
,
last_x1
),
max
(
y1
,
last_y1
),
imgid
]
else
:
# Add the image as a new image
merged
.
append
(
img
)
merged_images
.
append
(
merged
)
return
merged_images
def
classify_by_area
(
total_page
:
int
,
page_width
,
page_height
,
img_sz_list
,
text_len_list
:
list
):
"""
80%页面上的最大图大小一样并且面积超过页面面积0.6则返回False,否则返回True
:param pdf_path:
:param total_page:
:param page_width:
:param page_height:
:param img_sz_list:
:return:
"""
# # 只要有一页没有图片,那么就是文字pdf。但是同时还需要满足一个条件就是这个页面上同时不能有文字。发现过一些扫描版pdf,上面有一些空白页面,既没有图片也没有文字。
# if any([len(img_sz) == 0 for img_sz in img_sz_list]): # 含有不含图片的页面
# # 现在找到这些页面的index
# empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0]
# # 然后检查这些页面上是否有文字
# text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0]
# if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD: # 没有图片,但是有文字,说明可能是个文字版,如果没有文字则无法判断,留给下一步,现在要求这页文字量超过一定阈值
# return True
# 通过objid去掉重复出现10次以上的图片,这些图片是隐藏的透明图层,其特点是id都一样
# 先对每个id出现的次数做个统计
objid_cnt
=
Counter
([
objid
for
page_img_sz
in
img_sz_list
for
_
,
_
,
_
,
_
,
objid
in
page_img_sz
])
# 再去掉出现次数大于10的
if
total_page
>=
scan_max_page
:
# 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
total_page
=
scan_max_page
repeat_threshold
=
2
# 把bad_image的阈值设为2
# repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img
bad_image_objid
=
set
([
objid
for
objid
,
cnt
in
objid_cnt
.
items
()
if
cnt
>=
repeat_threshold
])
# bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])]
# text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in bad_image_page_idx and text_len > 0]
# 特殊情况,一个文字版pdf,每页覆盖一个超大的透明图片,超大的定义是图片占整页面积的90%以上
# fake_image_ids = [objid for objid in bad_image_objid if
# any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for
# x0, y0, x1, y1, _ in images])] # 原来的代码,any里面恒为true了,原因???
# fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images
# if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9]
# if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值
# return True
img_sz_list
=
[[
img_sz
for
img_sz
in
page_img_sz
if
img_sz
[
-
1
]
not
in
bad_image_objid
]
for
page_img_sz
in
img_sz_list
]
# 过滤掉重复出现的图片
# 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算
img_sz_list
=
merge_images
(
img_sz_list
,
page_width
,
page_height
)
# 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例
max_image_area_per_page
=
[
mymax
([(
x1
-
x0
)
*
(
y1
-
y0
)
for
x0
,
y0
,
x1
,
y1
,
_
in
page_img_sz
])
for
page_img_sz
in
img_sz_list
]
page_area
=
page_width
*
page_height
max_image_area_per_page
=
[
area
/
page_area
for
area
in
max_image_area_per_page
]
max_image_area_per_page
=
[
area
for
area
in
max_image_area_per_page
if
area
>
0.5
]
if
len
(
max_image_area_per_page
)
>=
0.5
*
total_page
:
# 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
# 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样
return
False
else
:
return
True
def
classify_by_text_len
(
text_len_list
:
list
,
total_page
:
int
):
"""
随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
查看页面上的文字长度,如果有任何一个页面的文字长度大于TEXT_LEN_THRESHOLD,那么就是文字pdf
:param total_page:
:param text_len_list:
:return:
"""
select_page_cnt
=
int
(
total_page
*
TEXT_LEN_SAMPLE_RATIO
)
# 选取10%的页面
if
select_page_cnt
<
5
:
select_page_cnt
=
total_page
# # 排除头尾各10页
# if total_page > 20: # 如果总页数大于20
# page_range = list(range(10, total_page - 10)) # 从第11页到倒数第11页
# else:
# page_range = list(range(total_page)) # 否则选择所有页面
# page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False)
# 排除前后10页对只有21,22页的pdf很尴尬,如果选出来的中间那一两页恰好没字容易误判,有了avg_words规则,这个规则可以忽略
page_num
=
np
.
random
.
choice
(
total_page
,
select_page_cnt
,
replace
=
False
)
text_len_lst
=
[
text_len_list
[
i
]
for
i
in
page_num
]
is_text_pdf
=
any
([
text_len
>
TEXT_LEN_THRESHOLD
for
text_len
in
text_len_lst
])
return
is_text_pdf
def
classify_by_avg_words
(
text_len_list
:
list
):
"""
补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
主要是各种图集
:param text_len_list:
:return:
"""
sum_words
=
sum
(
text_len_list
)
count_of_numbers
=
len
(
text_len_list
)
if
count_of_numbers
==
0
:
is_text_pdf
=
False
else
:
avg_words
=
round
(
sum_words
/
count_of_numbers
)
if
avg_words
>
AVG_TEXT_LEN_THRESHOLD
:
is_text_pdf
=
True
else
:
is_text_pdf
=
False
return
is_text_pdf
def
classify_by_img_num
(
img_sz_list
:
list
,
img_num_list
:
list
):
"""
补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
这种pdf的 metasca 扫描结果的特点是 img_sz_list 内全是空元素,img_num_list中每一页的数量都很大且相同
:param img_sz_list:
:param img_num_list:
:return:
"""
# 计算img_sz_list中非空元素的个数
count_img_sz_list_not_none
=
sum
(
1
for
item
in
img_sz_list
if
item
)
# 获取前80%的元素
top_eighty_percent
=
get_top_percent_list
(
img_num_list
,
0.8
)
# img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min
if
count_img_sz_list_not_none
<=
1
and
len
(
set
(
top_eighty_percent
))
==
1
and
max
(
img_num_list
)
>=
junk_limit_min
:
#拿max和min的值,用来判断list内的值是否全都相等
# min_imgs = min(img_num_list)
# max_imgs = max(img_num_list)
#
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
return
False
# 如果满足这个条件,一定不是文字版pdf
else
:
return
True
# 不满足这三个条件,可能是文字版pdf,通过其他规则判断
def
classify_by_text_layout
(
text_layout_per_page
:
list
):
"""
判断文本布局是否以竖排为主。
Args:
text_layout_per_page (list): 文本布局列表,列表中的每个元素表示一页的文本布局,
值为'vertical'表示竖排,值为'horizontal'表示横排。
Returns:
bool: 若文本布局以竖排为主,则返回False;否则返回True。
"""
# 统计text_layout_per_page中竖排的个数
count_vertical
=
sum
(
1
for
item
in
text_layout_per_page
if
item
==
'vertical'
)
# 统计text_layout_per_page中横排的个数
count_horizontal
=
sum
(
1
for
item
in
text_layout_per_page
if
item
==
'horizontal'
)
# 计算text_layout_per_page中竖排的占比
known_layout_cnt
=
count_vertical
+
count_horizontal
if
known_layout_cnt
!=
0
:
ratio
=
count_vertical
/
known_layout_cnt
if
ratio
>=
0.5
:
# 阈值设为0.5,适配3页里面有2页和两页里有一页的情况
return
False
# 文本布局以竖排为主,认为不是文字版pdf
else
:
return
True
# 文本布局以横排为主,认为是文字版pdf
else
:
return
False
# 文本布局未知,默认认为不是文字版pdf
def
classify_by_img_narrow_strips
(
page_width
,
page_height
,
img_sz_list
):
"""
判断一页是否由细长条组成,有两个条件:
1. 图片的宽或高达到页面宽或高的90%,且长边需要是窄边长度的数倍以上
2. 整个页面所有的图片有80%以上满足条件1
Args:
page_width (float): 页面宽度
page_height (float): 页面高度
img_sz_list (list): 图片尺寸列表,每个元素为一个元组,表示图片的矩形区域和尺寸,形如(x0, y0, x1, y1, size),其中(x0, y0)为矩形区域的左上角坐标,(x1, y1)为矩形区域的右下角坐标,size为图片的尺寸
Returns:
bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
"""
def
is_narrow_strip
(
img
):
x0
,
y0
,
x1
,
y1
,
_
=
img
width
,
height
=
x1
-
x0
,
y1
-
y0
return
any
([
# 图片宽度大于等于页面宽度的90%,且宽度大于等于高度4倍
width
>=
page_width
*
0.9
and
width
>=
height
*
4
,
# 图片高度大于等于页面高度的90%,且高度大于等于宽度4倍
height
>=
page_height
*
0.9
and
height
>=
width
*
4
,
])
# 初始化满足条件的页面数量
narrow_strip_pages_count
=
0
# 遍历所有页面
for
page_img_list
in
img_sz_list
:
# 忽略空页面
if
not
page_img_list
:
continue
# 计算页面中的图片总数
total_images
=
len
(
page_img_list
)
# 计算页面中细长条图片的数量
narrow_strip_images_count
=
0
for
img
in
page_img_list
:
if
is_narrow_strip
(
img
):
narrow_strip_images_count
+=
1
# 如果细长条图片的数量少于5,跳过
if
narrow_strip_images_count
<
5
:
continue
else
:
# 如果细长条图片的比例大于或等于0.8,增加满足条件的页面数量
if
narrow_strip_images_count
/
total_images
>=
0.8
:
narrow_strip_pages_count
+=
1
# 计算满足条件的页面的比例
narrow_strip_pages_ratio
=
narrow_strip_pages_count
/
len
(
img_sz_list
)
return
narrow_strip_pages_ratio
<
0.5
def
classify
(
total_page
:
int
,
page_width
,
page_height
,
img_sz_list
:
list
,
text_len_list
:
list
,
img_num_list
:
list
,
# text_layout_list: list,
invalid_chars
:
bool
):
"""
这里的图片和页面长度单位是pts
:param total_page:
:param text_len_list:
:param page_width:
:param page_height:
:param img_sz_list:
:param pdf_path:
:return:
"""
results
=
{
'by_image_area'
:
classify_by_area
(
total_page
,
page_width
,
page_height
,
img_sz_list
,
text_len_list
),
'by_text_len'
:
classify_by_text_len
(
text_len_list
,
total_page
),
'by_avg_words'
:
classify_by_avg_words
(
text_len_list
),
'by_img_num'
:
classify_by_img_num
(
img_sz_list
,
img_num_list
),
# 'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips'
:
classify_by_img_narrow_strips
(
page_width
,
page_height
,
img_sz_list
),
'by_invalid_chars'
:
invalid_chars
,
}
if
all
(
results
.
values
()):
return
True
,
results
elif
not
any
(
results
.
values
()):
return
False
,
results
else
:
logger
.
warning
(
f
"OCR needed based on classification result, by_image_area:
{
results
[
'by_image_area'
]
}
,"
f
" by_text:
{
results
[
'by_text_len'
]
}
, by_avg_words:
{
results
[
'by_avg_words'
]
}
, by_img_num:
{
results
[
'by_img_num'
]
}
,"
# f" by_text_layout: {results['by_text_layout']},"
f
" by_img_narrow_strips:
{
results
[
'by_img_narrow_strips'
]
}
,"
f
" by_invalid_chars:
{
results
[
'by_invalid_chars'
]
}
"
,
file
=
sys
.
stderr
)
# 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return
False
,
results
@
click
.
command
()
@
click
.
option
(
"--json-file"
,
type
=
str
,
help
=
"pdf信息"
)
def
main
(
json_file
):
if
json_file
is
None
:
print
(
"json_file is None"
,
file
=
sys
.
stderr
)
exit
(
0
)
try
:
with
open
(
json_file
,
"r"
)
as
f
:
for
l
in
f
:
if
l
.
strip
()
==
""
:
continue
o
=
json
.
loads
(
l
)
total_page
=
o
[
"total_page"
]
page_width
=
o
[
"page_width_pts"
]
page_height
=
o
[
"page_height_pts"
]
img_sz_list
=
o
[
"image_info_per_page"
]
text_len_list
=
o
[
'text_len_per_page'
]
text_layout_list
=
o
[
'text_layout_per_page'
]
pdf_path
=
o
[
'pdf_path'
]
is_encrypted
=
o
[
'is_encrypted'
]
is_needs_password
=
o
[
'is_needs_password'
]
if
is_encrypted
or
total_page
==
0
or
is_needs_password
:
# 加密的,需要密码的,没有页面的,都不处理
continue
tag
=
classify
(
total_page
,
page_width
,
page_height
,
img_sz_list
,
text_len_list
,
text_layout_list
)
o
[
'is_text_pdf'
]
=
tag
print
(
json
.
dumps
(
o
,
ensure_ascii
=
False
))
except
Exception
as
e
:
print
(
"ERROR: "
,
e
,
file
=
sys
.
stderr
)
if
__name__
==
"__main__"
:
main
()
# false = False
# true = True
# null = None
# o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]]
,[[0,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368
,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1
386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,513,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],
# ... sample scan output truncated: "image_info_per_page" continues with one [[0, 0, 368, 513, <xref>]] entry per page (xref increasing by 3 up to 2796), and "text_len_per_page" is 53-54 for every page ...
# "metadata": {"format": "PDF 1.6", "title": "", "author": "", "subject": "", "keywords": "", "creator": "Adobe Acrobat 7.0", "producer": "Adobe Acrobat 7.0 Image Conversion Plug-in", "creationDate": "D:20080404141457+01'00'", "modDate": "D:20080404144821+01'00'", "trapped": "", "encryption": null}}
# o = json.loads(json.dumps(o))
# total_page = o["total_page"]
# page_width = o["page_width_pts"]
# page_height = o["page_height_pts"]
# img_sz_list = o["image_info_per_page"]
# text_len_list = o['text_len_per_page']
# pdf_path = o['pdf_path']
# is_encrypted = o['is_encrypted']
# is_needs_password = o['is_needs_password']
# if is_encrypted or total_page == 0 or is_needs_password:  # skip encrypted, password-protected and zero-page pdfs
#     print("encrypted")
#     exit(0)
# tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list)
# o['is_text_pdf'] = tag
# print(json.dumps(o, ensure_ascii=False))
magic_pdf/filter/pdf_meta_scan.py
deleted
100644 → 0
View file @
f5016508
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
from
collections
import
Counter
import
fitz
from
loguru
import
logger
from
magic_pdf.config.drop_reason
import
DropReason
from
magic_pdf.libs.commons
import
get_top_percent_list
,
mymax
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.pdf_check
import
detect_invalid_chars_by_pymupdf
,
detect_invalid_chars
scan_max_page
=
50
junk_limit_min
=
10
def
calculate_max_image_area_per_page
(
result
:
list
,
page_width_pts
,
page_height_pts
):
max_image_area_per_page
=
[
mymax
([(
x1
-
x0
)
*
(
y1
-
y0
)
for
x0
,
y0
,
x1
,
y1
,
_
in
page_img_sz
])
for
page_img_sz
in
result
]
page_area
=
int
(
page_width_pts
)
*
int
(
page_height_pts
)
max_image_area_per_page
=
[
area
/
page_area
for
area
in
max_image_area_per_page
]
max_image_area_per_page
=
[
area
for
area
in
max_image_area_per_page
if
area
>
0.6
]
return
max_image_area_per_page
def
process_image
(
page
,
junk_img_bojids
=
[]):
page_result
=
[]
# 存每个页面里的多张图四元组信息
items
=
page
.
get_images
()
dedup
=
set
()
for
img
in
items
:
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
img_bojid
=
img
[
0
]
# 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
if
img_bojid
in
junk_img_bojids
:
# 如果是垃圾图像,就跳过
continue
recs
=
page
.
get_image_rects
(
img
,
transform
=
True
)
if
recs
:
rec
=
recs
[
0
][
0
]
x0
,
y0
,
x1
,
y1
=
map
(
int
,
rec
)
width
=
x1
-
x0
height
=
y1
-
y0
if
(
x0
,
y0
,
x1
,
y1
,
img_bojid
,
)
in
dedup
:
# 这里面会出现一些重复的bbox,无需重复出现,需要去掉
continue
if
not
all
(
[
width
,
height
]
):
# 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义
continue
dedup
.
add
((
x0
,
y0
,
x1
,
y1
,
img_bojid
))
page_result
.
append
([
x0
,
y0
,
x1
,
y1
,
img_bojid
])
return
page_result
def
get_image_info
(
doc
:
fitz
.
Document
,
page_width_pts
,
page_height_pts
)
->
list
:
"""返回每个页面里的图片的四元组,每个页面多个图片。
:param doc:
:return:
"""
# 使用 Counter 计数 img_bojid 的出现次数
img_bojid_counter
=
Counter
(
img
[
0
]
for
page
in
doc
for
img
in
page
.
get_images
())
# 找出出现次数超过 len(doc) 半数的 img_bojid
junk_limit
=
max
(
len
(
doc
)
*
0.5
,
junk_limit_min
)
# 对一些页数比较少的进行豁免
junk_img_bojids
=
[
img_bojid
for
img_bojid
,
count
in
img_bojid_counter
.
items
()
if
count
>=
junk_limit
]
# todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多
# 有两种扫描版,一种文字版,这里可能会有误判
# 扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张
# 扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断
# 文 字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist
imgs_len_list
=
[
len
(
page
.
get_images
())
for
page
in
doc
]
special_limit_pages
=
10
# 统一用前十页结果做判断
result
=
[]
break_loop
=
False
for
i
,
page
in
enumerate
(
doc
):
if
break_loop
:
break
if
i
>=
special_limit_pages
:
break
page_result
=
process_image
(
page
)
# 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析
result
.
append
(
page_result
)
for
item
in
result
:
if
not
any
(
item
):
# 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
if
(
max
(
imgs_len_list
)
==
min
(
imgs_len_list
)
and
max
(
imgs_len_list
)
>=
junk_limit_min
):
# 如果是特殊文字版,就把junklist置空并break
junk_img_bojids
=
[]
else
:
# 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
pass
break_loop
=
True
break
if
not
break_loop
:
# 获取前80%的元素
top_eighty_percent
=
get_top_percent_list
(
imgs_len_list
,
0.8
)
# 检查前80%的元素是否都相等
if
len
(
set
(
top_eighty_percent
))
==
1
and
max
(
imgs_len_list
)
>=
junk_limit_min
:
# # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
# 前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
max_image_area_per_page
=
calculate_max_image_area_per_page
(
result
,
page_width_pts
,
page_height_pts
)
if
(
len
(
max_image_area_per_page
)
<
0.8
*
special_limit_pages
):
# 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
junk_img_bojids
=
[]
else
:
# 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
pass
else
:
# 每页图片数量不一致,需要清掉junklist全量跑前50页图片
junk_img_bojids
=
[]
# 正式进入取前50页图片的信息流程
result
=
[]
for
i
,
page
in
enumerate
(
doc
):
if
i
>=
scan_max_page
:
break
page_result
=
process_image
(
page
,
junk_img_bojids
)
# logger.info(f"page {i} img_len: {len(page_result)}")
result
.
append
(
page_result
)
return
result
,
junk_img_bojids
def
get_pdf_page_size_pts
(
doc
:
fitz
.
Document
):
page_cnt
=
len
(
doc
)
l
:
int
=
min
(
page_cnt
,
50
)
# 把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了)
page_width_list
=
[]
page_height_list
=
[]
for
i
in
range
(
l
):
page
=
doc
[
i
]
page_rect
=
page
.
rect
page_width_list
.
append
(
page_rect
.
width
)
page_height_list
.
append
(
page_rect
.
height
)
page_width_list
.
sort
()
page_height_list
.
sort
()
median_width
=
page_width_list
[
len
(
page_width_list
)
//
2
]
median_height
=
page_height_list
[
len
(
page_height_list
)
//
2
]
return
median_width
,
median_height
def
get_pdf_textlen_per_page
(
doc
:
fitz
.
Document
):
text_len_lst
=
[]
for
page
in
doc
:
# 拿包含img和text的所有blocks
# text_block = page.get_text("blocks")
# 拿所有text的blocks
# text_block = page.get_text("words")
# text_block_len = sum([len(t[4]) for t in text_block])
# 拿所有text的str
text_block
=
page
.
get_text
(
'text'
)
text_block_len
=
len
(
text_block
)
# logger.info(f"page {page.number} text_block_len: {text_block_len}")
text_len_lst
.
append
(
text_block_len
)
return
text_len_lst
def
get_pdf_text_layout_per_page
(
doc
:
fitz
.
Document
):
"""根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
List[str]: 每一页的文本布局(横向、纵向、未知)。
"""
text_layout_list
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
if
page_id
>=
scan_max_page
:
break
# 创建每一页的纵向和横向的文本行数计数器
vertical_count
=
0
horizontal_count
=
0
text_dict
=
page
.
get_text
(
'dict'
)
if
'blocks'
in
text_dict
:
for
block
in
text_dict
[
'blocks'
]:
if
'lines'
in
block
:
for
line
in
block
[
'lines'
]:
# 获取line的bbox顶点坐标
x0
,
y0
,
x1
,
y1
=
line
[
'bbox'
]
# 计算bbox的宽高
width
=
x1
-
x0
height
=
y1
-
y0
# 计算bbox的面积
area
=
width
*
height
font_sizes
=
[]
for
span
in
line
[
'spans'
]:
if
'size'
in
span
:
font_sizes
.
append
(
span
[
'size'
])
if
len
(
font_sizes
)
>
0
:
average_font_size
=
sum
(
font_sizes
)
/
len
(
font_sizes
)
else
:
average_font_size
=
(
10
# 有的line拿不到font_size,先定一个阈值100
)
if
(
area
<=
average_font_size
**
2
):
# 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
continue
else
:
if
'wmode'
in
line
:
# 通过wmode判断文本方向
if
line
[
'wmode'
]
==
1
:
# 判断是否为竖向文本
vertical_count
+=
1
elif
line
[
'wmode'
]
==
0
:
# 判断是否为横向文本
horizontal_count
+=
1
# if 'dir' in line: # 通过旋转角度计算判断文本方向
# # 获取行的 "dir" 值
# dir_value = line['dir']
# cosine, sine = dir_value
# # 计算角度
# angle = math.degrees(math.acos(cosine))
#
# # 判断是否为横向文本
# if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01:
# # line_text = ' '.join(span['text'] for span in line['spans'])
# # print('This line is horizontal:', line_text)
# horizontal_count += 1
# # 判断是否为纵向文本
# elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01:
# # line_text = ' '.join(span['text'] for span in line['spans'])
# # print('This line is vertical:', line_text)
# vertical_count += 1
# print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
# 判断每一页的文本布局
if
vertical_count
==
0
and
horizontal_count
==
0
:
# 该页没有文本,无法判断
text_layout_list
.
append
(
'unknow'
)
continue
else
:
if
vertical_count
>
horizontal_count
:
# 该页的文本纵向行数大于横向的
text_layout_list
.
append
(
'vertical'
)
else
:
# 该页的文本横向行数大于纵向的
text_layout_list
.
append
(
'horizontal'
)
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
return
text_layout_list
"""定义一个自定义异常用来抛出单页svg太多的pdf"""
class
PageSvgsTooManyError
(
Exception
):
def
__init__
(
self
,
message
=
'Page SVGs are too many'
):
self
.
message
=
message
super
().
__init__
(
self
.
message
)
def
get_svgs_per_page
(
doc
:
fitz
.
Document
):
svgs_len_list
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
# svgs = page.get_drawings()
svgs
=
page
.
get_cdrawings
()
# 切换成get_cdrawings,效率更高
len_svgs
=
len
(
svgs
)
if
len_svgs
>=
3000
:
raise
PageSvgsTooManyError
()
else
:
svgs_len_list
.
append
(
len_svgs
)
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
return
svgs_len_list
def
get_imgs_per_page
(
doc
:
fitz
.
Document
):
imgs_len_list
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
imgs
=
page
.
get_images
()
imgs_len_list
.
append
(
len
(
imgs
))
# logger.info(f"page_id: {page}, imgs_len: {len(imgs)}")
return
imgs_len_list
def
get_language
(
doc
:
fitz
.
Document
):
"""
获取PDF文档的语言。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
str: 文档语言,如 "en-US"。
"""
language_lst
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
if
page_id
>=
scan_max_page
:
break
# 拿所有text的str
text_block
=
page
.
get_text
(
'text'
)
page_language
=
detect_lang
(
text_block
)
language_lst
.
append
(
page_language
)
# logger.info(f"page_id: {page_id}, page_language: {page_language}")
# 统计text_language_list中每种语言的个数
count_dict
=
Counter
(
language_lst
)
# 输出text_language_list中出现的次数最多的语言
language
=
max
(
count_dict
,
key
=
count_dict
.
get
)
return
language
def
check_invalid_chars
(
pdf_bytes
):
"""乱码检测."""
# return detect_invalid_chars_by_pymupdf(pdf_bytes)
return
detect_invalid_chars
(
pdf_bytes
)
def
pdf_meta_scan
(
pdf_bytes
:
bytes
):
"""
:param s3_pdf_path:
:param pdf_bytes: pdf文件的二进制数据
几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取
"""
doc
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
is_needs_password
=
doc
.
needs_pass
is_encrypted
=
doc
.
is_encrypted
total_page
=
len
(
doc
)
if
total_page
==
0
:
logger
.
warning
(
f
'drop this pdf, drop_reason:
{
DropReason
.
EMPTY_PDF
}
'
)
result
=
{
'_need_drop'
:
True
,
'_drop_reason'
:
DropReason
.
EMPTY_PDF
}
return
result
else
:
page_width_pts
,
page_height_pts
=
get_pdf_page_size_pts
(
doc
)
# logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
# svgs_per_page = get_svgs_per_page(doc)
# logger.info(f"svgs_per_page: {svgs_per_page}")
imgs_per_page
=
get_imgs_per_page
(
doc
)
# logger.info(f"imgs_per_page: {imgs_per_page}")
image_info_per_page
,
junk_img_bojids
=
get_image_info
(
doc
,
page_width_pts
,
page_height_pts
)
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page
=
get_pdf_textlen_per_page
(
doc
)
# logger.info(f"text_len_per_page: {text_len_per_page}")
# text_layout_per_page = get_pdf_text_layout_per_page(doc)
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
# text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")
invalid_chars
=
check_invalid_chars
(
pdf_bytes
)
# logger.info(f"invalid_chars: {invalid_chars}")
# 最后输出一条json
res
=
{
'is_needs_password'
:
is_needs_password
,
'is_encrypted'
:
is_encrypted
,
'total_page'
:
total_page
,
'page_width_pts'
:
int
(
page_width_pts
),
'page_height_pts'
:
int
(
page_height_pts
),
'image_info_per_page'
:
image_info_per_page
,
'text_len_per_page'
:
text_len_per_page
,
# 'text_layout_per_page': text_layout_per_page,
# 'text_language': text_language,
# "svgs_per_page": svgs_per_page,
'imgs_per_page'
:
imgs_per_page
,
# 增加每页img数量list
'junk_img_bojids'
:
junk_img_bojids
,
# 增加垃圾图片的bojid list
'invalid_chars'
:
invalid_chars
,
'metadata'
:
doc
.
metadata
,
}
# logger.info(json.dumps(res, ensure_ascii=False))
return
res
if
__name__
==
'__main__'
:
pass
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
# "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
# file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") # noqa: E501
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
# doc = fitz.open("pdf", file_content)
# text_layout_lst = get_pdf_text_layout_per_page(doc)
# print(text_layout_lst)
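For orientation, a minimal sketch of driving the deleted scanner over a local file. The 'sample.pdf' path is hypothetical, and the import reflects the module layout this commit removes; only pdf_meta_scan and its result keys above are assumed.

# A minimal usage sketch (assumption: a local file named sample.pdf exists).
import json

from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan

with open('sample.pdf', 'rb') as f:
    pdf_bytes = f.read()

meta = pdf_meta_scan(pdf_bytes)
if meta.get('_need_drop'):
    # empty pdfs come back as {'_need_drop': True, '_drop_reason': ...}
    print('dropped:', meta['_drop_reason'])
else:
    # a few scalar fields, plus the first few per-page text lengths
    print(meta['total_page'], meta['page_width_pts'], meta['page_height_pts'])
    print(json.dumps(meta['text_len_per_page'][:5]))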
magic_pdf/integrations/__init__.py
deleted
100644 → 0
View file @
f5016508
magic_pdf/integrations/rag/__init__.py
deleted
100644 → 0
View file @
f5016508
magic_pdf/integrations/rag/api.py
deleted
100644 → 0
View file @
f5016508
import os
from pathlib import Path

from loguru import logger

from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
                                             Node)
from magic_pdf.integrations.rag.utils import inference


class RagPageReader:

    def __init__(self, pagedata: LayoutElements):
        self.o = [
            Node(
                category_type=v.category_type,
                text=v.text,
                image_path=v.image_path,
                anno_id=v.anno_id,
                latex=v.latex,
                html=v.html,
            ) for v in pagedata.layout_dets
        ]
        self.pagedata = pagedata

    def __iter__(self):
        return iter(self.o)

    def get_rel_map(self) -> list[ElementRelation]:
        return self.pagedata.extra.element_relation


class RagDocumentReader:

    def __init__(self, ragdata: list[LayoutElements]):
        self.o = [RagPageReader(v) for v in ragdata]

    def __iter__(self):
        return iter(self.o)


class DataReader:

    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        self.path_or_directory = path_or_directory
        self.method = method
        self.output_dir = output_dir
        self.pdfs = []
        if os.path.isdir(path_or_directory):
            for doc_path in Path(path_or_directory).glob('*.pdf'):
                self.pdfs.append(doc_path)
        else:
            assert path_or_directory.endswith('.pdf')
            self.pdfs.append(Path(path_or_directory))

    def get_documents_count(self) -> int:
        """Returns the number of documents in the directory."""
        return len(self.pdfs)

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """
        Args:
            idx (int): the index of documents under the
                directory path_or_directory

        Returns:
            RagDocumentReader | None: RagDocumentReader is an iterable object,
                more details @RagDocumentReader
        """
        if idx >= self.get_documents_count() or idx < 0:
            logger.error(f'invalid idx: {idx}')
            return None
        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
        if res is None:
            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
            return None
        return RagDocumentReader(res)

    def get_document_filename(self, idx: int) -> Path:
        """get the filename of the document."""
        return self.pdfs[idx]
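A sketch of walking this reader API end to end; '/path/to/pdfs' and '/tmp/output' are placeholder locations, and only the classes defined above are assumed.

from magic_pdf.integrations.rag.api import DataReader

data_reader = DataReader('/path/to/pdfs', 'ocr', '/tmp/output')
for idx in range(data_reader.get_documents_count()):
    doc = data_reader.get_document_result(idx)
    if doc is None:  # inference failed for this pdf
        continue
    for page in doc:            # one RagPageReader per page
        for node in page:       # one Node per layout element
            print(node.category_type, (node.text or '')[:40])
        print(page.get_rel_map())  # sibling relations between bodies and captions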
magic_pdf/integrations/rag/type.py
deleted
100644 → 0
View file @
f5016508
from enum import Enum

from pydantic import BaseModel, Field


# rag
class CategoryType(Enum):  # py310 not support StrEnum
    text = 'text'
    title = 'title'
    interline_equation = 'interline_equation'
    image = 'image'
    image_body = 'image_body'
    image_caption = 'image_caption'
    table = 'table'
    table_body = 'table_body'
    table_caption = 'table_caption'
    table_footnote = 'table_footnote'


class ElementRelType(Enum):
    sibling = 'sibling'


class PageInfo(BaseModel):
    page_no: int = Field(description='the index of page, start from zero', ge=0)
    height: int = Field(description='the height of page', gt=0)
    width: int = Field(description='the width of page', ge=0)
    image_path: str | None = Field(description='the image of this page', default=None)


class ContentObject(BaseModel):
    category_type: CategoryType = Field(description='category')
    poly: list[float] = Field(
        description=('Coordinates, need to convert back to PDF coordinates,'
                     ' order is top-left, top-right, bottom-right, bottom-left'
                     ' x,y coordinates'))
    ignore: bool = Field(description='whether ignore this object', default=False)
    text: str | None = Field(description='text content of the object', default=None)
    image_path: str | None = Field(description='path of embedded image', default=None)
    order: int = Field(description='the order of this object within a page', default=-1)
    anno_id: int = Field(description='unique id', default=-1)
    latex: str | None = Field(description='latex result', default=None)
    html: str | None = Field(description='html result', default=None)


class ElementRelation(BaseModel):
    source_anno_id: int = Field(description='unique id of the source object', default=-1)
    target_anno_id: int = Field(description='unique id of the target object', default=-1)
    relation: ElementRelType = Field(description='the relation between source and target element')


class LayoutElementsExtra(BaseModel):
    element_relation: list[ElementRelation] = Field(description='the relation between source and target element')


class LayoutElements(BaseModel):
    layout_dets: list[ContentObject] = Field(description='layout element details')
    page_info: PageInfo = Field(description='page info')
    extra: LayoutElementsExtra = Field(description='extra information')


# iter data format
class Node(BaseModel):
    category_type: CategoryType = Field(description='category')
    text: str | None = Field(description='text content of the object', default=None)
    image_path: str | None = Field(description='path of embedded image', default=None)
    anno_id: int = Field(description='unique id', default=-1)
    latex: str | None = Field(description='latex result', default=None)
    html: str | None = Field(description='html result', default=None)
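As a quick sanity check, a sketch constructing the smallest valid LayoutElements from the models above; the coordinates and text are made-up values in PDF points, not taken from any real document.

from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
                                             LayoutElements, LayoutElementsExtra,
                                             PageInfo)

page = LayoutElements(
    layout_dets=[
        ContentObject(
            category_type=CategoryType.title,
            text='1 Introduction',
            # top-left, top-right, bottom-right, bottom-left corners
            poly=[36.0, 50.0, 300.0, 50.0, 300.0, 70.0, 36.0, 70.0],
            anno_id=0,
            order=0,
        )
    ],
    page_info=PageInfo(page_no=0, height=842, width=595),
    extra=LayoutElementsExtra(element_relation=[]),
)
print(page.page_info.page_no, page.layout_dets[0].category_type)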
magic_pdf/integrations/rag/utils.py
deleted
100644 → 0
View file @
f5016508
import json
import os
from pathlib import Path

from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
                                             ElementRelation, ElementRelType,
                                             LayoutElements,
                                             LayoutElementsExtra, PageInfo)
from magic_pdf.tools.common import do_parse, prepare_env


def convert_middle_json_to_layout_elements(
    json_data: dict,
    output_dir: str,
) -> list[LayoutElements]:
    uniq_anno_id = 0

    res: list[LayoutElements] = []
    for page_no, page_data in enumerate(json_data['pdf_info']):
        order_id = 0
        page_info = PageInfo(
            height=int(page_data['page_size'][1]),
            width=int(page_data['page_size'][0]),
            page_no=page_no,
        )
        layout_dets: list[ContentObject] = []
        extra_element_relation: list[ElementRelation] = []

        for para_block in page_data['para_blocks']:
            para_text = ''
            para_type = para_block['type']

            if para_type == BlockType.Text:
                para_text = merge_para_with_text(para_block)
                x0, y0, x1, y1 = para_block['bbox']
                content = ContentObject(
                    anno_id=uniq_anno_id,
                    category_type=CategoryType.text,
                    text=para_text,
                    order=order_id,
                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                )
                uniq_anno_id += 1
                order_id += 1
                layout_dets.append(content)

            elif para_type == BlockType.Title:
                para_text = merge_para_with_text(para_block)
                x0, y0, x1, y1 = para_block['bbox']
                content = ContentObject(
                    anno_id=uniq_anno_id,
                    category_type=CategoryType.title,
                    text=para_text,
                    order=order_id,
                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                )
                uniq_anno_id += 1
                order_id += 1
                layout_dets.append(content)

            elif para_type == BlockType.InterlineEquation:
                para_text = merge_para_with_text(para_block)
                x0, y0, x1, y1 = para_block['bbox']
                content = ContentObject(
                    anno_id=uniq_anno_id,
                    category_type=CategoryType.interline_equation,
                    text=para_text,
                    order=order_id,
                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                )
                uniq_anno_id += 1
                order_id += 1
                layout_dets.append(content)

            elif para_type == BlockType.Image:
                body_anno_id = -1
                caption_anno_id = -1

                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Image:
                                    x0, y0, x1, y1 = block['bbox']
                                    content = ContentObject(
                                        anno_id=uniq_anno_id,
                                        category_type=CategoryType.image_body,
                                        image_path=os.path.join(output_dir, span['image_path']),
                                        order=order_id,
                                        poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                                    )
                                    body_anno_id = uniq_anno_id
                                    uniq_anno_id += 1
                                    order_id += 1
                                    layout_dets.append(content)

                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
                        x0, y0, x1, y1 = block['bbox']
                        content = ContentObject(
                            anno_id=uniq_anno_id,
                            category_type=CategoryType.image_caption,
                            text=para_text,
                            order=order_id,
                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                        )
                        caption_anno_id = uniq_anno_id
                        uniq_anno_id += 1
                        order_id += 1
                        layout_dets.append(content)

                if body_anno_id > 0 and caption_anno_id > 0:
                    element_relation = ElementRelation(
                        relation=ElementRelType.sibling,
                        source_anno_id=body_anno_id,
                        target_anno_id=caption_anno_id,
                    )
                    extra_element_relation.append(element_relation)

            elif para_type == BlockType.Table:
                body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1

                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block)
                        x0, y0, x1, y1 = block['bbox']
                        content = ContentObject(
                            anno_id=uniq_anno_id,
                            category_type=CategoryType.table_caption,
                            text=para_text,
                            order=order_id,
                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                        )
                        caption_anno_id = uniq_anno_id
                        uniq_anno_id += 1
                        order_id += 1
                        layout_dets.append(content)

                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    x0, y0, x1, y1 = para_block['bbox']
                                    content = ContentObject(
                                        anno_id=uniq_anno_id,
                                        category_type=CategoryType.table_body,
                                        order=order_id,
                                        poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                                    )
                                    body_anno_id = uniq_anno_id
                                    uniq_anno_id += 1
                                    order_id += 1
                                    # if processed by table model
                                    if span.get('latex', ''):
                                        content.latex = span['latex']
                                    else:
                                        content.image_path = os.path.join(output_dir, span['image_path'])
                                    layout_dets.append(content)

                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableFootnote:
                        para_text += merge_para_with_text(block)
                        x0, y0, x1, y1 = block['bbox']
                        content = ContentObject(
                            anno_id=uniq_anno_id,
                            category_type=CategoryType.table_footnote,
                            text=para_text,
                            order=order_id,
                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                        )
                        footnote_anno_id = uniq_anno_id
                        uniq_anno_id += 1
                        order_id += 1
                        layout_dets.append(content)

                if caption_anno_id != -1 and body_anno_id != -1:
                    element_relation = ElementRelation(
                        relation=ElementRelType.sibling,
                        source_anno_id=body_anno_id,
                        target_anno_id=caption_anno_id,
                    )
                    extra_element_relation.append(element_relation)

                if footnote_anno_id != -1 and body_anno_id != -1:
                    element_relation = ElementRelation(
                        relation=ElementRelType.sibling,
                        source_anno_id=body_anno_id,
                        target_anno_id=footnote_anno_id,
                    )
                    extra_element_relation.append(element_relation)

        res.append(
            LayoutElements(
                page_info=page_info,
                layout_dets=layout_dets,
                extra=LayoutElementsExtra(element_relation=extra_element_relation),
            ))

    return res


def inference(path, output_dir, method):
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    if output_dir == '':
        if os.path.isdir(path):
            output_dir = os.path.join(path, 'output')
        else:
            output_dir = os.path.join(os.path.dirname(path), 'output')

    local_image_dir, local_md_dir = prepare_env(output_dir, str(Path(path).stem), method)

    def read_fn(path):
        disk_rw = FileBasedDataReader(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path))

    def parse_doc(doc_path: str):
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                False,
                f_draw_span_bbox=False,
                f_draw_layout_bbox=False,
                f_dump_md=False,
                f_dump_middle_json=True,
                f_dump_model_json=False,
                f_dump_orig_pdf=False,
                f_dump_content_list=False,
                f_draw_model_bbox=False,
            )

            middle_json_fn = os.path.join(local_md_dir, f'{file_name}_middle.json')
            with open(middle_json_fn) as fd:
                jso = json.load(fd)
            os.remove(middle_json_fn)
            return convert_middle_json_to_layout_elements(jso, local_image_dir)

        except Exception as e:
            logger.exception(e)

    return parse_doc(path)


if __name__ == '__main__':
    import pprint

    base_dir = '/opt/data/pdf/resources/samples/'
    if 0:
        with open(base_dir + 'json_outputs/middle.json') as f:
            d = json.load(f)
        result = convert_middle_json_to_layout_elements(d, '/tmp')
        pprint.pp(result)
    if 0:
        with open(base_dir + 'json_outputs/middle.3.json') as f:
            d = json.load(f)
        result = convert_middle_json_to_layout_elements(d, '/tmp')
        pprint.pp(result)
    if 1:
        res = inference(
            base_dir + 'samples/pdf/one_page_with_table_image.pdf',
            '/tmp/output',
            'ocr',
        )
        pprint.pp(res)
magic_pdf/libs/__init__.py
deleted
100644 → 0
View file @
f5016508
magic_pdf/libs/boxbase.py
deleted
100644 → 0
View file @
f5016508
import math


def _is_in_or_part_overlap(box1, box2) -> bool:
    """Whether the two bboxes partially overlap or one contains the other."""
    if box1 is None or box2 is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2

    return not (x1_1 < x0_2 or  # box1 is left of box2
                x0_1 > x1_2 or  # box1 is right of box2
                y1_1 < y0_2 or  # box1 is above box2
                y0_1 > y1_2)  # box1 is below box2


def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
    """Whether box1 is inside box2, or the two overlap with the overlap area
    exceeding area_ratio_threshold of box1's area."""
    if box1 is None or box2 is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2

    if not _is_in_or_part_overlap(box1, box2):
        return False

    # compute the overlap area
    x_left = max(x0_1, x0_2)
    y_top = max(y0_1, y0_2)
    x_right = min(x1_1, x1_2)
    y_bottom = min(y1_1, y1_2)
    overlap_area = (x_right - x_left) * (y_bottom - y_top)

    # compute box1's area
    box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)

    return overlap_area / box1_area > area_ratio_threshold


def _is_in(box1, box2) -> bool:
    """Whether box1 lies completely inside box2."""
    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2

    return (x0_1 >= x0_2 and  # box1's left edge is not outside box2's left edge
            y0_1 >= y0_2 and  # box1's top edge is not outside box2's top edge
            x1_1 <= x1_2 and  # box1's right edge is not outside box2's right edge
            y1_1 <= y1_2)  # box1's bottom edge is not outside box2's bottom edge


def _is_part_overlap(box1, box2) -> bool:
    """Whether the two bboxes overlap partially without full containment."""
    if box1 is None or box2 is None:
        return False

    return _is_in_or_part_overlap(box1, box2) and not _is_in(box1, box2)


def _left_intersect(left_box, right_box):
    """Check whether the boxes intersect at the left edge, i.e. whether
    left_box's right edge crosses into right_box's left edge."""
    if left_box is None or right_box is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = left_box
    x0_2, y0_2, x1_2, y1_2 = right_box

    return x1_1 > x0_2 and x0_1 < x0_2 and (y0_1 <= y0_2 <= y1_1 or y0_1 <= y1_2 <= y1_1)


def _right_intersect(left_box, right_box):
    """Check whether the boxes intersect at the right edge, i.e. whether
    left_box's left edge crosses into right_box's right edge."""
    if left_box is None or right_box is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = left_box
    x0_2, y0_2, x1_2, y1_2 = right_box

    return x0_1 < x1_2 and x1_1 > x1_2 and (y0_1 <= y0_2 <= y1_1 or y0_1 <= y1_2 <= y1_1)


def _is_vertical_full_overlap(box1, box2, x_torlence=2):
    """In the x direction: either box1 contains box2 or box2 contains box1,
    never partial containment. In the y direction: box1 and box2 overlap."""
    # unpack the coordinates
    x11, y11, x12, y12 = box1  # top-left and bottom-right corners (x1, y1, x2, y2)
    x21, y21, x22, y22 = box2

    # in the x direction, does box1 contain box2 or box2 contain box1
    contains_in_x = (x11 - x_torlence <= x21 and x12 + x_torlence >= x22) or \
                    (x21 - x_torlence <= x11 and x22 + x_torlence >= x12)

    # in the y direction, do box1 and box2 overlap
    overlap_in_y = not (y12 < y21 or y11 > y22)

    return contains_in_x and overlap_in_y


def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
    """Check whether box1's bottom slightly overlaps box2's top, bounded by y_tolerance.
    Unlike _is_vertical_full_overlap, this allows a slight, fuzzy overlap in the x direction."""
    if box1 is None or box2 is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2
    tolerance_margin = 2
    is_xdir_full_overlap = (
        (x0_1 - tolerance_margin <= x0_2 <= x1_1 + tolerance_margin
         and x0_1 - tolerance_margin <= x1_2 <= x1_1 + tolerance_margin)
        or (x0_2 - tolerance_margin <= x0_1 <= x1_2 + tolerance_margin
            and x0_2 - tolerance_margin <= x1_1 <= x1_2 + tolerance_margin))

    return y0_2 < y1_1 and 0 < (y1_1 - y0_2) < y_tolerance and is_xdir_full_overlap


def _is_left_overlap(box1, box2):
    """Check whether box1's left side overlaps box2. The y overlap may be
    partial or full, and the check is symmetric in the vertical order of
    box1 and box2."""
    def __overlap_y(Ay1, Ay2, By1, By2):
        return max(0, min(Ay2, By2) - max(Ay1, By1))

    if box1 is None or box2 is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2

    y_overlap_len = __overlap_y(y0_1, y1_1, y0_2, y1_2)
    ratio_1 = 1.0 * y_overlap_len / (y1_1 - y0_1) if y1_1 - y0_1 != 0 else 0
    ratio_2 = 1.0 * y_overlap_len / (y1_2 - y0_2) if y1_2 - y0_2 != 0 else 0
    vertical_overlap_cond = ratio_1 >= 0.5 or ratio_2 >= 0.5

    # vertical_overlap_cond = y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1 or y0_2<=y0_1<=y1_2 or y0_2<=y1_1<=y1_2
    return x0_1 <= x0_2 <= x1_1 and vertical_overlap_cond


def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether two bboxes overlap on the y axis and the overlap height
    exceeds 80% of the shorter bbox's height."""
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2

    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    height1, height2 = y1_1 - y0_1, y1_2 - y0_2
    # max_height = max(height1, height2)
    min_height = min(height1, height2)

    return (overlap / min_height) > overlap_ratio_threshold


def calculate_iou(bbox1, bbox2):
    """Compute the intersection over union (IoU) of two bounding boxes.

    Args:
        bbox1 (list[float]): coordinates of the first box as [x1, y1, x2, y2],
            where (x1, y1) is the top-left and (x2, y2) the bottom-right corner.
        bbox2 (list[float]): coordinates of the second box, same format as bbox1.

    Returns:
        float: the IoU of the two boxes, in [0, 1].
    """
    # Determine the coordinates of the intersection rectangle
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The area of overlap area
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # The area of both rectangles
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    if any([bbox1_area == 0, bbox2_area == 0]):
        return 0

    # Compute the intersection over union by taking the intersection area
    # and dividing it by the sum of both areas minus the intersection area
    iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)
    return iou


def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    """Compute the overlap area of box1 and box2 as a ratio of the smaller box's area."""
    # Determine the coordinates of the intersection rectangle
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The area of overlap area
    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    min_box_area = min([(bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]),
                        (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])])
    if min_box_area == 0:
        return 0
    else:
        return intersection_area / min_box_area


def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Compute the overlap area of box1 and box2 as a ratio of bbox1's area."""
    # Determine the coordinates of the intersection rectangle
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The area of overlap area
    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        return 0
    else:
        return intersection_area / bbox1_area


def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
    """Using calculate_overlap_area_2_minbox_area_ratio, compute the overlap
    area as a ratio of the smaller box's area; if it exceeds `ratio`, return
    the smaller bbox, otherwise return None."""
    x1_min, y1_min, x1_max, y1_max = bbox1
    x2_min, y2_min, x2_max, y2_max = bbox2
    area1 = (x1_max - x1_min) * (y1_max - y1_min)
    area2 = (x2_max - x2_min) * (y2_max - y2_min)
    overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
    if overlap_ratio > ratio:
        if area1 <= area2:
            return bbox1
        else:
            return bbox2
    else:
        return None


def get_bbox_in_boundary(bboxes: list, boundary: tuple) -> list:
    x0, y0, x1, y1 = boundary
    new_boxes = [
        box for box in bboxes
        if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1
    ]
    return new_boxes


def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
    """Whether a bbox sits at the edge of the pdf page."""
    x0, x1 = bbox[0], bbox[2]
    if x1 <= width * side_threshold or x0 >= width * (1 - side_threshold):
        return True
    return False


def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
    tolerance_margin = 4
    top_boxes = [
        box for box in pymu_blocks
        if obj_bbox[1] - box['bbox'][3] >= -tolerance_margin
        and not _is_in(box['bbox'], obj_bbox)
    ]
    # keep only boxes that overlap obj_bbox in the x direction
    top_boxes = [
        box for box in top_boxes if any([
            obj_bbox[0] - tolerance_margin <= box['bbox'][0] <= obj_bbox[2] + tolerance_margin,
            obj_bbox[0] - tolerance_margin <= box['bbox'][2] <= obj_bbox[2] + tolerance_margin,
            box['bbox'][0] - tolerance_margin <= obj_bbox[0] <= box['bbox'][2] + tolerance_margin,
            box['bbox'][0] - tolerance_margin <= obj_bbox[2] <= box['bbox'][2] + tolerance_margin,
        ])
    ]
    # take the one with the largest y1
    if len(top_boxes) > 0:
        top_boxes.sort(key=lambda x: x['bbox'][3], reverse=True)
        return top_boxes[0]
    else:
        return None


def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
    bottom_boxes = [
        box for box in pymu_blocks
        if box['bbox'][1] - obj_bbox[3] >= -2 and not _is_in(box['bbox'], obj_bbox)
    ]
    # keep only boxes that overlap obj_bbox in the x direction
    bottom_boxes = [
        box for box in bottom_boxes if any([
            obj_bbox[0] - 2 <= box['bbox'][0] <= obj_bbox[2] + 2,
            obj_bbox[0] - 2 <= box['bbox'][2] <= obj_bbox[2] + 2,
            box['bbox'][0] - 2 <= obj_bbox[0] <= box['bbox'][2] + 2,
            box['bbox'][0] - 2 <= obj_bbox[2] <= box['bbox'][2] + 2,
        ])
    ]
    # take the one with the smallest y0
    if len(bottom_boxes) > 0:
        bottom_boxes.sort(key=lambda x: x['bbox'][1], reverse=False)
        return bottom_boxes[0]
    else:
        return None


def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the nearest text block on the left."""
    left_boxes = [
        box for box in pymu_blocks
        if obj_bbox[0] - box['bbox'][2] >= -2 and not _is_in(box['bbox'], obj_bbox)
    ]
    # keep only boxes that overlap obj_bbox in the y direction
    left_boxes = [
        box for box in left_boxes if any([
            obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2,
            obj_bbox[1] - 2 <= box['bbox'][3] <= obj_bbox[3] + 2,
            box['bbox'][1] - 2 <= obj_bbox[1] <= box['bbox'][3] + 2,
            box['bbox'][1] - 2 <= obj_bbox[3] <= box['bbox'][3] + 2,
        ])
    ]
    # take the one with the largest x1
    if len(left_boxes) > 0:
        left_boxes.sort(key=lambda x: x['bbox'][2], reverse=True)
        return left_boxes[0]
    else:
        return None


def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the nearest text block on the right."""
    right_boxes = [
        box for box in pymu_blocks
        if box['bbox'][0] - obj_bbox[2] >= -2 and not _is_in(box['bbox'], obj_bbox)
    ]
    # keep only boxes that overlap obj_bbox in the y direction
    right_boxes = [
        box for box in right_boxes if any([
            obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2,
            obj_bbox[1] - 2 <= box['bbox'][3] <= obj_bbox[3] + 2,
            box['bbox'][1] - 2 <= obj_bbox[1] <= box['bbox'][3] + 2,
            box['bbox'][1] - 2 <= obj_bbox[3] <= box['bbox'][3] + 2,
        ])
    ]
    # take the one with the smallest x0
    if len(right_boxes) > 0:
        right_boxes.sort(key=lambda x: x['bbox'][0], reverse=False)
        return right_boxes[0]
    else:
        return None


def bbox_relative_pos(bbox1, bbox2):
    """Determine the relative position of two rectangles.

    Args:
        bbox1: a 4-tuple (x1, y1, x1b, y1b) of top-left and bottom-right coordinates
        bbox2: a 4-tuple (x2, y2, x2b, y2b) of top-left and bottom-right coordinates

    Returns:
        a 4-tuple of booleans (left, right, bottom, top) describing where bbox1
        sits relative to bbox2: left of it, right of it, below it, or above it
    """
    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2

    left = x2b < x1
    right = x1b < x2
    bottom = y2b < y1
    top = y1b < y2
    return left, right, bottom, top


def bbox_distance(bbox1, bbox2):
    """Compute the distance between two rectangles.

    Args:
        bbox1 (tuple): (x1, y1, x2, y2) with (x1, y1) the top-left and (x2, y2) the bottom-right corner.
        bbox2 (tuple): same format as bbox1.

    Returns:
        float: the distance between the two rectangles.
    """
    def dist(point1, point2):
        return math.sqrt((point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2)

    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2

    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)

    if top and left:
        return dist((x1, y1b), (x2b, y2))
    elif left and bottom:
        return dist((x1, y1), (x2b, y2b))
    elif bottom and right:
        return dist((x1b, y1), (x2, y2b))
    elif right and top:
        return dist((x1b, y1b), (x2, y2))
    elif left:
        return x1 - x2b
    elif right:
        return x2 - x1b
    elif bottom:
        return y1 - y2b
    elif top:
        return y2 - y1b
    return 0.0


def box_area(bbox):
    return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])


def get_overlap_area(bbox1, bbox2):
    """Compute the overlap area of bbox1 and bbox2."""
    # Determine the coordinates of the intersection rectangle
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The area of overlap area
    return (x_right - x_left) * (y_bottom - y_top)


def calculate_vertical_projection_overlap_ratio(block1, block2):
    """
    Calculate the proportion of the x-axis covered by the vertical projection of two blocks.

    Args:
        block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
        block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).

    Returns:
        float: The proportion of the x-axis covered by the vertical projection of the two blocks.
    """
    x0_1, _, x1_1, _ = block1
    x0_2, _, x1_2, _ = block2

    # Calculate the intersection of the x-coordinates
    x_left = max(x0_1, x0_2)
    x_right = min(x1_1, x1_2)
    if x_right < x_left:
        return 0.0

    # Length of the intersection
    intersection_length = x_right - x_left

    # Length of the x-axis projection of the first block
    block1_length = x1_1 - x0_1
    if block1_length == 0:
        return 0.0

    # Proportion of the x-axis covered by the intersection
    # logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}")
    return intersection_length / block1_length
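To make the coordinate conventions concrete, a small sketch exercising the geometry helpers above; the boxes are made-up (x0, y0, x1, y1) values with the origin at the top-left, as these functions assume.

from magic_pdf.libs.boxbase import (_is_in, bbox_distance, calculate_iou,
                                    calculate_overlap_area_2_minbox_area_ratio)

a = [100, 100, 200, 200]  # hypothetical block bbox
b = [150, 150, 250, 250]  # overlaps the lower-right quarter of a
c = [120, 120, 180, 180]  # fully inside a

print(calculate_iou(a, b))  # 2500 / (10000 + 10000 - 2500) ≈ 0.143
print(calculate_overlap_area_2_minbox_area_ratio(a, c))  # c is contained, so 1.0
print(_is_in(c, a))  # True
print(bbox_distance(a, [300, 100, 400, 200]))  # pure horizontal gap: 100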
magic_pdf/libs/clean_memory.py
deleted
100644 → 0
View file @
f5016508
# Copyright (c) Opendatalab. All rights reserved.
import torch
import gc


def clean_memory(device='cuda'):
    if device == 'cuda':
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    elif str(device).startswith("npu"):
        import torch_npu
        if torch_npu.npu.is_available():
            torch_npu.npu.empty_cache()
    elif str(device).startswith("mps"):
        torch.mps.empty_cache()
    gc.collect()
\ No newline at end of file
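A usage sketch: call this after a batch of inference to release cached allocator blocks. The device string follows torch conventions; the "run model inference" step is a placeholder.

import torch

from magic_pdf.libs.clean_memory import clean_memory

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ... run model inference here ...
clean_memory(device)  # frees cached CUDA/NPU/MPS memory, then runs gc.collect()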
magic_pdf/libs/commons.py
deleted
100644 → 0
View file @
f5016508
def join_path(*args):
    return '/'.join(str(s).rstrip('/') for s in args)


def get_top_percent_list(num_list, percent):
    """
    Return the top `percent` of elements from the list.
    :param num_list:
    :param percent:
    :return:
    """
    if len(num_list) == 0:
        top_percent_list = []
    else:
        # sort the list in descending order
        sorted_imgs_len_list = sorted(num_list, reverse=True)
        # compute the index corresponding to `percent`
        top_percent_index = int(len(sorted_imgs_len_list) * percent)
        # take the top `percent` of elements
        top_percent_list = sorted_imgs_len_list[:top_percent_index]
    return top_percent_list


def mymax(alist: list):
    if len(alist) == 0:
        return 0  # an empty list counts as 0, and so does 0*0
    else:
        return max(alist)


def parse_bucket_key(s3_full_path: str):
    """
    Input:  s3://bucket/path/to/my/file.txt
    Output: bucket, path/to/my/file.txt
    """
    s3_full_path = s3_full_path.strip()
    if s3_full_path.startswith("s3://"):
        s3_full_path = s3_full_path[5:]
    if s3_full_path.startswith("/"):
        s3_full_path = s3_full_path[1:]
    bucket, key = s3_full_path.split("/", 1)
    return bucket, key
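A quick sketch of the helpers above on made-up inputs, with the expected results as comments:

from magic_pdf.libs.commons import (get_top_percent_list, join_path, mymax,
                                    parse_bucket_key)

print(join_path('s3://bucket', 'path/', 'file.txt'))  # s3://bucket/path/file.txt
print(parse_bucket_key('s3://bucket/path/to/my/file.txt'))  # ('bucket', 'path/to/my/file.txt')
print(get_top_percent_list([5, 1, 4, 2, 3], 0.8))  # top 80% in descending order: [5, 4, 3, 2]
print(mymax([]))  # an empty list collapses to 0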
magic_pdf/libs/config_reader.py
deleted
100644 → 0
View file @
f5016508
"""根据bucket的名字返回对应的s3 AK, SK,endpoint三元组."""
import
json
import
os
from
loguru
import
logger
from
magic_pdf.config.constants
import
MODEL_NAME
from
magic_pdf.libs.commons
import
parse_bucket_key
# 定义配置文件名常量
CONFIG_FILE_NAME
=
os
.
getenv
(
'MINERU_TOOLS_CONFIG_JSON'
,
'magic-pdf.json'
)
def
read_config
():
if
os
.
path
.
isabs
(
CONFIG_FILE_NAME
):
config_file
=
CONFIG_FILE_NAME
else
:
home_dir
=
os
.
path
.
expanduser
(
'~'
)
config_file
=
os
.
path
.
join
(
home_dir
,
CONFIG_FILE_NAME
)
if
not
os
.
path
.
exists
(
config_file
):
raise
FileNotFoundError
(
f
'
{
config_file
}
not found'
)
with
open
(
config_file
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
config
=
json
.
load
(
f
)
return
config
def
get_s3_config
(
bucket_name
:
str
):
"""~/magic-pdf.json 读出来."""
config
=
read_config
()
bucket_info
=
config
.
get
(
'bucket_info'
)
if
bucket_name
not
in
bucket_info
:
access_key
,
secret_key
,
storage_endpoint
=
bucket_info
[
'[default]'
]
else
:
access_key
,
secret_key
,
storage_endpoint
=
bucket_info
[
bucket_name
]
if
access_key
is
None
or
secret_key
is
None
or
storage_endpoint
is
None
:
raise
Exception
(
f
'ak, sk or endpoint not found in
{
CONFIG_FILE_NAME
}
'
)
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
return
access_key
,
secret_key
,
storage_endpoint
def
get_s3_config_dict
(
path
:
str
):
access_key
,
secret_key
,
storage_endpoint
=
get_s3_config
(
get_bucket_name
(
path
))
return
{
'ak'
:
access_key
,
'sk'
:
secret_key
,
'endpoint'
:
storage_endpoint
}
def
get_bucket_name
(
path
):
bucket
,
key
=
parse_bucket_key
(
path
)
return
bucket
def
get_local_models_dir
():
config
=
read_config
()
models_dir
=
config
.
get
(
'models-dir'
)
if
models_dir
is
None
:
logger
.
warning
(
f
"'models-dir' not found in
{
CONFIG_FILE_NAME
}
, use '/tmp/models' as default"
)
return
'/tmp/models'
else
:
return
models_dir
def
get_local_layoutreader_model_dir
():
config
=
read_config
()
layoutreader_model_dir
=
config
.
get
(
'layoutreader-model-dir'
)
if
layoutreader_model_dir
is
None
or
not
os
.
path
.
exists
(
layoutreader_model_dir
):
home_dir
=
os
.
path
.
expanduser
(
'~'
)
layoutreader_at_modelscope_dir_path
=
os
.
path
.
join
(
home_dir
,
'.cache/modelscope/hub/ppaanngggg/layoutreader'
)
logger
.
warning
(
f
"'layoutreader-model-dir' not exists, use
{
layoutreader_at_modelscope_dir_path
}
as default"
)
return
layoutreader_at_modelscope_dir_path
else
:
return
layoutreader_model_dir
def
get_device
():
config
=
read_config
()
device
=
config
.
get
(
'device-mode'
)
if
device
is
None
:
logger
.
warning
(
f
"'device-mode' not found in
{
CONFIG_FILE_NAME
}
, use 'cpu' as default"
)
return
'cpu'
else
:
return
device
def
get_table_recog_config
():
config
=
read_config
()
table_config
=
config
.
get
(
'table-config'
)
if
table_config
is
None
:
logger
.
warning
(
f
"'table-config' not found in
{
CONFIG_FILE_NAME
}
, use 'False' as default"
)
return
json
.
loads
(
f
'{{"model": "
{
MODEL_NAME
.
RAPID_TABLE
}
","enable": false, "max_time": 400}}'
)
else
:
return
table_config
def
get_layout_config
():
config
=
read_config
()
layout_config
=
config
.
get
(
'layout-config'
)
if
layout_config
is
None
:
logger
.
warning
(
f
"'layout-config' not found in
{
CONFIG_FILE_NAME
}
, use '
{
MODEL_NAME
.
LAYOUTLMv3
}
' as default"
)
return
json
.
loads
(
f
'{{"model": "
{
MODEL_NAME
.
LAYOUTLMv3
}
"}}'
)
else
:
return
layout_config
def
get_formula_config
():
config
=
read_config
()
formula_config
=
config
.
get
(
'formula-config'
)
if
formula_config
is
None
:
logger
.
warning
(
f
"'formula-config' not found in
{
CONFIG_FILE_NAME
}
, use 'True' as default"
)
return
json
.
loads
(
f
'{{"mfd_model": "
{
MODEL_NAME
.
YOLO_V8_MFD
}
","mfr_model": "
{
MODEL_NAME
.
UniMerNet_v2_Small
}
","enable": true}}'
)
else
:
return
formula_config
def
get_llm_aided_config
():
config
=
read_config
()
llm_aided_config
=
config
.
get
(
'llm-aided-config'
)
if
llm_aided_config
is
None
:
logger
.
warning
(
f
"'llm-aided-config' not found in
{
CONFIG_FILE_NAME
}
, use 'None' as default"
)
return
None
else
:
return
llm_aided_config
def
get_latex_delimiter_config
():
config
=
read_config
()
latex_delimiter_config
=
config
.
get
(
'latex-delimiter-config'
)
if
latex_delimiter_config
is
None
:
logger
.
warning
(
f
"'latex-delimiter-config' not found in
{
CONFIG_FILE_NAME
}
, use 'None' as default"
)
return
None
else
:
return
latex_delimiter_config
if
__name__
==
'__main__'
:
ak
,
sk
,
endpoint
=
get_s3_config
(
'llm-raw'
)
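For context, a hypothetical ~/magic-pdf.json consistent with the keys the readers above look up. The credentials, paths, and model name strings are illustrative placeholders only; the real default model names come from the MODEL_NAME constants referenced in the code.

{
  "bucket_info": {
    "[default]": ["<ak>", "<sk>", "<endpoint>"],
    "llm-raw": ["<ak>", "<sk>", "<endpoint>"]
  },
  "models-dir": "/tmp/models",
  "layoutreader-model-dir": "/tmp/layoutreader",
  "device-mode": "cpu",
  "table-config": {"model": "<table-model-name>", "enable": false, "max_time": 400},
  "layout-config": {"model": "<layout-model-name>"},
  "formula-config": {"mfd_model": "<mfd-model-name>", "mfr_model": "<mfr-model-name>", "enable": true}
}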