"examples/vscode:/vscode.git/clone" did not exist on "95eada24fcff616aebdffe9aa0b174eee2264fc5"
Commit 23bacc60 authored by Shuimo's avatar Shuimo
Browse files

add an option to freely output 'badcase.json'

parents d1457937 4191fa96
......@@ -40,15 +40,20 @@ jobs:
pip install -r requirements.txt
fi
- name: benchmark
- name: config-net-reset
run: |
export http_proxy=""
export https_proxy=""
- name: get-benchmark-result
run: |
echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test]
runs-on: [pdf]
runs-on: pdf
steps:
- name: notify
run: |
......
......@@ -22,15 +22,15 @@ git clone https://github.com/magicpdf/Magic-PDF.git
2.Install the requirements
```sh
cd Magic-PDF
pip install -r requirements.txt
```
3.Run the main script
3.Run the command line
```sh
use demo/text_demo.py
or
use demo/ocr_demo.py
export PYTHONPATH=.
python magic_pdf/cli/magicpdf.py --help
```
### 版权说明
......
......@@ -15,7 +15,7 @@ from loguru import logger
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.spark.base import get_data_source
from magic_pdf.spark.spark_api import get_data_source
def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
......@@ -67,9 +67,7 @@ def demo_classify_by_type(book_name=None, debug_mode=True):
img_num_list = pdf_meta["imgs_per_page"]
text_len_list = pdf_meta["text_len_per_page"]
text_layout_list = pdf_meta["text_layout_per_page"]
pdf_path = json_object.get("file_location")
is_text_pdf, results = classify(
pdf_path,
total_page,
page_width,
page_height,
......@@ -89,7 +87,7 @@ def demo_meta_scan(book_name=None, debug_mode=True):
s3_pdf_path = json_object.get("file_location")
s3_config = get_s3_config_dict(s3_pdf_path)
pdf_bytes = read_file(s3_pdf_path, s3_config)
res = pdf_meta_scan(s3_pdf_path, pdf_bytes)
res = pdf_meta_scan(pdf_bytes)
logger.info(json.dumps(res, ensure_ascii=False))
write_json_to_local(res, book_name)
......
......@@ -21,28 +21,175 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
import os
import json as json_parse
import click
from loguru import logger
from pathlib import Path
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env(pdf_file_name):
    """Create the local output directories for one PDF.

    Layout: {local_dir}/magic-pdf/{pdf_file_name}/ holds the markdown/json
    output, with an images/ subdirectory for extracted pictures.

    :return: (local_image_dir, local_md_dir)
    """
    md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name)
    image_dir = os.path.join(md_dir, "images")
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
    """Run the parsing pipeline for one PDF and write its outputs.

    :param pdf_file_name: basename (no extension) used for the output files
    :param pdf_bytes: raw PDF content
    :param model_list: doc-layout model results fed to the pipe
    :param parse_method: "auto" | "txt" | "ocr" — selects the pipe class
    :param image_writer: writer for extracted images
    :param md_writer: writer for the .md and .json outputs
    :param image_dir: relative image directory recorded in the markdown
    :raises SystemExit: when parse_method is not one of the supported values
    """
    if parse_method == "auto":
        pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    else:
        # Fix: os.exit() does not exist (AttributeError at runtime);
        # abort with a non-zero exit status instead.
        print(f"unknown parse method: {parse_method}")
        raise SystemExit(1)

    pipe.pipe_classify()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown()
    md_writer.write(
        content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
    )
    md_writer.write(
        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
        path=f"{pdf_file_name}.json",
        mode=AbsReaderWriter.MODE_TXT,
    )
import click
@click.group()
def cli():
pass
@cli.command()
@click.option('--json', type=str, help='输入一个S3路径')
def json_command(json):
# 这里处理json相关的逻辑
print(f'处理JSON: {json}')
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def json_command(json, method):
if not json.startswith("s3://"):
print("usage: python magipdf.py --json s3://some_bucket/some_path")
os.exit(1)
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
)
jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
s3_file_path = jso["file_location"]
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
@cli.command()
@click.option('--pdf', type=click.Path(exists=True), required=True, help='PDF文件的路径')
@click.option('--model', type=click.Path(exists=True), help='模型的路径')
def pdf_command(pdf, model):
@click.option(
"--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def pdf_command(pdf, model, method):
# 这里处理pdf和模型相关的逻辑
print(f'处理PDF: {pdf}')
print(f'加载模型: {model}')
if model is None:
model = pdf.replace(".pdf", ".json")
if not os.path.exists(model):
print(f"make sure json file existed and place under {os.dirname(pdf)}")
os.exit(1)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
pdf_data = read_fn(pdf)
jso = json_parse.loads(read_fn(model).decode("utf-8"))
pdf_file_name = Path(pdf).stem
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso,
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
if __name__ == '__main__':
if __name__ == "__main__":
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
"""
cli()
......@@ -2,6 +2,7 @@ import math
from loguru import logger
from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
TYPE_INLINE_EQUATION = ContentType.InlineEquation
......@@ -227,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def mk_universal_format(para_dict: dict):
def mk_universal_format(pdf_info_list: list, img_buket_path):
"""
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst = []
for _, page_info in para_dict.items():
for page_info in pdf_info_list:
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
......@@ -249,7 +250,7 @@ def mk_universal_format(para_dict: dict):
for img in all_page_images:
content_node = {
"type": "image",
"img_path": img['image_path'],
"img_path": join_path(img_buket_path, img['image_path']),
"img_alt":"",
"img_title":"",
"img_caption":""
......@@ -258,7 +259,7 @@ def mk_universal_format(para_dict: dict):
for table in all_page_tables:
content_node = {
"type": "table",
"img_path": table['image_path'],
"img_path": join_path(img_buket_path, table['image_path']),
"table_latex": table.get("text"),
"table_title": "",
"table_caption": "",
......
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
import wordninja
import re
......@@ -16,90 +19,41 @@ def split_long_words(text):
return ' '.join(segments)
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
    """Render OCR page data as text-only markdown (no image links).

    Walks each page's preproc_blocks -> lines -> spans, escaping markdown
    special characters and wrapping equations in $ / $$ delimiters.

    :param pdf_info_dict: mapping of page key -> page_info dict
    :return: one markdown string, one rendered line per source line
    """
    markdown = []
    for _, page_info in pdf_info_dict.items():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                line_text = ''
                for span in line['spans']:
                    # Spans without textual content (e.g. pure images) are skipped here.
                    if not span.get('content'):
                        continue
                    content = ocr_escape_special_markdown_char(span['content'])  # escape markdown special characters
                    if span['type'] == ContentType.InlineEquation:
                        content = f"${content}$"
                    elif span['type'] == ContentType.InterlineEquation:
                        content = f"$$\n{content}\n$$"
                    line_text += content + ' '
                # Trailing space(s) force a markdown line break at end of line.
                markdown.append(line_text.strip() + ' ')
    return '\n'.join(markdown)
def ocr_mk_mm_markdown(pdf_info_dict: dict):
    """Render OCR page data as multimodal markdown (text plus image links).

    Same traversal as ocr_mk_nlp_markdown, but spans that carry an
    image_path instead of text content become ![]() image links.

    :param pdf_info_dict: mapping of page key -> page_info dict
    :return: one markdown string, one rendered line per source line
    """
    markdown = []
    for _, page_info in pdf_info_dict.items():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                line_text = ''
                for span in line['spans']:
                    if not span.get('content'):
                        # No text content: either an image span or nothing usable.
                        if not span.get('image_path'):
                            continue
                        else:
                            content = f"![]({span['image_path']})"
                    else:
                        content = ocr_escape_special_markdown_char(span['content'])  # escape markdown special characters
                        if span['type'] == ContentType.InlineEquation:
                            content = f"${content}$"
                        elif span['type'] == ContentType.InterlineEquation:
                            content = f"$$\n{content}\n$$"
                    line_text += content + ' '
                # Trailing space(s) force a markdown line break at end of line.
                markdown.append(line_text.strip() + ' ')
    return '\n'.join(markdown)
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_list:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "nlp")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items():
page_no = 0
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
markdown_with_para_and_pagination.append({
'page_no': page_no,
'md_content': '\n\n'.join(page_markdown)
})
page_no += 1
return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
page_markdown = []
for paras in paras_of_layout:
for para in paras:
......@@ -122,7 +76,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({span['image_path']})\n"
content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
elif mode == 'nlp':
pass
if content != '':
......@@ -137,10 +91,86 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
return page_markdown
def para_to_standard_format(para):
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Render a page's paragraph blocks into markdown paragraph strings.

    :param paras_of_layout: list of para blocks; each has a 'type' (BlockType)
        and either 'lines' (text-like blocks) or nested 'blocks' (image/table)
    :param mode: "mm" renders images/tables as ![]() links; "nlp" skips them
    :param img_buket_path: prefix joined onto each span's image_path
    :return: list of markdown paragraph strings (empty paragraphs dropped)
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block.get('type')
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            # Titles are rendered as a level-1 heading.
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                img_blocks = para_block.get('blocks')
                # First pass: the image body becomes a markdown image link.
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageBody:
                        for line in img_block.get('lines'):
                            for span in line['spans']:
                                if span.get('type') == ContentType.Image:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # Second pass: captions are appended after the image link.
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageCaption:
                        para_text += merge_para_with_text(img_block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                table_blocks = para_block.get('blocks')
                # First pass: the table body is rendered as an image link.
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableBody:
                        for line in table_block.get('lines'):
                            for span in line['spans']:
                                if span.get('type') == ContentType.Table:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # Second pass: caption and footnote text follow the table image.
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableCaption:
                        para_text += merge_para_with_text(table_block)
                    elif table_block.get('type') == BlockType.TableFootnote:
                        para_text += merge_para_with_text(table_block)
        if para_text.strip() == '':
            continue
        else:
            # Trailing space forces a markdown line break after the paragraph.
            page_markdown.append(para_text.strip() + ' ')
    return page_markdown
def merge_para_with_text(para):
    """Concatenate all spans of one paragraph block into a markdown string.

    Text spans are markdown-escaped (and long English words split); inline
    equations are wrapped in $...$, interline equations in $$...$$ blocks.
    English content is joined with spaces, other content without separators.

    :param para: block dict with 'lines' -> 'spans'
    :return: the merged paragraph text
    """
    para_text = ''
    for line in para['lines']:
        for span in line['spans']:
            span_type = span.get('type')
            content = ''
            language = ''
            if span_type == ContentType.Text:
                content = span['content']
                language = detect_lang(content)
                if language == 'en':  # only split long English words; word-splitting Chinese would lose text
                    content = ocr_escape_special_markdown_char(split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f"${span['content']}$"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"
            if content != '':
                if language == 'en':  # in an English context, contents are space-separated
                    para_text += content + ' '
                else:  # in a Chinese context, no separator between contents
                    para_text += content
    return para_text
def para_to_standard_format(para, img_buket_path):
para_content = {}
if len(para) == 1:
para_content = line_to_standard_format(para[0])
para_content = line_to_standard_format(para[0], img_buket_path)
elif len(para) > 1:
para_text = ''
inline_equation_num = 0
......@@ -148,6 +178,7 @@ def para_to_standard_format(para):
for span in line['spans']:
language = ''
span_type = span.get('type')
content = ""
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
......@@ -170,20 +201,21 @@ def para_to_standard_format(para):
}
return para_content
def make_standard_format_with_para(pdf_info_dict: dict):
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
content_list = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
for paras in paras_of_layout:
for para in paras:
para_content = para_to_standard_format(para)
para_content = para_to_standard_format(para, img_buket_path)
content_list.append(para_content)
return content_list
def line_to_standard_format(line):
def line_to_standard_format(line, img_buket_path):
line_text = ""
inline_equation_num = 0
for span in line['spans']:
......@@ -194,13 +226,13 @@ def line_to_standard_format(line):
if span['type'] == ContentType.Image:
content = {
'type': 'image',
'img_path': span['image_path']
'img_path': join_path(img_buket_path, span['image_path'])
}
return content
elif span['type'] == ContentType.Table:
content = {
'type': 'table',
'img_path': span['image_path']
'img_path': join_path(img_buket_path, span['image_path'])
}
return content
else:
......@@ -226,7 +258,7 @@ def line_to_standard_format(line):
return content
def ocr_mk_mm_standard_format(pdf_info_dict: dict):
def ocr_mk_mm_standard_format(pdf_info_dict: list):
"""
content_list
type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
......@@ -236,7 +268,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict):
img_path string s3://full/path/to/img.jpg
"""
content_list = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks")
if not blocks:
continue
......
......@@ -15,6 +15,7 @@ from collections import Counter
import click
import numpy as np
from loguru import logger
from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
......@@ -298,7 +299,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
return narrow_strip_pages_ratio < 0.5
def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
"""
这里的图片和页面长度单位是pts
:param total_page:
......@@ -323,7 +324,7 @@ def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: li
elif not any(results.values()):
return False, results
else:
print(f"WARNING: {pdf_path} is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results
......@@ -350,7 +351,7 @@ def main(json_file):
is_needs_password = o['is_needs_password']
if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
continue
tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
o['is_text_pdf'] = tag
print(json.dumps(o, ensure_ascii=False))
except Exception as e:
......
......@@ -287,7 +287,7 @@ def get_language(doc: fitz.Document):
return language
def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
def pdf_meta_scan(pdf_bytes: bytes):
"""
:param s3_pdf_path:
:param pdf_bytes: pdf文件的二进制数据
......@@ -298,8 +298,8 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
is_encrypted = doc.is_encrypted
total_page = len(doc)
if total_page == 0:
logger.warning(f"drop this pdf: {s3_pdf_path}, drop_reason: {DropReason.EMPTY_PDF}")
result = {"need_drop": True, "drop_reason": DropReason.EMPTY_PDF}
logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
return result
else:
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
......@@ -322,7 +322,6 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
# 最后输出一条json
res = {
"pdf_path": s3_pdf_path,
"is_needs_password": is_needs_password,
"is_encrypted": is_encrypted,
"total_page": total_page,
......@@ -350,7 +349,7 @@ def main(s3_pdf_path: str, s3_profile: str):
"""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(s3_pdf_path, file_content)
pdf_meta_scan(file_content)
except Exception as e:
print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
logger.exception(e)
......
from enum import Enum
class ModelBlockTypeEnum(Enum):
    """Integer category codes for block types reported by the layout model.

    NOTE(review): the specific numeric values presumably mirror the upstream
    model's category-id label map — confirm against that model's definition.
    """
    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
\ No newline at end of file
from loguru import logger
import math
def _is_in_or_part_overlap(box1, box2) -> bool:
"""
......@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
return right_boxes[0]
else:
return None
def bbox_relative_pos(bbox1, bbox2):
    """Describe where bbox2 lies relative to bbox1 along each axis.

    Boxes are (x0, y0, x1, y1). Returns (left, right, bottom, top) booleans:
    left  — bbox2 ends before bbox1 starts on x
    right — bbox2 starts after bbox1 ends on x
    bottom — bbox2 ends before bbox1 starts on y
    top    — bbox2 starts after bbox1 ends on y
    All four are False when the boxes overlap on both axes.
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2
    is_left = b_x1 < a_x0
    is_right = a_x1 < b_x0
    is_bottom = b_y1 < a_y0
    is_top = a_y1 < b_y0
    return is_left, is_right, is_bottom, is_top
def bbox_distance(bbox1, bbox2):
    """Shortest distance between two axis-aligned boxes (x0, y0, x1, y1).

    Diagonal separation uses the corner-to-corner Euclidean distance;
    pure horizontal/vertical separation uses the gap along that axis.

    :return: 0 when the boxes intersect, otherwise the gap distance
    """
    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2
    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
    # math.hypot replaces the hand-rolled sqrt-of-squares: same result,
    # idiomatic and numerically robust against overflow/underflow.
    if top and left:
        return math.hypot(x1 - x2b, y1b - y2)
    elif left and bottom:
        return math.hypot(x1 - x2b, y1 - y2b)
    elif bottom and right:
        return math.hypot(x1b - x2, y1 - y2b)
    elif right and top:
        return math.hypot(x1b - x2, y1b - y2)
    elif left:
        return x1 - x2b
    elif right:
        return x2 - x1b
    elif bottom:
        return y1 - y2b
    elif top:
        return y2 - y1b
    else:  # rectangles intersect
        return 0
\ No newline at end of file
......@@ -2,6 +2,7 @@
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
"""
import json
import os
......@@ -10,20 +11,24 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
def read_config():
    """Load the user-level magic-pdf configuration.

    Reads ~/magic-pdf.json and returns it as a dict.

    :raises Exception: when the config file does not exist
    """
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
    if not os.path.exists(config_file):
        raise Exception(f"{config_file} not found")
    with open(config_file, "r") as f:
        return json.load(f)
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
config = read_config()
bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info:
......@@ -49,5 +54,10 @@ def get_bucket_name(path):
return bucket
if __name__ == '__main__':
def get_local_dir():
    """Return the configured temp output directory, defaulting to /tmp."""
    return read_config().get("temp-output-dir", "/tmp")
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
    """Return the dict's values as a list (insertion order preserved).

    Replaces the manual append loop with the idiomatic dict.values() call.
    """
    return list(input_dict.values())
def get_scale_ratio(model_page_info, page):
    """Scale factors between the model's page size and PyMuPDF's 72-dpi render.

    The block contained duplicated old/new diff lines (two `def` lines, two
    pairs of width/height lookups); resolved here to the `model_page_info`
    version of the function.

    :param model_page_info: dict exposing model_page_info['page_info']['width'/'height']
    :param page: a fitz.Page; rendered at dpi=72 to obtain the reference pixel size
    :return: (horizontal_scale_ratio, vertical_scale_ratio)
    """
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)
    page_info = model_page_info['page_info']
    horizontal_scale_ratio = page_info['width'] / pymu_width
    vertical_scale_ratio = page_info['height'] / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
    """Determine the document language by majority vote over pages.

    For each page, concatenates the text of layout detections whose
    category_id is in the allowed set, detects that page's language, then
    returns the language that occurs on the most pages.
    """
    allowed_category_ids = [15]
    per_page_languages = []
    for page in model_list:
        fragments = [
            det["text"]
            for det in page["layout_dets"]
            if det["category_id"] in allowed_category_ids
        ]
        per_page_languages.append(detect_lang("".join(fragments)))
    # Count occurrences per language and return the most frequent one.
    counts = Counter(per_page_languages)
    return max(counts, key=counts.get)
......@@ -8,7 +8,7 @@ class DropReason:
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
Exception = "exception" # 解析中发生异常
Exception = "_exception" # 解析中发生异常
ENCRYPTED = "encrypted" # PDF是加密的
EMPTY_PDF = "total_page=0" # PDF页面总数为0
NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
......
......@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
def float_gt(a, b):
    """True iff a exceeds b by more than the 1e-4 tolerance."""
    return a - b > 0.0001
def float_equal(a, b):
    """True iff a and b differ by at most the 1e-4 tolerance."""
    return abs(a - b) <= 0.0001
\ No newline at end of file
......@@ -4,4 +4,17 @@ class ContentType:
Text = "text"
InlineEquation = "inline_equation"
InterlineEquation = "interline_equation"
class BlockType:
    """String constants naming paragraph-level block types.

    Image/Table are composite blocks whose parts (body, caption, footnote)
    carry their own sub-type constants.
    """
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
from s3pathlib import S3Path
def remove_non_official_s3_args(s3path):
    """Strip any non-official query suffix from an s3 path.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    head, _, _ = s3path.partition("?")
    return head
def parse_s3path(s3path: str):
    """Split an s3 URI into (bucket, key), stripping any '?...' suffix first."""
    p = S3Path(remove_non_official_s3_args(s3path))
    return p.bucket, p.key
def parse_s3_range_params(s3path: str):
    """Extract the byte-range arguments from an s3 path, if present.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ['0', '81350']
    Returns None when no '?bytes=' suffix is present. Note the values are
    returned as strings; callers convert them to int.
    """
    parts = s3path.split("?bytes=")
    return None if len(parts) == 1 else parts[1].split(",")
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
......@@ -28,49 +28,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
return img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                          equation_inline_bboxes: list,
                          equation_interline_bboxes: list, imageWriter) -> tuple:
    """Cut and save the images/tables of one page, keyed by their bboxes.

    Save path layout:
    {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{x0}_{y0}_{x1}_{y1}.jpg

    Fixes the return annotation: the function returns a 5-tuple, not a dict.
    The three copy-pasted cut loops are deduplicated into one helper.

    :return: (image_info, image_backup_info, table_info, inline_eq_info,
              interline_eq_info) — lists of {"bbox": ..., "image_path": ...}.
              NOTE(review): the two equation lists are always returned empty;
              equation_inline_bboxes / equation_interline_bboxes are accepted
              but never processed here — kept for interface compatibility.
    """
    inline_eq_info = []
    interline_eq_info = []

    def return_path(type):
        # Images are grouped under {pdf_md5}/{images|tables|equations}.
        return join_path(pdf_bytes_md5, type)

    def _cut_valid_bboxes(bboxes, subdir, tag):
        # Skip degenerate boxes (x0 >= x1 or y0 >= y1) with a warning.
        infos = []
        for bbox in bboxes:
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                logger.warning(f"{tag}: 错误的box, {bbox}")
                continue
            image_path = cut_image(bbox, page_num, page, return_path(subdir), imageWriter)
            infos.append({"bbox": bbox, "image_path": image_path})
        return infos

    image_info = _cut_valid_bboxes(image_bboxes, "images", "image_bboxes")
    image_backup_info = _cut_valid_bboxes(images_overlap_backup, "images", "images_overlap_backup")
    table_info = _cut_valid_bboxes(table_bboxes, "tables", "table_bboxes")
    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment