Unverified Commit 158e556b authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1063 from opendatalab/release-0.10.0

Release 0.10.0
parents 038f48d3 30be5017
@@ -3,13 +3,6 @@
 name: mineru
 on:
-  push:
-    branches:
-      - "master"
-      - "dev"
-    paths-ignore:
-      - "cmds/**"
-      - "**.md"
   pull_request:
     branches:
       - "master"
...
@@ -20,6 +20,7 @@ jobs:
           source activate mineru
           conda env list
           pip show coverage
+          git checkout "dev"
           # cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
           cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
           cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
...
@@ -10,7 +10,6 @@ on:
   paths-ignore:
     - "cmds/**"
     - "**.md"
-  workflow_dispatch:
 jobs:
   cli-test:
     if: github.repository == 'opendatalab/MinerU'
...
@@ -42,6 +42,9 @@
 </div>
 # Changelog
+- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities:
+  - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
+  - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
 - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
 - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
 - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
...
@@ -42,6 +42,9 @@
 </div>
 # Changelog
+- 2024/11/22 0.10.0 released. Introduced hybrid OCR text extraction:
+  - Significantly improved parsing in complex text-distribution scenarios such as dense formulas, irregular span regions, and text rendered as images.
+  - Combines the advantages of text mode (accurate content extraction, faster speed) with OCR mode's more precise span/line region recognition.
 - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
 - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition.
 - 2024/10/31 0.9.0 released. A major new version with extensive code refactoring that fixes numerous issues, improves performance, lowers hardware requirements, and improves usability:
...
""" """span维度自定义字段."""
span维度自定义字段
"""
# span是否是跨页合并的 # span是否是跨页合并的
CROSS_PAGE = "cross_page" CROSS_PAGE = 'cross_page'
""" """
block维度自定义字段 block维度自定义字段
""" """
# block中lines是否被删除 # block中lines是否被删除
LINES_DELETED = "lines_deleted" LINES_DELETED = 'lines_deleted'
# table recognition max time default value # table recognition max time default value
TABLE_MAX_TIME_VALUE = 400 TABLE_MAX_TIME_VALUE = 400
@@ -17,39 +15,39 @@ TABLE_MAX_TIME_VALUE = 400
 TABLE_MAX_LEN = 480
 # table master structure dict
-TABLE_MASTER_DICT = "table_master_structure_dict.txt"
+TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
 # table master dir
-TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
+TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
 # pp detect model dir
-DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
+DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
 # pp rec model dir
-REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
+REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
 # pp rec char dict path
-REC_CHAR_DICT = "ppocr_keys_v1.txt"
+REC_CHAR_DICT = 'ppocr_keys_v1.txt'
 # pp rec copy rec directory
-PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
+PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
 # pp rec copy det directory
-PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
+PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
 class MODEL_NAME:
     # pp table structure algorithm
-    TABLE_MASTER = "tablemaster"
+    TABLE_MASTER = 'tablemaster'
     # struct eqtable
-    STRUCT_EQTABLE = "struct_eqtable"
+    STRUCT_EQTABLE = 'struct_eqtable'
-    DocLayout_YOLO = "doclayout_yolo"
+    DocLayout_YOLO = 'doclayout_yolo'
-    LAYOUTLMv3 = "layoutlmv3"
+    LAYOUTLMv3 = 'layoutlmv3'
-    YOLO_V8_MFD = "yolo_v8_mfd"
+    YOLO_V8_MFD = 'yolo_v8_mfd'
-    UniMerNet_v2_Small = "unimernet_small"
+    UniMerNet_v2_Small = 'unimernet_small'
-    RAPID_TABLE = "rapid_table"
+    RAPID_TABLE = 'rapid_table'
\ No newline at end of file
class DropReason:
    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # text blocks overlap horizontally, so the text order cannot be located reliably
    USEFUL_BLOCK_HOR_OVERLAP = (
        'useful_block_horizontal_overlap'  # blocks that must be kept overlap horizontally
    )
    COMPLICATED_LAYOUT = 'complicated_layout'  # the layout is too complex and not supported yet
    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # layouts with more than 2 columns are not supported yet
    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # the PDF contains colored blocks that change the reading order; text blocks on a colored background are not supported yet
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
        'high_computational_load_by_imgs'  # dropped because special images make the computation too expensive
    )
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
        'high_computational_load_by_svgs'  # dropped because special SVG images make the computation too expensive
    )
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # the computation exceeds the budget of the current method
    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # layout analysis failed
    Exception = '_exception'  # an exception occurred during parsing
    ENCRYPTED = 'encrypted'  # the PDF is encrypted
    EMPTY_PDF = 'total_page=0'  # the PDF has zero pages
    NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # not a text-based PDF, cannot be parsed directly
    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # paragraphs cannot be separated cleanly
    TITLE_DETECTION_FAILED = 'title_detection_failed'  # title detection failed
    TITLE_LEVEL_FAILED = (
        'title_level_failed'  # title level analysis failed (e.g. level 1/2/3 headings)
    )
    PARA_SPLIT_FAILED = 'para_split_failed'  # paragraph detection failed
    PARA_MERGE_FAILED = 'para_merge_failed'  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = 'not_allow_language'  # unsupported language
    SPECIAL_PDF = 'special_pdf'
    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # the column layout cannot be determined precisely
    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # the page layout cannot be analyzed
    NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # scaling produced a negative bbox area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
        'overlap_blocks_can_t_separation'  # overlapping blocks cannot be separated
    )


COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
PAGE_NO = 'page-no'  # page number
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header or footer area
VERTICAL_TEXT = 'vertical-text'  # vertical text
ROTATE_TEXT = 'rotate-text'  # rotated text
EMPTY_SIDE_BLOCK = 'empty-side-block'  # empty block at the page margin with no content
ON_IMAGE_TEXT = 'on-image-text'  # text on top of an image
ON_TABLE_TEXT = 'on-table-text'  # text on top of a table


class DropTag:
    PAGE_NUMBER = 'page_no'
    HEADER = 'header'
    FOOTER = 'footer'
    FOOTNOTE = 'footnote'
    NOT_IN_LAYOUT = 'not_in_layout'
    SPAN_OVERLAP = 'span_overlap'
    BLOCK_OVERLAP = 'block_overlap'


class MakeMode:
    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    STANDARD_FORMAT = 'standard_format'


class DropMode:
    WHOLE_PDF = 'whole_pdf'
    SINGLE_PAGE = 'single_page'
    NONE = 'none'
    NONE_WITH_REASON = 'none_with_reason'
from enum import Enum


class ModelBlockTypeEnum(Enum):
    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
\ No newline at end of file
@@ -35,7 +35,7 @@ def read_jsonl(
     jsonl_d = [
         json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
     ]
-    for d in jsonl_d[:5]:
+    for d in jsonl_d:
         pdf_path = d.get('file_location', '') or d.get('path', '')
         if len(pdf_path) == 0:
             raise EmptyData('pdf file location is empty')
...
This diff is collapsed.
@@ -2,21 +2,20 @@ import re
 from loguru import logger
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.language import detect_lang
-from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
-from magic_pdf.libs.ocr_content_type import BlockType, ContentType
 from magic_pdf.para.para_split_v3 import ListLineTag


 def __is_hyphen_at_line_end(line):
-    """
-    Check if a line ends with one or more letters followed by a hyphen.
+    """Check if a line ends with one or more letters followed by a hyphen.

     Args:
         line (str): The line of text to check.

     Returns:
         bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
     """
@@ -142,9 +141,10 @@ def merge_para_with_text(para_block):
                 span_type = span['type']
                 if span_type == ContentType.Text:
                     line_text += span['content'].strip()
             if line_text != '':
                 line_lang = detect_lang(line_text)
-            for span in line['spans']:
+            for j, span in enumerate(line['spans']):
                 span_type = span['type']
                 content = ''
@@ -162,16 +162,16 @@ def merge_para_with_text(para_block):
                 if span_type in [ContentType.Text, ContentType.InterlineEquation]:
                     para_text += content  # no space separator is needed between contents in Chinese/Japanese/Korean contexts
                 elif span_type == ContentType.InlineEquation:
-                    para_text += f" {content} "
+                    para_text += f' {content} '
             else:
                 if span_type in [ContentType.Text, ContentType.InlineEquation]:
-                    # if the previous line ends with a hyphen, no trailing space should be appended
-                    if __is_hyphen_at_line_end(content):
+                    # if this span is the last one in the line and ends with a hyphen, drop the hyphen and append no space
+                    if j == len(line['spans']) - 1 and __is_hyphen_at_line_end(content):
                         para_text += content[:-1]
                     elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
                         para_text += content
                     else:  # Western-text contexts need a space between contents
-                        para_text += f"{content} "
+                        para_text += f'{content} '
                 elif span_type == ContentType.InterlineEquation:
                     para_text += content
                 else:
...
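To make the new hyphen rule concrete, here is a small self-contained sketch of the joining behaviour on hypothetical span contents (the real function also handles equations and CJK contexts):

```python
def join_spans(lines):
    # each inner list holds the span contents of one line; a hyphen on the
    # last span of a line fuses the word with the start of the next line
    text = ''
    for spans in lines:
        for j, content in enumerate(spans):
            if j == len(spans) - 1 and content.endswith('-') and content[-2:-1].isalpha():
                text += content[:-1]  # 'exam-' -> 'exam', no trailing space
            else:
                text += f'{content} '  # Western-text spacing
    return text.strip()

print(join_spans([['an', 'exam-'], ['ple', 'sentence']]))  # 'an example sentence'
```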
""" """输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
输入: s3路径,每行一个
输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
"""
import sys import sys
import click from collections import Counter
from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list import click
from magic_pdf.libs.commons import fitz
from loguru import logger from loguru import logger
from collections import Counter
from magic_pdf.libs.drop_reason import DropReason from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars from magic_pdf.libs.pdf_check import detect_invalid_chars
@@ -19,8 +16,10 @@ junk_limit_min = 10

 def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
-    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
-                               result]
+    max_image_area_per_page = [
+        mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz])
+        for page_img_sz in result
+    ]
     page_area = int(page_width_pts) * int(page_height_pts)
     max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
     max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
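The 0.6 cut-off keeps only pages whose largest image covers more than 60% of the page, which is the signal for a scanned page. A quick illustration with hypothetical data:

```python
# two pages of A4 size (595 x 842 pts), one image bbox per page
result = [
    [(0, 0, 595, 842, 7)],      # image covers ~100% of the page
    [(100, 100, 200, 200, 8)],  # image covers ~2% of the page
]
ratios = calculate_max_image_area_per_page(result, 595, 842)
# len(ratios) == 1: only the first page passes the > 0.6 filter
```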
@@ -32,8 +31,10 @@ def process_image(page, junk_img_bojids=[]):
     items = page.get_images()
     dedup = set()
     for img in items:
         # this returns the size at which the image is actually displayed on the page; an array whose elements begin with
-        img_bojid = img[0]  # globally unique within the pdf file; an image that recurs throughout the pdf is probably junk such as a watermark or a header/footer
+        img_bojid = img[
+            0
+        ]  # globally unique within the pdf file; an image that recurs throughout the pdf is probably junk such as a watermark or a header/footer
         if img_bojid in junk_img_bojids:  # skip junk images
             continue
         recs = page.get_image_rects(img, transform=True)
@@ -42,9 +43,17 @@ def process_image(page, junk_img_bojids=[]):
             x0, y0, x1, y1 = map(int, rec)
             width = x1 - x0
             height = y1 - y0
-            if (x0, y0, x1, y1, img_bojid) in dedup:  # duplicate bboxes show up here and need to be skipped
+            if (
+                x0,
+                y0,
+                x1,
+                y1,
+                img_bojid,
+            ) in dedup:  # duplicate bboxes show up here and need to be skipped
                 continue
-            if not all([width, height]):  # neither width nor height may be 0; otherwise the image is invisible and meaningless
+            if not all(
+                [width, height]
+            ):  # neither width nor height may be 0; otherwise the image is invisible and meaningless
                 continue
             dedup.add((x0, y0, x1, y1, img_bojid))
             page_result.append([x0, y0, x1, y1, img_bojid])
@@ -52,29 +61,33 @@ def process_image(page, junk_img_bojids=[]):
 def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
-    """
-    Return the image quadruples of every page; a page may contain multiple images.
+    """Return the image quadruples of every page; a page may contain multiple images.

     :param doc:
     :return:
     """
     # count the occurrences of each img_bojid with a Counter
     img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
     # find the img_bojids that occur on more than half of the pages
     junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with few pages

-    junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
+    junk_img_bojids = [
+        img_bojid
+        for img_bojid, count in img_bojid_counter.items()
+        if count >= junk_limit
+    ]

     # todo: add a check that uses only the first ten pages; junk images must meet two conditions: they appear often enough, they cover a large enough share of the page, and they are all roughly the same size
     # there are two kinds of scanned PDFs and one kind of text PDF, so misclassification is possible here
     # scanned type 1: every page embeds all of the scanned page images; images dominate the page and one is displayed per page
     # scanned type 2: the number of stored scanned images grows page by page; images dominate, one is displayed per page, and the junk list must be cleared before scanning the first 50 pages for classification
     # text type 1: every page stores all images; images take up little of the page, and a page may display zero or several; such PDFs need the first 10 pages sampled for image size and count, and if they match, the junk list must be cleared

     imgs_len_list = [len(page.get_images()) for page in doc]

     special_limit_pages = 10

     # use the results of the first ten pages uniformly for the decision
     result = []
     break_loop = False
     for i, page in enumerate(doc):
@@ -82,12 +95,18 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
             break
         if i >= special_limit_pages:
             break
-        page_result = process_image(page)  # junk_img_bojids is not passed here: collect all image info from the first ten pages for the later analysis
+        page_result = process_image(
+            page
+        )  # junk_img_bojids is not passed here: collect all image info from the first ten pages for the later analysis
         result.append(page_result)
     for item in result:
-        if not any(item):  # if any page has no images, this is a text PDF; check whether it is a special text PDF
-            if max(imgs_len_list) == min(imgs_len_list) and max(
-                    imgs_len_list) >= junk_limit_min:  # a special text PDF: clear the junk list and break
+        if not any(
+            item
+        ):  # if any page has no images, this is a text PDF; check whether it is a special text PDF
+            if (
+                max(imgs_len_list) == min(imgs_len_list)
+                and max(imgs_len_list) >= junk_limit_min
+            ):  # a special text PDF: clear the junk list and break
                 junk_img_bojids = []
             else:  # an ordinary text PDF with junk images: keep the junk list
                 pass
@@ -98,20 +117,23 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
     # check whether the top 80% of the elements are all equal
     if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
         # # if all of the first 10 pages have images, decide whether to clear the junk list from whether the per-page image counts are equal
         # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:

         # the first 10 pages all have images with equal counts; check the ratio of image size to page size to decide whether to clear the junk list
-        max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
-        if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are full-page images, so this may be a text PDF; clear the junk list
+        max_image_area_per_page = calculate_max_image_area_per_page(
+            result, page_width_pts, page_height_pts
+        )
+        if (
+            len(max_image_area_per_page) < 0.8 * special_limit_pages
+        ):  # not all of the first 10 pages are full-page images, so this may be a text PDF; clear the junk list
             junk_img_bojids = []
         else:  # the first 10 pages all have images, 80% of them full-page, with equal and high per-page counts: scanned type 1, keep the junk list
             pass
     else:  # the per-page image counts differ; clear the junk list and scan the full first 50 pages
         junk_img_bojids = []

     # now formally collect the image info of the first 50 pages
     result = []
     for i, page in enumerate(doc):
         if i >= scan_max_page:
@@ -126,7 +148,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
 def get_pdf_page_size_pts(doc: fitz.Document):
     page_cnt = len(doc)
     l: int = min(page_cnt, 50)
     # put all widths and heights into two lists and take the median of each (one pdf embedded landscape pages inside a portrait document, which swapped width and height)
     page_width_list = []
     page_height_list = []
     for i in range(l):
@@ -152,8 +174,8 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
         # get all text blocks
         # text_block = page.get_text("words")
         # text_block_len = sum([len(t[4]) for t in text_block])
         # get the whole text as one string
-        text_block = page.get_text("text")
+        text_block = page.get_text('text')
         text_block_len = len(text_block)
         # logger.info(f"page {page.number} text_block_len: {text_block_len}")
         text_len_lst.append(text_block_len)
@@ -162,15 +184,13 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
 def get_pdf_text_layout_per_page(doc: fitz.Document):
-    """
-    From the text layout of each page, decide whether that page's text runs horizontally, vertically, or is unknown.
+    """From the text layout of each page, decide whether that page's text runs horizontally, vertically, or is unknown.

     Args:
         doc (fitz.Document): the PDF document object.

     Returns:
         List[str]: the text layout of each page ('horizontal', 'vertical', 'unknow').
     """
     text_layout_list = []
@@ -180,11 +200,11 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
         # per-page counters for vertical and horizontal text lines
         vertical_count = 0
         horizontal_count = 0
-        text_dict = page.get_text("dict")
-        if "blocks" in text_dict:
-            for block in text_dict["blocks"]:
+        text_dict = page.get_text('dict')
+        if 'blocks' in text_dict:
+            for block in text_dict['blocks']:
                 if 'lines' in block:
-                    for line in block["lines"]:
+                    for line in block['lines']:
                         # the corner coordinates of the line's bbox
                         x0, y0, x1, y1 = line['bbox']
                         # width and height of the bbox
@@ -199,8 +219,12 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
                         if len(font_sizes) > 0:
                             average_font_size = sum(font_sizes) / len(font_sizes)
                         else:
-                            average_font_size = 10  # some lines expose no font_size; fall back to a default value
+                            average_font_size = (
+                                10  # some lines expose no font_size; fall back to a default value
+                            )
-                        if area <= average_font_size ** 2:  # if the bbox area is at most the squared average font size, it is a single character whose direction cannot be determined
+                        if (
+                            area <= average_font_size**2
+                        ):  # if the bbox area is at most the squared average font size, it is a single character whose direction cannot be determined
                             continue
                         else:
                             if 'wmode' in line:  # determine the text direction from wmode
@@ -228,22 +252,22 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
         # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
         # decide the text layout of each page
         if vertical_count == 0 and horizontal_count == 0:  # the page has no text, cannot decide
-            text_layout_list.append("unknow")
+            text_layout_list.append('unknow')
             continue
         else:
            if vertical_count > horizontal_count:  # more vertical than horizontal text lines on this page
-                text_layout_list.append("vertical")
+                text_layout_list.append('vertical')
            else:  # more horizontal than vertical text lines on this page
-                text_layout_list.append("horizontal")
+                text_layout_list.append('horizontal')
     # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
     return text_layout_list
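A minimal usage sketch for the per-page layout detection (hypothetical input file; fitz is PyMuPDF):

```python
import fitz  # PyMuPDF

doc = fitz.open('example.pdf')  # hypothetical local file
layouts = get_pdf_text_layout_per_page(doc)
# e.g. ['horizontal', 'horizontal', 'vertical', 'unknow']
vertical_share = layouts.count('vertical') / max(len(layouts), 1)
print(f'vertical pages: {vertical_share:.0%}')
```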
-'''custom exception used to reject pdfs with too many SVGs on a single page'''
+"""Custom exception used to reject pdfs with too many SVGs on a single page."""
 class PageSvgsTooManyError(Exception):
-    def __init__(self, message="Page SVGs are too many"):
+    def __init__(self, message='Page SVGs are too many'):
         self.message = message
         super().__init__(self.message)
@@ -285,7 +309,7 @@ def get_language(doc: fitz.Document):
         if page_id >= scan_max_page:
             break
         # get the whole text as one string
-        text_block = page.get_text("text")
+        text_block = page.get_text('text')
         page_language = detect_lang(text_block)
         language_lst.append(page_language)
@@ -299,9 +323,7 @@ def get_language(doc: fitz.Document):
 def check_invalid_chars(pdf_bytes):
-    """
-    garbled-character detection
-    """
+    """Garbled-character detection."""
     return detect_invalid_chars(pdf_bytes)
@@ -311,13 +333,13 @@ def pdf_meta_scan(pdf_bytes: bytes):
     :param pdf_bytes: the pdf file as binary data
     Several dimensions are evaluated: encrypted or not, password required or not, paper size, total page count, whether text is extractable
     """
-    doc = fitz.open("pdf", pdf_bytes)
+    doc = fitz.open('pdf', pdf_bytes)
     is_needs_password = doc.needs_pass
     is_encrypted = doc.is_encrypted
     total_page = len(doc)
     if total_page == 0:
-        logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
-        result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
+        logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}')
+        result = {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}
         return result
     else:
         page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
@@ -328,7 +350,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
         imgs_per_page = get_imgs_per_page(doc)
         # logger.info(f"imgs_per_page: {imgs_per_page}")

-        image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
+        image_info_per_page, junk_img_bojids = get_image_info(
+            doc, page_width_pts, page_height_pts
+        )
         # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
         text_len_per_page = get_pdf_textlen_per_page(doc)
         # logger.info(f"text_len_per_page: {text_len_per_page}")
@@ -341,20 +365,20 @@ def pdf_meta_scan(pdf_bytes: bytes):
         # finally emit one json record
         res = {
-            "is_needs_password": is_needs_password,
-            "is_encrypted": is_encrypted,
-            "total_page": total_page,
-            "page_width_pts": int(page_width_pts),
-            "page_height_pts": int(page_height_pts),
-            "image_info_per_page": image_info_per_page,
-            "text_len_per_page": text_len_per_page,
-            "text_layout_per_page": text_layout_per_page,
-            "text_language": text_language,
+            'is_needs_password': is_needs_password,
+            'is_encrypted': is_encrypted,
+            'total_page': total_page,
+            'page_width_pts': int(page_width_pts),
+            'page_height_pts': int(page_height_pts),
+            'image_info_per_page': image_info_per_page,
+            'text_len_per_page': text_len_per_page,
+            'text_layout_per_page': text_layout_per_page,
+            'text_language': text_language,
             # "svgs_per_page": svgs_per_page,
-            "imgs_per_page": imgs_per_page,  # added: list of per-page image counts
-            "junk_img_bojids": junk_img_bojids,  # added: list of junk-image bojids
-            "invalid_chars": invalid_chars,
-            "metadata": doc.metadata
+            'imgs_per_page': imgs_per_page,  # added: list of per-page image counts
+            'junk_img_bojids': junk_img_bojids,  # added: list of junk-image bojids
+            'invalid_chars': invalid_chars,
+            'metadata': doc.metadata,
         }
         # logger.info(json.dumps(res, ensure_ascii=False))
         return res
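A minimal usage sketch for the scan entry point (hypothetical local file; the keys are the ones in the `res` dict above):

```python
with open('example.pdf', 'rb') as f:  # hypothetical input file
    meta = pdf_meta_scan(f.read())

if not meta.get('_need_drop'):
    print(meta['total_page'], meta['text_language'])
    # pages with very little extractable text are likely scanned pages
    short_pages = [i for i, n in enumerate(meta['text_len_per_page']) if n < 100]
```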
@@ -364,14 +388,12 @@ def pdf_meta_scan(pdf_bytes: bytes):
 @click.option('--s3-pdf-path', help='path of the pdf file on s3')
 @click.option('--s3-profile', help='s3 profile to use')
 def main(s3_pdf_path: str, s3_profile: str):
-    """
-    """
+    """"""
     try:
         file_content = read_file(s3_pdf_path, s3_profile)
         pdf_meta_scan(file_content)
     except Exception as e:
-        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
+        print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
         logger.exception(e)
@@ -381,7 +403,7 @@ if __name__ == '__main__':
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
     # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
-    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
+    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")  # noqa: E501
     # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
     # doc = fitz.open("pdf", file_content)
     # text_layout_lst = get_pdf_text_layout_per_page(doc)
...
@@ -5,14 +5,13 @@ from pathlib import Path
 from loguru import logger

 import magic_pdf.model as model_config
+from magic_pdf.config.ocr_content_type import BlockType, ContentType
+from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
 from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
                                              ElementRelation, ElementRelType,
                                              LayoutElements,
                                              LayoutElementsExtra, PageInfo)
-from magic_pdf.libs.ocr_content_type import BlockType, ContentType
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.tools.common import do_parse, prepare_env
@@ -224,8 +223,8 @@ def inference(path, output_dir, method):
                  str(Path(path).stem), method)

 def read_fn(path):
-    disk_rw = DiskReaderWriter(os.path.dirname(path))
-    return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
+    disk_rw = FileBasedDataReader(os.path.dirname(path))
+    return disk_rw.read(os.path.basename(path))

 def parse_doc(doc_path: str):
     try:
...
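For callers migrating off the removed `DiskReaderWriter`, note that `FileBasedDataReader.read` returns the bytes directly, so the explicit `AbsReaderWriter.MODE_BIN` argument disappears. A minimal sketch (hypothetical path):

```python
import os

from magic_pdf.data.data_reader_writer import FileBasedDataReader

path = '/tmp/example.pdf'  # hypothetical file
reader = FileBasedDataReader(os.path.dirname(path))
pdf_bytes = reader.read(os.path.basename(path))  # raw bytes, no mode flag needed
```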
class MakeMode:
    MM_MD = "mm_markdown"
    NLP_MD = "nlp_markdown"
    STANDARD_FORMAT = "standard_format"


class DropMode:
    WHOLE_PDF = "whole_pdf"
    SINGLE_PAGE = "single_page"
    NONE = "none"
    NONE_WITH_REASON = "none_with_reason"
@@ -5,7 +5,7 @@ import os
 from loguru import logger

-from magic_pdf.libs.Constants import MODEL_NAME
+from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.libs.commons import parse_bucket_key

 # config file name constant
@@ -99,7 +99,7 @@ def get_table_recog_config():

 def get_layout_config():
     config = read_config()
-    layout_config = config.get("layout-config")
+    layout_config = config.get('layout-config')
     if layout_config is None:
         logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
         return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
@@ -109,7 +109,7 @@ def get_layout_config():

 def get_formula_config():
     config = read_config()
-    formula_config = config.get("formula-config")
+    formula_config = config.get('formula-config')
     if formula_config is None:
         logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
         return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
@@ -117,5 +117,5 @@ def get_formula_config():
     return formula_config

-if __name__ == "__main__":
-    ak, sk, endpoint = get_s3_config("llm-raw")
+if __name__ == '__main__':
+    ak, sk, endpoint = get_s3_config('llm-raw')
+from magic_pdf.config.constants import CROSS_PAGE
+from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
+                                               ContentType)
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.Constants import CROSS_PAGE
-from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
 from magic_pdf.model.magic_model import MagicModel
...
class DropReason:
    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap"  # text blocks overlap horizontally, so the text order cannot be located reliably
    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap"  # blocks that must be kept overlap horizontally
    COMPLICATED_LAYOUT = "complicated_layout"  # the layout is too complex and not supported yet
    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns"  # layouts with more than 2 columns are not supported yet
    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box"  # the PDF contains colored blocks that change the reading order; text blocks on a colored background are not supported yet
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs"  # dropped because special images make the computation too expensive
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs"  # dropped because special SVG images make the computation too expensive
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages"  # the computation exceeds the budget of the current method
    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result"  # layout analysis failed
    Exception = "_exception"  # an exception occurred during parsing
    ENCRYPTED = "encrypted"  # the PDF is encrypted
    EMPTY_PDF = "total_page=0"  # the PDF has zero pages
    NOT_IS_TEXT_PDF = "not_is_text_pdf"  # not a text-based PDF, cannot be parsed directly
    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block"  # paragraphs cannot be separated cleanly
    TITLE_DETECTION_FAILED = "title_detection_failed"  # title detection failed
    TITLE_LEVEL_FAILED = "title_level_failed"  # title level analysis failed (e.g. level 1/2/3 headings)
    PARA_SPLIT_FAILED = "para_split_failed"  # paragraph detection failed
    PARA_MERGE_FAILED = "para_merge_failed"  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = "not_allow_language"  # unsupported language
    SPECIAL_PDF = "special_pdf"
    PSEUDO_SINGLE_COLUMN = "pseudo_single_column"  # the column layout cannot be determined precisely
    CAN_NOT_DETECT_PAGE_LAYOUT = "can_not_detect_page_layout"  # the page layout cannot be analyzed
    NEGATIVE_BBOX_AREA = "negative_bbox_area"  # scaling produced a negative bbox area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation"  # overlapping blocks cannot be separated
\ No newline at end of file