Merge pull request #1120 from opendatalab/release-0.10.2

Release 0.10.2

Merge pull request #1120 from opendatalab/release-0.10.2
Release 0.10.2
8afff9ae · Xiaomeng Zhao · GitHub · 4df1eb74 · 7fdbb6e5 · 8afff9ae
Unverified commit 8afff9ae, authored Nov 27, 2024 by Xiaomeng Zhao; committed by GitHub on Nov 27, 2024
20 changed files
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
 import enum
-import json

 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                    bbox_relative_pos, box_area, calculate_iou,
                                    calculate_overlap_area_in_bbox1_area_ratio,
                                    get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
    def get_model_list(self, page_no):
        return self.__model_list[page_no]

-
-if __name__ == '__main__':
-    drw = FileBasedDataReader(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path)
-        model_json_txt = drw.read(model_file_path).decode()
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-
-    if 1:
-        from magic_pdf.data.dataset import PymuDocDataset
-
-        model_list = json.loads(
-            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
-        )
-        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
-
-        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
-        for i in range(7):
-            print(magic_model.get_imgs(i))
--- a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
-import math
-
 import numpy as np
 from loguru import logger

@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list):
        if len(box_ocr_res) == 2:
            p1, p2, p3, p4 = box_ocr_res[0]
            text, score = box_ocr_res[1]
+            # logger.info(f"text: {text}, score: {score}")
+            if score < 0.6:  # 过滤低置信度的结果
+                continue
        else:
            p1, p2, p3, p4 = box_ocr_res
            text, score = "", 1
@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
    return ocr_result_list


-def calculate_angle_degrees(poly):
-    # 定义对角线的顶点
-    diagonal1 = (poly[0], poly[2])
-    diagonal2 = (poly[1], poly[3])
-
-    # 计算对角线的斜率
-    def slope(p1, p2):
-        return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
-
-    slope1 = slope(diagonal1[0], diagonal1[1])
-    slope2 = slope(diagonal2[0], diagonal2[1])
-
-    # 计算对角线与x轴的夹角（以弧度为单位）
-    angle1_radians = math.atan(slope1)
-    angle2_radians = math.atan(slope2)
-
-    # 将弧度转换为角度
-    angle1_degrees = math.degrees(angle1_radians)
-    angle2_degrees = math.degrees(angle2_radians)
-
-    # 取两条对角线与x轴夹角的平均值
-    average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
-    # logger.info(f"average_angle_degrees: {average_angle_degrees}")
-    return average_angle_degrees
-
-
 def calculate_is_angle(poly):
    p1, p2, p3, p4 = poly
    height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2

--- a/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):

        if det and rec:
            ocr_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                img = preprocess_image(img)
                dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
                if not dt_boxes and not rec_res:
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
            return ocr_res
        elif det and not rec:
            ocr_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                img = preprocess_image(img)
                dt_boxes, elapse = self.text_detector(img)
                if dt_boxes is None:
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
        else:
            ocr_res = []
            cls_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                if not isinstance(img, list):
                    img = preprocess_image(img)
                    img = [img]

--- a/magic_pdf/para/block_continuation_processor.py
+++ b/magic_pdf/para/block_continuation_processor.py
-import os
-import unicodedata
-
-from magic_pdf.para.commons import *
-
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-class BlockContinuationProcessor:
-    """
-    This class is used to process the blocks to detect block continuations.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
-        """
-        This function checks if the two font types are similar.
-        Definition of similar font types: the two font types have a common prefix,
-        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
-
-        Parameters
-        ----------
-        font_type1 : str
-            font type 1
-        font_type2 : str
-            font type 2
-        prefix_length_ratio : float
-            minimum ratio of the common prefix length to the length of the shorter font type
-
-        Returns
-        -------
-        bool
-            True if the two font types are similar, False otherwise.
-        """
-
-        if isinstance(font_type1, list):
-            font_type1 = font_type1[0] if font_type1 else ""
-        if isinstance(font_type2, list):
-            font_type2 = font_type2[0] if font_type2 else ""
-
-        if font_type1 == font_type2:
-            return True
-
-        # Find the length of the common prefix
-        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
-
-        # Calculate the minimum prefix length based on the ratio
-        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)
-
-        return common_prefix_length >= min_prefix_length
-
-    def __is_same_block_font(self, block1, block2):
-        """
-        This function compares the font of block1 and block2
-
-        Parameters
-        ----------
-        block1 : dict
-            block1
-        block2 : dict
-            block2
-
-        Returns
-        -------
-        is_same : bool
-            True if block1 and block2 have the same font, else False
-        """
-        block_1_font_type = safe_get(block1, "block_font_type", "")
-        block_1_font_size = safe_get(block1, "block_font_size", 0)
-        block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)
-
-        block_2_font_type = safe_get(block2, "block_font_type", "")
-        block_2_font_size = safe_get(block2, "block_font_size", 0)
-        block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)
-
-        if isinstance(block_1_font_size, list):
-            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
-        if isinstance(block_2_font_size, list):
-            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
-
-        block_1_text = safe_get(block1, "text", "")
-        block_2_text = safe_get(block2, "text", "")
-
-        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
-            return False
-
-        if not block_1_text or not block_2_text:
-            return False
-        else:
-            text_len_ratio = len(block_2_text) / len(block_1_text)
-            if text_len_ratio < 0.2:
-                avg_char_width_condition = (
-                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
-                    < 0.5
-                )
-            else:
-                avg_char_width_condition = (
-                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
-                    < 0.2
-                )
-
-        block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1
-
-        return (
-            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
-            and avg_char_width_condition
-            and block_font_size_condtion
-        )
-
-    def _is_alphabet_char(self, char):
-        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
-            return True
-        else:
-            return False
-
-    def _is_chinese_char(self, char):
-        if char >= "\u4e00" and char <= "\u9fa5":
-            return True
-        else:
-            return False
-
-    def _is_other_letter_char(self, char):
-        try:
-            cat = unicodedata.category(char)
-            if cat == "Lu" or cat == "Ll":
-                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
-        except TypeError:
-            print("The input to the function must be a single character.")
-        return False
-
-    def _is_year(self, s: str):
-        try:
-            number = int(s)
-            return 1900 <= number <= 2099
-        except ValueError:
-            return False
-
-    def __is_para_font_consistent(self, para_1, para_2):
-        """
-        This function compares the font of para1 and para2
-
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 have the same font, else False
-        """
-        if para_1 is None or para_2 is None:
-            return False
-
-        para_1_font_type = safe_get(para_1, "para_font_type", "")
-        para_1_font_size = safe_get(para_1, "para_font_size", 0)
-        para_1_font_color = safe_get(para_1, "para_font_color", "")
-
-        para_2_font_type = safe_get(para_2, "para_font_type", "")
-        para_2_font_size = safe_get(para_2, "para_font_size", 0)
-        para_2_font_color = safe_get(para_2, "para_font_color", "")
-
-        if isinstance(para_1_font_type, list):  # get the most common font type
-            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
-        if isinstance(para_2_font_type, list):
-            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
-        if isinstance(para_1_font_size, list):  # compute average font type
-            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
-        if isinstance(para_2_font_size, list):  # compute average font type
-            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
-
-        return (
-            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
-            and abs(para_1_font_size - para_2_font_size) < 1.5
-            # and para_font_color1 == para_font_color2
-        )
-
-    def _is_para_puncs_consistent(self, para_1, para_2):
-        """
-        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
-
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 are from the same paragraph by using the puncs, else False
-        """
-        para_1_text = safe_get(para_1, "para_text", "").strip()
-        para_2_text = safe_get(para_2, "para_text", "").strip()
-
-        para_1_bboxes = safe_get(para_1, "para_bbox", [])
-        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
-
-        para_2_bboxes = safe_get(para_2, "para_bbox", [])
-        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
-
-        # print_yellow("    Features of determine puncs_consistent:")
-        # print(f"    para_1_text: {para_1_text}")
-        # print(f"    para_2_text: {para_2_text}")
-        # print(f"    para_1_bboxes: {para_1_bboxes}")
-        # print(f"    para_2_bboxes: {para_2_bboxes}")
-        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
-        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
-
-        if is_nested_list(para_1_bboxes):
-            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
-        else:
-            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
-
-        if is_nested_list(para_2_bboxes):
-            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
-            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
-        else:
-            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
-
-        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
-        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
-
-        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
-        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
-        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
-
-        # Check if either para_text1 or para_text2 is empty
-        if not para_1_text or not para_2_text:
-            return False
-
-        # Define the end puncs for a sentence to end and hyphen
-        end_puncs = [".", "?", "!", "。", "？", "！", "…"]
-        hyphen = ["-", "—"]
-
-        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
-        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
-        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
-        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
-        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
-
-        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
-        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
-        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
-        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
-
-        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
-            # print_red(f"para_1 is end with hyphen.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] in hyphen
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-
-        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
-            # print_red(f"para_1 is end with end_punc.")
-            para_2_is_consistent = (
-                para_2_text
-                and (
-                    para_2_text[0] == " "
-                    or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
-                    or (self._is_chinese_char(para_2_text[0]))
-                    or (self._is_other_letter_char(para_2_text[0]))
-                )
-                and not is_para2_left_indent_than_papa1
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-
-        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
-            # print_red(f"para_1 is NOT end with end_punc.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] == " "
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_alphabet_char(para_2_text[0]))
-                or (self._is_year(para_2_text[0:4]))
-                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-
-        elif para_1_end_with_space:  # If para_text1 ends with space
-            # print_red(f"para_1 is end with space.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] == " "
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                pass
-                # print(f"para_2 is not consistent.\n")
-
-        return False
-
-    def _is_block_consistent(self, block1, block2):
-        """
-        This function determines whether block1 and block2 are originally from the same block
-
-        Parameters
-        ----------
-        block1 : dict
-            block1s
-        block2 : dict
-            block2
-
-        Returns
-        -------
-        is_same : bool
-            True if block1 and block2 are from the same block, else False
-        """
-        return self.__is_same_block_font(block1, block2)
-
-    def _is_para_continued(self, para1, para2):
-        """
-        This function determines whether para1 and para2 are originally from the same paragraph
-
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 are from the same paragraph, else False
-        """
-        is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
-        is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)
-
-        return is_para_font_consistent and is_para_puncs_consistent
-
-    def _are_boundaries_of_block_consistent(self, block1, block2):
-        """
-        This function checks if the boundaries of block1 and block2 are consistent
-
-        Parameters
-        ----------
-        block1 : dict
-            block1
-
-        block2 : dict
-            block2
-
-        Returns
-        -------
-        is_consistent : bool
-            True if the boundaries of block1 and block2 are consistent, else False
-        """
-
-        last_line_of_block1 = block1["lines"][-1]
-        first_line_of_block2 = block2["lines"][0]
-
-        spans_of_last_line_of_block1 = last_line_of_block1["spans"]
-        spans_of_first_line_of_block2 = first_line_of_block2["spans"]
-
-        font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
-        font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
-        font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
-        font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]
-
-        font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
-        font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
-        font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
-        font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]
-
-        return (
-            self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
-            and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
-            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
-            and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
-        )
-
-    def _get_last_paragraph(self, block):
-        """
-        Retrieves the last paragraph from a block.
-
-        Parameters
-        ----------
-        block : dict
-            The block from which to retrieve the paragraph.
-
-        Returns
-        -------
-        dict
-            The last paragraph of the block.
-        """
-        if block["paras"]:
-            last_para_key = list(block["paras"].keys())[-1]
-            return block["paras"][last_para_key]
-        else:
-            return None
-
-    def _get_first_paragraph(self, block):
-        """
-        Retrieves the first paragraph from a block.
-
-        Parameters
-        ----------
-        block : dict
-            The block from which to retrieve the paragraph.
-
-        Returns
-        -------
-        dict
-            The first paragraph of the block.
-        """
-        if block["paras"]:
-            first_para_key = list(block["paras"].keys())[0]
-            return block["paras"][first_para_key]
-        else:
-            return None
-
-    def should_merge_next_para(self, curr_para, next_para):
-        if self._is_para_continued(curr_para, next_para):
-            return True
-        else:
-            return False
-
-    def batch_tag_paras(self, pdf_dict):
-        the_last_page_id = len(pdf_dict) - 1
-
-        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
-            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
-                para_blocks_of_curr_page = curr_page_content["para_blocks"]
-                next_page_idx = curr_page_idx + 1
-                next_page_id = f"page_{next_page_idx}"
-                next_page_content = pdf_dict.get(next_page_id, {})
-
-                for i, current_block in enumerate(para_blocks_of_curr_page):
-                    for para_id, curr_para in current_block["paras"].items():
-                        curr_para["curr_para_location"] = [
-                            curr_page_idx,
-                            current_block["block_id"],
-                            int(para_id.split("_")[-1]),
-                        ]
-                        curr_para["next_para_location"] = None  # 默认设置为None
-                        curr_para["merge_next_para"] = False  # 默认设置为False
-
-                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
-
-                    if next_block:
-                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
-                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
-
-                        next_block_first_para_key = list(next_block["paras"].keys())[0]
-                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
-
-                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
-                            curr_blk_last_para["next_para_location"] = [
-                                curr_page_idx,
-                                next_block["block_id"],
-                                int(next_block_first_para_key.split("_")[-1]),
-                            ]
-                            curr_blk_last_para["merge_next_para"] = True
-                    else:
-                        # Handle the case where the next block is in a different page
-                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
-                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
-
-                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
-                            next_page_idx += 1
-                            next_page_id = f"page_{next_page_idx}"
-                            next_page_content = pdf_dict.get(next_page_id, {})
-
-                        if next_page_content.get("para_blocks", []):
-                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
-                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
-
-                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
-                                curr_blk_last_para["next_para_location"] = [
-                                    next_page_idx,
-                                    next_page_content["para_blocks"][0]["block_id"],
-                                    int(next_blk_first_para_key.split("_")[-1]),
-                                ]
-                                curr_blk_last_para["merge_next_para"] = True
-
-        return pdf_dict
-
-    def find_block_by_id(self, para_blocks, block_id):
-        for block in para_blocks:
-            if block.get("block_id") == block_id:
-                return block
-        return None
-
-    def batch_merge_paras(self, pdf_dict):
-        for page_id, page_content in pdf_dict.items():
-            if page_id.startswith("page_") and page_content.get("para_blocks", []):
-                para_blocks_of_page = page_content["para_blocks"]
-
-                for i in range(len(para_blocks_of_page)):
-                    current_block = para_blocks_of_page[i]
-                    paras = current_block["paras"]
-
-                    for para_id, curr_para in list(paras.items()):
-                        # 跳过标题段落
-                        if curr_para.get("is_para_title"):
-                            continue
-
-                        while curr_para.get("merge_next_para"):
-                            next_para_location = curr_para.get("next_para_location")
-                            if not next_para_location:
-                                break
-
-                            next_page_idx, next_block_id, next_para_id = next_para_location
-                            next_page_id = f"page_{next_page_idx}"
-                            next_page_content = pdf_dict.get(next_page_id)
-                            if not next_page_content:
-                                break
-
-                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
-                            if not next_block:
-                                break
-
-                            next_para = next_block["paras"].get(f"para_{next_para_id}")
-                            if not next_para or next_para.get("is_para_title"):
-                                break
-
-                            # 合并段落文本
-                            curr_para_text = curr_para.get("para_text", "")
-                            next_para_text = next_para.get("para_text", "")
-                            curr_para["para_text"] = curr_para_text + " " + next_para_text
-
-                            # 更新 next_para_location
-                            curr_para["next_para_location"] = next_para.get("next_para_location")
-
-                            # 将下一个段落文本置为空，表示已被合并
-                            next_para["para_text"] = ""
-
-                            # 更新 merge_next_para 标记
-                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
-
-        return pdf_dict
--- a/magic_pdf/para/block_termination_processor.py
+++ b/magic_pdf/para/block_termination_processor.py
-from magic_pdf.para.commons import *
-
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-
-class BlockTerminationProcessor:
-    def __init__(self) -> None:
-        pass
-
-    def _is_consistent_lines(
-        self,
-        curr_line,
-        prev_line,
-        next_line,
-        consistent_direction,  # 0 for prev, 1 for next, 2 for both
-    ):
-        """
-        This function checks if the line is consistent with its neighbors
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line
-        next_line : dict
-            next line
-        consistent_direction : int
-            0 for prev, 1 for next, 2 for both
-
-        Returns
-        -------
-        bool
-            True if the line is consistent with its neighbors, False otherwise.
-        """
-
-        curr_line_font_size = curr_line["spans"][0]["size"]
-        curr_line_font_type = curr_line["spans"][0]["font"].lower()
-
-        if consistent_direction == 0:
-            if prev_line:
-                prev_line_font_size = prev_line["spans"][0]["size"]
-                prev_line_font_type = prev_line["spans"][0]["font"].lower()
-                return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
-            else:
-                return False
-
-        elif consistent_direction == 1:
-            if next_line:
-                next_line_font_size = next_line["spans"][0]["size"]
-                next_line_font_type = next_line["spans"][0]["font"].lower()
-                return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
-            else:
-                return False
-
-        elif consistent_direction == 2:
-            if prev_line and next_line:
-                prev_line_font_size = prev_line["spans"][0]["size"]
-                prev_line_font_type = prev_line["spans"][0]["font"].lower()
-                next_line_font_size = next_line["spans"][0]["size"]
-                next_line_font_type = next_line["spans"][0]["font"].lower()
-                return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
-                    curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
-                )
-            else:
-                return False
-
-        else:
-            return False
-
-    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
-        """
-        This function checks if the line is a regular line
-
-        Parameters
-        ----------
-        curr_line_bbox : list
-            bbox of the current line
-        prev_line_bbox : list
-            bbox of the previous line
-        next_line_bbox : list
-            bbox of the next line
-        avg_char_width : float
-            average of char widths
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_line_height : float
-            average of line heights
-
-        Returns
-        -------
-        bool
-            True if the line is a regular line, False otherwise.
-        """
-        horizontal_ratio = 0.5
-        vertical_ratio = 0.5
-        horizontal_thres = horizontal_ratio * avg_char_width
-        vertical_thres = vertical_ratio * avg_line_height
-
-        x0, y0, x1, y1 = curr_line_bbox
-
-        x0_near_X0 = abs(x0 - X0) < horizontal_thres
-        x1_near_X1 = abs(x1 - X1) < horizontal_thres
-
-        prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
-
-        sufficient_spacing_above = False
-        if prev_line_bbox:
-            vertical_spacing_above = y1 - prev_line_bbox[3]
-            sufficient_spacing_above = vertical_spacing_above > vertical_thres
-
-        sufficient_spacing_below = False
-        if next_line_bbox:
-            vertical_spacing_below = next_line_bbox[1] - y0
-            sufficient_spacing_below = vertical_spacing_below > vertical_thres
-
-        return (
-            (sufficient_spacing_above or sufficient_spacing_below)
-            or (not x0_near_X0 and not x1_near_X1)
-            or prev_line_is_end_of_para
-        )
-
-    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
-        """
-        This function checks if the line is a possible start of a paragraph
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line
-        next_line : dict
-            next line
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_char_width : float
-            average of char widths
-        avg_line_height : float
-            average of line heights
-
-        Returns
-        -------
-        bool
-            True if the line is a possible start of a paragraph, False otherwise.
-        """
-        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
-        decision_path = []  # Record the decision path
-
-        curr_line_bbox = curr_line["bbox"]
-        prev_line_bbox = prev_line["bbox"] if prev_line else None
-        next_line_bbox = next_line["bbox"] if next_line else None
-
-        indent_ratio = 1
-
-        vertical_ratio = 1.5
-        vertical_thres = vertical_ratio * avg_font_size
-
-        left_horizontal_ratio = 0.5
-        left_horizontal_thres = left_horizontal_ratio * avg_char_width
-
-        right_horizontal_ratio = 2.5
-        right_horizontal_thres = right_horizontal_ratio * avg_char_width
-
-        x0, y0, x1, y1 = curr_line_bbox
-
-        indent_condition = x0 > X0 + indent_ratio * avg_char_width
-        if indent_condition:
-            start_confidence += 0.2
-            decision_path.append("indent_condition_met")
-
-        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
-        if x0_near_X0:
-            start_confidence += 0.1
-            decision_path.append("x0_near_X0")
-
-        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
-        if x1_near_X1:
-            start_confidence += 0.1
-            decision_path.append("x1_near_X1")
-
-        if prev_line is None:
-            prev_line_is_end_of_para = True
-            start_confidence += 0.2
-            decision_path.append("no_prev_line")
-        else:
-            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
-            if prev_line_is_end_of_para:
-                start_confidence += 0.1
-                decision_path.append("prev_line_is_end_of_para")
-
-        sufficient_spacing_above = False
-        if prev_line_bbox:
-            vertical_spacing_above = y1 - prev_line_bbox[3]
-            sufficient_spacing_above = vertical_spacing_above > vertical_thres
-            if sufficient_spacing_above:
-                start_confidence += 0.2
-                decision_path.append("sufficient_spacing_above")
-
-        sufficient_spacing_below = False
-        if next_line_bbox:
-            vertical_spacing_below = next_line_bbox[1] - y0
-            sufficient_spacing_below = vertical_spacing_below > vertical_thres
-            if sufficient_spacing_below:
-                start_confidence += 0.2
-                decision_path.append("sufficient_spacing_below")
-
-        is_regular_line = self._is_regular_line(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
-        )
-        if is_regular_line:
-            start_confidence += 0.1
-            decision_path.append("is_regular_line")
-
-        is_start_of_para = (
-            (sufficient_spacing_above or sufficient_spacing_below)
-            or (indent_condition)
-            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
-            or prev_line_is_end_of_para
-        )
-        return (is_start_of_para, start_confidence, decision_path)
-
-    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
-        """
-        This function checks if the line is a possible end of a paragraph
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        next_line : dict
-            next line
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_char_width : float
-            average of char widths
-
-        Returns
-        -------
-        bool
-            True if the line is a possible end of a paragraph, False otherwise.
-        """
-
-        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
-        decision_path = []  # Record the decision path
-
-        curr_line_bbox = curr_line["bbox"]
-        next_line_bbox = next_line["bbox"] if next_line else None
-
-        left_horizontal_ratio = 0.5
-        right_horizontal_ratio = 0.5
-
-        x0, _, x1, y1 = curr_line_bbox
-        next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-
-        x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
-        if x0_near_X0:
-            end_confidence += 0.1
-            decision_path.append("x0_near_X0")
-
-        x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
-        if x1_smaller_than_X1:
-            end_confidence += 0.1
-            decision_path.append("x1_smaller_than_X1")
-
-        next_line_is_start_of_para = (
-            next_line_bbox
-            and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
-            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
-        )
-        if next_line_is_start_of_para:
-            end_confidence += 0.2
-            decision_path.append("next_line_is_start_of_para")
-
-        is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
-            curr_line_bbox, None, next_line_bbox, avg_char_width
-        )
-        if is_line_left_aligned_from_neighbors_bool:
-            end_confidence += 0.1
-            decision_path.append("line_is_left_aligned_from_neighbors")
-
-        is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
-            curr_line_bbox, None, next_line_bbox, avg_char_width
-        )
-        if not is_line_right_aligned_from_neighbors_bool:
-            end_confidence += 0.1
-            decision_path.append("line_is_not_right_aligned_from_neighbors")
-
-        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
-            (x0_near_X0 and x1_smaller_than_X1)
-            or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
-        )
-
-        return (is_end_of_para, end_confidence, decision_path)
-
-    def _cut_paras_per_block(
-        self,
-        block,
-    ):
-        """
-        Processes a raw block from PyMuPDF and returns the processed block.
-
-        Parameters
-        ----------
-        raw_block : dict
-            A raw block from pymupdf.
-
-        Returns
-        -------
-        processed_block : dict
-
-        """
-
-        def _construct_para(lines, is_block_title, para_title_level):
-            """
-            Construct a paragraph from given lines.
-            """
-
-            font_sizes = [span["size"] for line in lines for span in line["spans"]]
-            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
-
-            font_colors = [span["color"] for line in lines for span in line["spans"]]
-            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
-
-            # font_types = [span["font"] for line in lines for span in line["spans"]]
-            # most_common_font_type = max(set(font_types), key=font_types.count) if font_types else None
-
-            font_type_lengths = {}
-            for line in lines:
-                for span in line["spans"]:
-                    font_type = span["font"]
-                    bbox_width = span["bbox"][2] - span["bbox"][0]
-                    if font_type in font_type_lengths:
-                        font_type_lengths[font_type] += bbox_width
-                    else:
-                        font_type_lengths[font_type] = bbox_width
-
-            # get the font type with the longest bbox width
-            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
-
-            para_bbox = calculate_para_bbox(lines)
-            para_text = " ".join(line["text"] for line in lines)
-
-            return {
-                "para_bbox": para_bbox,
-                "para_text": para_text,
-                "para_font_type": most_common_font_type,
-                "para_font_size": avg_font_size,
-                "para_font_color": most_common_font_color,
-                "is_para_title": is_block_title,
-                "para_title_level": para_title_level,
-            }
-
-        block_bbox = block["bbox"]
-        block_text = block["text"]
-        block_lines = block["lines"]
-
-        X0 = safe_get(block, "X0", 0)
-        X1 = safe_get(block, "X1", 0)
-        avg_char_width = safe_get(block, "avg_char_width", 0)
-        avg_char_height = safe_get(block, "avg_char_height", 0)
-        avg_font_size = safe_get(block, "avg_font_size", 0)
-
-        is_block_title = safe_get(block, "is_block_title", False)
-        para_title_level = safe_get(block, "block_title_level", 0)
-
-        # Segment into paragraphs
-        para_ranges = []
-        in_paragraph = False
-        start_idx_of_para = None
-
-        # Create the processed paragraphs
-        processed_paras = {}
-        para_bboxes = []
-        end_idx_of_para = 0
-
-        for line_index, line in enumerate(block_lines):
-            curr_line = line
-            prev_line = block_lines[line_index - 1] if line_index > 0 else None
-            next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
-
-            """
-            Start processing paragraphs.
-            """
-
-            # Check if the line is the start of a paragraph
-            is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
-                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
-            )
-            if not in_paragraph and is_start_of_para:
-                in_paragraph = True
-                start_idx_of_para = line_index
-
-                # print_green(">>> Start of a paragraph")
-                # print("    curr_line_text: ", curr_line["text"])
-                # print("    start_confidence: ", start_confidence)
-                # print("    decision_path: ", decision_path)
-
-            # Check if the line is the end of a paragraph
-            is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
-                curr_line, next_line, X0, X1, avg_char_width
-            )
-            if in_paragraph and (is_end_of_para or not next_line):
-                para_ranges.append((start_idx_of_para, line_index))
-                start_idx_of_para = None
-                in_paragraph = False
-
-                # print_red(">>> End of a paragraph")
-                # print("    curr_line_text: ", curr_line["text"])
-                # print("    end_confidence: ", end_confidence)
-                # print("    decision_path: ", decision_path)
-
-        # Add the last paragraph if it is not added
-        if in_paragraph and start_idx_of_para is not None:
-            para_ranges.append((start_idx_of_para, len(block_lines) - 1))
-
-        # Process the matched paragraphs
-        for para_index, (start_idx, end_idx) in enumerate(para_ranges):
-            matched_lines = block_lines[start_idx : end_idx + 1]
-            para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
-            para_key = f"para_{len(processed_paras)}"
-            processed_paras[para_key] = para_properties
-            para_bboxes.append(para_properties["para_bbox"])
-            end_idx_of_para = end_idx + 1
-
-        # Deal with the remaining lines
-        if end_idx_of_para < len(block_lines):
-            unmatched_lines = block_lines[end_idx_of_para:]
-            unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
-            unmatched_key = f"para_{len(processed_paras)}"
-            processed_paras[unmatched_key] = unmatched_properties
-            para_bboxes.append(unmatched_properties["para_bbox"])
-
-        block["paras"] = processed_paras
-
-        return block
-
-    def batch_process_blocks(self, pdf_dict):
-        """
-        Parses the blocks of all pages.
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary.
-        filter_blocks : list
-            List of bounding boxes to filter.
-
-        Returns
-        -------
-        result_dict : dict
-            Result dictionary.
-
-        """
-
-        num_paras = 0
-
-        for page_id, page in pdf_dict.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in page.keys():
-                    input_blocks = page["para_blocks"]
-                    for input_block in input_blocks:
-                        new_block = self._cut_paras_per_block(input_block)
-                        para_blocks.append(new_block)
-                        num_paras += len(new_block["paras"])
-
-                page["para_blocks"] = para_blocks
-
-        pdf_dict["statistics"]["num_paras"] = num_paras
-        return pdf_dict
--- a/magic_pdf/para/commons.py
+++ b/magic_pdf/para/commons.py
-import sys
-
-from magic_pdf.libs.commons import fitz
-from termcolor import cprint
-
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-def open_pdf(pdf_path):
-    try:
-        pdf_document = fitz.open(pdf_path)  # type: ignore
-        return pdf_document
-    except Exception as e:
-        print(f"无法打开PDF文件：{pdf_path}。原因是：{e}")
-        raise e
-
-
-def print_green_on_red(text):
-    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
-
-
-def print_green(text):
-    print()
-    cprint(text, "green", attrs=["bold"], end="\n\n")
-
-
-def print_red(text):
-    print()
-    cprint(text, "red", attrs=["bold"], end="\n\n")
-
-
-def print_yellow(text):
-    print()
-    cprint(text, "yellow", attrs=["bold"], end="\n\n")
-
-
-def safe_get(dict_obj, key, default):
-    val = dict_obj.get(key)
-    if val is None:
-        return default
-    else:
-        return val
-
-
-def is_bbox_overlap(bbox1, bbox2):
-    """
-    This function checks if bbox1 and bbox2 overlap or not
-
-    Parameters
-    ----------
-    bbox1 : list
-        bbox1
-    bbox2 : list
-        bbox2
-
-    Returns
-    -------
-    bool
-        True if bbox1 and bbox2 overlap, else False
-    """
-    x0_1, y0_1, x1_1, y1_1 = bbox1
-    x0_2, y0_2, x1_2, y1_2 = bbox2
-
-    if x0_1 > x1_2 or x0_2 > x1_1:
-        return False
-    if y0_1 > y1_2 or y0_2 > y1_1:
-        return False
-
-    return True
-
-
-def is_in_bbox(bbox1, bbox2):
-    """
-    This function checks if bbox1 is in bbox2
-
-    Parameters
-    ----------
-    bbox1 : list
-        bbox1
-    bbox2 : list
-        bbox2
-
-    Returns
-    -------
-    bool
-        True if bbox1 is in bbox2, else False
-    """
-    x0_1, y0_1, x1_1, y1_1 = bbox1
-    x0_2, y0_2, x1_2, y1_2 = bbox2
-
-    if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
-        return True
-    else:
-        return False
-
-
-def calculate_para_bbox(lines):
-    """
-    This function calculates the minimum bbox of the paragraph
-
-    Parameters
-    ----------
-    lines : list
-        lines
-
-    Returns
-    -------
-    para_bbox : list
-        bbox of the paragraph
-    """
-    x0 = min(line["bbox"][0] for line in lines)
-    y0 = min(line["bbox"][1] for line in lines)
-    x1 = max(line["bbox"][2] for line in lines)
-    y1 = max(line["bbox"][3] for line in lines)
-    return [x0, y0, x1, y1]
-
-
-def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    This function checks if the line is right aligned from its neighbors
-
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line
-    next_line_bbox : list
-        bbox of the next line
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 for prev, 1 for next, 2 for both
-
-    Returns
-    -------
-    bool
-        True if the line is right aligned from its neighbors, False otherwise.
-    """
-    horizontal_ratio = 0.5
-    horizontal_thres = horizontal_ratio * avg_char_width
-
-    _, _, x1, _ = curr_line_bbox
-    _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
-    _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-
-    if direction == 0:
-        return abs(x1 - prev_x1) < horizontal_thres
-    elif direction == 1:
-        return abs(x1 - next_x1) < horizontal_thres
-    elif direction == 2:
-        return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
-    else:
-        return False
-
-
-def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    This function checks if the line is left aligned from its neighbors
-
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line
-    next_line_bbox : list
-        bbox of the next line
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 for prev, 1 for next, 2 for both
-
-    Returns
-    -------
-    bool
-        True if the line is left aligned from its neighbors, False otherwise.
-    """
-    horizontal_ratio = 0.5
-    horizontal_thres = horizontal_ratio * avg_char_width
-
-    x0, _, _, _ = curr_line_bbox
-    prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
-    next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-
-    if direction == 0:
-        return abs(x0 - prev_x0) < horizontal_thres
-    elif direction == 1:
-        return abs(x0 - next_x0) < horizontal_thres
-    elif direction == 2:
-        return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
-    else:
-        return False
-
-
-def end_with_punctuation(line_text):
-    """
-    This function checks if the line ends with punctuation marks
-    """
-
-    english_end_puncs = [".", "?", "!"]
-    chinese_end_puncs = ["。", "？", "！"]
-    end_puncs = english_end_puncs + chinese_end_puncs
-
-    last_non_space_char = None
-    for ch in line_text[::-1]:
-        if not ch.isspace():
-            last_non_space_char = ch
-            break
-
-    if last_non_space_char is None:
-        return False
-
-    return last_non_space_char in end_puncs
-
-
-def is_nested_list(lst):
-    if isinstance(lst, list):
-        return any(isinstance(sub, list) for sub in lst)
-    return False
--- a/magic_pdf/para/denoise.py
+++ b/magic_pdf/para/denoise.py
-import math
-
-from collections import defaultdict
-from magic_pdf.para.commons import *
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-class HeaderFooterProcessor:
-    def __init__(self) -> None:
-        pass
-
-    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
-        """
-        This function gets the most common bboxes from the bboxes
-
-        Parameters
-        ----------
-        bboxes : list
-            bboxes
-        page_height : float
-            height of the page
-        position : str, optional
-            "top" or "bottom", by default "top"
-        threshold : float, optional
-            threshold, by default 0.25
-        num_bboxes : int, optional
-            number of bboxes to return, by default 3
-        min_frequency : int, optional
-            minimum frequency of the bbox, by default 2
-
-        Returns
-        -------
-        common_bboxes : list
-            common bboxes
-        """
-        # Filter bbox by position
-        if position == "top":
-            filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
-        else:
-            filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
-
-        # Find the most common bbox
-        bbox_count = defaultdict(int)
-        for bbox in filtered_bboxes:
-            bbox_count[tuple(bbox)] += 1
-
-        # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
-        common_bboxes = [
-            bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
-        ][:num_bboxes]
-        return common_bboxes
-
-    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
-        """
-        This function detects the header and footer of the document.
-
-        Parameters
-        ----------
-        result_dict : dict
-            result dictionary
-
-        Returns
-        -------
-        result_dict : dict
-            result dictionary
-        """
-
-        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
-            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
-
-        def is_single_line_block(block):
-            # Determine based on the width and height of the block
-            block_width = block["X1"] - block["X0"]
-            block_height = block["bbox"][3] - block["bbox"][1]
-
-            # If the height of the block is close to the average character height and the width is large, it is considered a single line
-            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
-
-        # Traverse all blocks in the document
-        single_preproc_blocks = 0
-        total_blocks = 0
-        single_preproc_blocks = 0
-
-        for page_id, blocks in result_dict.items():
-            if page_id.startswith("page_"):
-                for block_key, block in blocks.items():
-                    if block_key.startswith("block_"):
-                        total_blocks += 1
-                        if is_single_line_block(block):
-                            single_preproc_blocks += 1
-
-        # If there are no blocks, skip the header and footer detection
-        if total_blocks == 0:
-            print("No blocks found. Skipping header/footer detection.")
-            return result_dict
-
-        # If most of the blocks are single-line, skip the header and footer detection
-        if single_preproc_blocks / total_blocks > 0.5:  # 50% of the blocks are single-line
-            return result_dict
-
-        # Collect the bounding boxes of all blocks
-        all_bboxes = []
-        all_texts = []
-
-        for page_id, blocks in result_dict.items():
-            if page_id.startswith("page_"):
-                for block_key, block in blocks.items():
-                    if block_key.startswith("block_"):
-                        all_bboxes.append(block["bbox"])
-
-        # Get the height of the page
-        page_height = max(bbox[3] for bbox in all_bboxes)
-
-        # Get the most common bbox lists for headers and footers
-        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
-        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
-
-        # Detect and mark headers and footers
-        for page_id, blocks in result_dict.items():
-            if page_id.startswith("page_"):
-                for block_key, block in blocks.items():
-                    if block_key.startswith("block_"):
-                        bbox = block["bbox"]
-                        text = block["text"]
-
-                        is_header = compare_bbox_with_list(bbox, common_header_bboxes)
-                        is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
-
-                        block["is_header"] = int(is_header)
-                        block["is_footer"] = int(is_footer)
-
-        return result_dict
-
-
-class NonHorizontalTextProcessor:
-    def __init__(self) -> None:
-        pass
-
-    def detect_non_horizontal_texts(self, result_dict):
-        """
-        This function detects watermarks and vertical margin notes in the document.
-
-        Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
-        If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
-        If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
-
-        Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
-        If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
-        If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
-
-
-        Parameters
-        ----------
-        result_dict : dict
-            The result dictionary.
-
-        Returns
-        -------
-        result_dict : dict
-            The updated result dictionary.
-        """
-        # Dictionary to store information about potential watermarks
-        potential_watermarks = {}
-        potential_margin_notes = {}
-
-        for page_id, page_content in result_dict.items():
-            if page_id.startswith("page_"):
-                for block_id, block_data in page_content.items():
-                    if block_id.startswith("block_"):
-                        if "dir" in block_data:
-                            coordinates_text = (block_data["bbox"], block_data["text"])  # Tuple of coordinates and text
-
-                            angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
-                            angle = abs(math.degrees(angle))
-
-                            if angle > 5 and angle < 85:  # Check if direction is watermarks
-                                if coordinates_text in potential_watermarks:
-                                    potential_watermarks[coordinates_text] += 1
-                                else:
-                                    potential_watermarks[coordinates_text] = 1
-
-                            if angle > 85 and angle < 105:  # Check if direction is vertical
-                                if coordinates_text in potential_margin_notes:
-                                    potential_margin_notes[coordinates_text] += 1  # Increment count
-                                else:
-                                    potential_margin_notes[coordinates_text] = 1  # Initialize count
-
-        # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
-        watermark_threshold = len(result_dict) // 2
-        watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
-
-        # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
-        margin_note_threshold = len(result_dict) // 2
-        margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
-
-        # Add watermark information to the result dictionary
-        for page_id, blocks in result_dict.items():
-            if page_id.startswith("page_"):
-                for block_id, block_data in blocks.items():
-                    coordinates_text = (block_data["bbox"], block_data["text"])
-                    if coordinates_text in watermarks:
-                        block_data["is_watermark"] = 1
-                    else:
-                        block_data["is_watermark"] = 0
-
-                    if coordinates_text in margin_notes:
-                        block_data["is_vertical_margin_note"] = 1
-                    else:
-                        block_data["is_vertical_margin_note"] = 0
-
-        return result_dict
-
-
-class NoiseRemover:
-    def __init__(self) -> None:
-        pass
-
-    def skip_data_noises(self, result_dict):
-        """
-        This function skips the data noises, including overlap blocks, header, footer, watermark, vertical margin note, title
-        """
-        filtered_result_dict = {}
-        for page_id, blocks in result_dict.items():
-            if page_id.startswith("page_"):
-                filtered_blocks = {}
-                for block_id, block in blocks.items():
-                    if block_id.startswith("block_"):
-                        if any(
-                            block.get(key, 0)
-                            for key in [
-                                "is_overlap",
-                                "is_header",
-                                "is_footer",
-                                "is_watermark",
-                                "is_vertical_margin_note",
-                                "is_block_title",
-                            ]
-                        ):
-                            continue
-                        filtered_blocks[block_id] = block
-                if filtered_blocks:
-                    filtered_result_dict[page_id] = filtered_blocks
-
-        return filtered_result_dict
--- a/magic_pdf/para/draw.py
+++ b/magic_pdf/para/draw.py
-from magic_pdf.libs.commons import fitz
-
-from magic_pdf.para.commons import *
-
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-class DrawAnnos:
-    """
-    This class draws annotations on the pdf file
-
-    ----------------------------------------
-                Color Code
-    ----------------------------------------
-        Red: (1, 0, 0)
-        Green: (0, 1, 0)
-        Blue: (0, 0, 1)
-        Yellow: (1, 1, 0) - mix of red and green
-        Cyan: (0, 1, 1) - mix of green and blue
-        Magenta: (1, 0, 1) - mix of red and blue
-        White: (1, 1, 1) - red, green and blue full intensity
-        Black: (0, 0, 0) - no color component whatsoever
-        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
-        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def __is_nested_list(self, lst):
-        """
-        This function returns True if the given list is a nested list of any degree.
-        """
-        if isinstance(lst, list):
-            return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
-        return False
-
-    def __valid_rect(self, bbox):
-        # Ensure that the rectangle is not empty or invalid
-        if isinstance(bbox[0], list):
-            return False  # It's a nested list, hence it can't be valid rect
-        else:
-            return bbox[0] < bbox[2] and bbox[1] < bbox[3]
-
-    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
-        """
-        This function draws the nested boxes
-
-        Parameters
-        ----------
-        page : fitz.Page
-            page
-        nested_bbox : list
-            nested bbox
-        color : tuple
-            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
-        """
-        if self.__is_nested_list(nested_bbox):  # If it's a nested list
-            for bbox in nested_bbox:
-                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
-        elif self.__valid_rect(nested_bbox):  # If valid rectangle
-            para_rect = fitz.Rect(nested_bbox)
-            para_anno = page.add_rect_annot(para_rect)
-            para_anno.set_colors(stroke=color)  # draw with cyan color for combined paragraph
-            para_anno.set_border(width=1)
-            para_anno.update()
-
-    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
-        pdf_doc = open_pdf(input_pdf_path)
-
-        if pdf_dic is None:
-            pdf_dic = {}
-
-        if output_pdf_path is None:
-            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
-
-        for page_id, page in enumerate(pdf_doc):  # type: ignore
-            page_key = f"page_{page_id}"
-            for ele_key, ele_data in pdf_dic[page_key].items():
-                if ele_key == "para_blocks":
-                    para_blocks = ele_data
-                    for para_block in para_blocks:
-                        if "paras" in para_block.keys():
-                            paras = para_block["paras"]
-                            for para_key, para_content in paras.items():
-                                para_bbox = para_content["para_bbox"]
-                                # print(f"para_bbox: {para_bbox}")
-                                # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
-                                if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
-                                    color = (0, 1, 1)
-                                    self.__draw_nested_boxes(
-                                        page, para_bbox, color
-                                    )  # draw with cyan color for combined paragraph
-                                else:
-                                    if self.__valid_rect(para_bbox):
-                                        para_rect = fitz.Rect(para_bbox)
-                                        para_anno = page.add_rect_annot(para_rect)
-                                        para_anno.set_colors(stroke=(0, 1, 0))  # draw with green color for normal paragraph
-                                        para_anno.set_border(width=0.5)
-                                        para_anno.update()
-
-                                is_para_title = para_content["is_para_title"]
-                                if is_para_title:
-                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
-                                        color = (0, 0, 1)
-                                        self.__draw_nested_boxes(
-                                            page, para_content["para_bbox"], color
-                                        )  # draw with cyan color for combined title
-                                    else:
-                                        if self.__valid_rect(para_content["para_bbox"]):
-                                            para_rect = fitz.Rect(para_content["para_bbox"])
-                                            if self.__valid_rect(para_content["para_bbox"]):
-                                                para_anno = page.add_rect_annot(para_rect)
-                                                para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
-                                                para_anno.set_border(width=0.5)
-                                                para_anno.update()
-
-        pdf_doc.save(output_pdf_path)
-        pdf_doc.close()
--- a/magic_pdf/para/exceptions.py
+++ b/magic_pdf/para/exceptions.py
-class DenseSingleLineBlockException(Exception):
-    """
-    This class defines the exception type for dense single line-block.
-    """
-
-    def __init__(self, message="DenseSingleLineBlockException"):
-        self.message = message
-        super().__init__(self.message)
-
-    def __str__(self):
-        return f"{self.message}"
-
-    def __repr__(self):
-        return f"{self.message}"
-
-
-class TitleDetectionException(Exception):
-    """
-    This class defines the exception type for title detection.
-    """
-
-    def __init__(self, message="TitleDetectionException"):
-        self.message = message
-        super().__init__(self.message)
-
-    def __str__(self):
-        return f"{self.message}"
-
-    def __repr__(self):
-        return f"{self.message}"
-
-
-class TitleLevelException(Exception):
-    """
-    This class defines the exception type for title level.
-    """
-
-    def __init__(self, message="TitleLevelException"):
-        self.message = message
-        super().__init__(self.message)
-
-    def __str__(self):
-        return f"{self.message}"
-
-    def __repr__(self):
-        return f"{self.message}"
-
-
-class ParaSplitException(Exception):
-    """
-    This class defines the exception type for paragraph splitting.
-    """
-
-    def __init__(self, message="ParaSplitException"):
-        self.message = message
-        super().__init__(self.message)
-
-    def __str__(self):
-        return f"{self.message}"
-
-    def __repr__(self):
-        return f"{self.message}"
-
-
-class ParaMergeException(Exception):
-    """
-    This class defines the exception type for paragraph merging.
-    """
-
-    def __init__(self, message="ParaMergeException"):
-        self.message = message
-        super().__init__(self.message)
-
-    def __str__(self):
-        return f"{self.message}"
-
-    def __repr__(self):
-        return f"{self.message}"
-
-
-class DiscardByException:
-    """
-    This class discards pdf files by exception
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
-        """
-        This function discards pdf files by single line block exception
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary
-        exception : str
-            exception message
-
-        Returns
-        -------
-        error_message : str
-        """
-        exception_page_nums = 0
-        page_num = 0
-        for page_id, page in pdf_dic.items():
-            if page_id.startswith("page_"):
-                page_num += 1
-                if "preproc_blocks" in page.keys():
-                    preproc_blocks = page["preproc_blocks"]
-
-                    all_single_line_blocks = []
-                    for block in preproc_blocks:
-                        if len(block["lines"]) == 1:
-                            all_single_line_blocks.append(block)
-
-                    if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
-                        exception_page_nums += 1
-
-        if page_num == 0:
-            return None
-
-        if exception_page_nums / page_num > 0.1:  # Low ratio means basically, whenever this is the case, it is discarded
-            return exception.message
-
-        return None
-
-    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
-        """
-        This function discards pdf files by title detection exception
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary
-        exception : str
-            exception message
-
-        Returns
-        -------
-        error_message : str
-        """
-        # return exception.message
-        return None
-
-    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
-        """
-        This function discards pdf files by title level exception
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary
-        exception : str
-            exception message
-
-        Returns
-        -------
-        error_message : str
-        """
-        # return exception.message
-        return None
-
-    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
-        """
-        This function discards pdf files by split para exception
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary
-        exception : str
-            exception message
-
-        Returns
-        -------
-        error_message : str
-        """
-        # return exception.message
-        return None
-
-    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
-        """
-        This function discards pdf files by merge para exception
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary
-        exception : str
-            exception message
-
-        Returns
-        -------
-        error_message : str
-        """
-        # return exception.message
-        return None
--- a/magic_pdf/para/layout_match_processor.py
+++ b/magic_pdf/para/layout_match_processor.py
-import math
-from magic_pdf.para.commons import *
-
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-class LayoutFilterProcessor:
-    def __init__(self) -> None:
-        pass
-
-    def batch_process_blocks(self, pdf_dict):
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
-                    layout_bbox_objs = blocks["layout_bboxes"]
-                    if layout_bbox_objs is None:
-                        continue
-                    layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
-
-                    # Use math.ceil function to enlarge each value of x0, y0, x1, y1 of each layout_bbox
-                    layout_bboxes = [
-                        [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
-                    ]
-
-                    para_blocks = blocks["para_blocks"]
-                    if para_blocks is None:
-                        continue
-
-                    for lb_bbox in layout_bboxes:
-                        for i, para_block in enumerate(para_blocks):
-                            para_bbox = para_block["bbox"]
-                            para_blocks[i]["in_layout"] = 0
-                            if is_in_bbox(para_bbox, lb_bbox):
-                                para_blocks[i]["in_layout"] = 1
-
-                    blocks["para_blocks"] = para_blocks
-
-        return pdf_dict
--- a/magic_pdf/para/para_split.py
+++ b/magic_pdf/para/para_split.py
-import numpy as np
-from loguru import logger
-from sklearn.cluster import DBSCAN
-
-from magic_pdf.config.ocr_content_type import ContentType
-from magic_pdf.libs.boxbase import \
-    _is_in_or_part_overlap_with_area_ratio as is_in_layout
-
-LINE_STOP_FLAG = ['.', '!', '?', '。', '！', '？', '：', ':', ')', '）', ';']
-INLINE_EQUATION = ContentType.InlineEquation
-INTERLINE_EQUATION = ContentType.InterlineEquation
-TEXT = ContentType.Text
-
-
-def __get_span_text(span):
-    c = span.get('content', '')
-    if len(c) == 0:
-        c = span.get('image_path', '')
-
-    return c
-
-
-def __detect_list_lines(lines, new_layout_bboxes, lang):
-    """探测是否包含了列表，并且把列表的行分开.
-
-    这样的段落特点是，顶格字母大写/数字，紧跟着几行缩进的。缩进的行首字母含小写的。
-    """
-
-    def find_repeating_patterns(lst):
-        indices = []
-        ones_indices = []
-        i = 0
-        while i < len(lst) - 1:  # 确保余下元素至少有2个
-            if lst[i] == 1 and lst[i + 1] in [2, 3]:  # 额外检查以防止连续出现的1
-                start = i
-                ones_in_this_interval = [i]
-                i += 1
-                while i < len(lst) and lst[i] in [2, 3]:
-                    i += 1
-                # 验证下一个序列是否符合条件
-                if (
-                    i < len(lst) - 1
-                    and lst[i] == 1
-                    and lst[i + 1] in [2, 3]
-                    and lst[i - 1] in [2, 3]
-                ):
-                    while i < len(lst) and lst[i] in [1, 2, 3]:
-                        if lst[i] == 1:
-                            ones_in_this_interval.append(i)
-                        i += 1
-                    indices.append((start, i - 1))
-                    ones_indices.append(ones_in_this_interval)
-                else:
-                    i += 1
-            else:
-                i += 1
-        return indices, ones_indices
-
-    """===================="""
-
-    def split_indices(slen, index_array):
-        result = []
-        last_end = 0
-
-        for start, end in sorted(index_array):
-            if start > last_end:
-                # 前一个区间结束到下一个区间开始之间的部分标记为"text"
-                result.append(('text', last_end, start - 1))
-            # 区间内标记为"list"
-            result.append(('list', start, end))
-            last_end = end + 1
-
-        if last_end < slen:
-            # 如果最后一个区间结束后还有剩余的字符串，将其标记为"text"
-            result.append(('text', last_end, slen - 1))
-
-        return result
-
-    """===================="""
-
-    if lang != 'en':
-        return lines, None
-    else:
-        total_lines = len(lines)
-        line_fea_encode = []
-        """
-        对每一行进行特征编码，编码规则如下：
-        1. 如果行顶格，且大写字母开头或者数字开头，编码为1
-        2. 如果顶格，其他非大写开头编码为4
-        3. 如果非顶格，首字符大写，编码为2
-        4. 如果非顶格，首字符非大写编码为3
-        """
-        for l in lines:  # noqa: E741
-            first_char = __get_span_text(l['spans'][0])[0]
-            layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0]
-            if l['bbox'][0] == layout_left:
-                if first_char.isupper() or first_char.isdigit():
-                    line_fea_encode.append(1)
-                else:
-                    line_fea_encode.append(4)
-            else:
-                if first_char.isupper():
-                    line_fea_encode.append(2)
-                else:
-                    line_fea_encode.append(3)
-
-        # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
-
-        list_indice, list_start_idx = find_repeating_patterns(line_fea_encode)
-        if len(list_indice) > 0:
-            logger.info(f'发现了列表，列表行数：{list_indice}， {list_start_idx}')
-
-        # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
-
-        for start, end in list_indice:
-            for i in range(start, end + 1):
-                if i > 0:
-                    if line_fea_encode[i] == 4:
-                        logger.info(f'列表行的第{i}行不是顶格的')
-                        break
-            else:
-                logger.info(f'列表行的第{start}到第{end}行是列表')
-
-        return split_indices(total_lines, list_indice), list_start_idx
-
-
-def __valign_lines(blocks, layout_bboxes):
-    """在一个layoutbox内对齐行的左侧和右侧。 扫描行的左侧和右侧，如果x0,
-    x1差距不超过一个阈值，就强行对齐到所处layout的左右两侧（和layout有一段距离）。
-    3是个经验值，TODO，计算得来，可以设置为1.5个正文字符。"""
-
-    min_distance = 3
-    min_sample = 2
-    new_layout_bboxes = []
-
-    for layout_box in layout_bboxes:
-        blocks_in_layoutbox = [
-            b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])
-        ]
-        if len(blocks_in_layoutbox) == 0:
-            continue
-
-        x0_lst = np.array(
-            [
-                [line['bbox'][0], 0]
-                for block in blocks_in_layoutbox
-                for line in block['lines']
-            ]
-        )
-        x1_lst = np.array(
-            [
-                [line['bbox'][2], 0]
-                for block in blocks_in_layoutbox
-                for line in block['lines']
-            ]
-        )
-        x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
-        x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
-        x0_uniq_label = np.unique(x0_clusters.labels_)
-        x1_uniq_label = np.unique(x1_clusters.labels_)
-
-        x0_2_new_val = {}  # 存储旧值对应的新值映射
-        x1_2_new_val = {}
-        for label in x0_uniq_label:
-            if label == -1:
-                continue
-            x0_index_of_label = np.where(x0_clusters.labels_ == label)
-            x0_raw_val = x0_lst[x0_index_of_label][:, 0]
-            x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
-            x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val})
-        for label in x1_uniq_label:
-            if label == -1:
-                continue
-            x1_index_of_label = np.where(x1_clusters.labels_ == label)
-            x1_raw_val = x1_lst[x1_index_of_label][:, 0]
-            x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0])
-            x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val})
-
-        for block in blocks_in_layoutbox:
-            for line in block['lines']:
-                x0, x1 = line['bbox'][0], line['bbox'][2]
-                if x0 in x0_2_new_val:
-                    line['bbox'][0] = int(x0_2_new_val[x0])
-
-                if x1 in x1_2_new_val:
-                    line['bbox'][2] = int(x1_2_new_val[x1])
-            # 其余对不齐的保持不动
-
-        # 由于修改了block里的line长度，现在需要重新计算block的bbox
-        for block in blocks_in_layoutbox:
-            block['bbox'] = [
-                min([line['bbox'][0] for line in block['lines']]),
-                min([line['bbox'][1] for line in block['lines']]),
-                max([line['bbox'][2] for line in block['lines']]),
-                max([line['bbox'][3] for line in block['lines']]),
-            ]
-
-        """新计算layout的bbox，因为block的bbox变了。"""
-        layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox])
-        layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox])
-        layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox])
-        layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox])
-        new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
-
-    return new_layout_bboxes
-
-
-def __align_text_in_layout(blocks, layout_bboxes):
-    """由于ocr出来的line，有时候会在前后有一段空白，这个时候需要对文本进行对齐，超出的部分被layout左右侧截断。"""
-    for layout in layout_bboxes:
-        lb = layout['layout_bbox']
-        blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], lb)]
-        if len(blocks_in_layoutbox) == 0:
-            continue
-
-        for block in blocks_in_layoutbox:
-            for line in block['lines']:
-                x0, x1 = line['bbox'][0], line['bbox'][2]
-                if x0 < lb[0]:
-                    line['bbox'][0] = lb[0]
-                if x1 > lb[2]:
-                    line['bbox'][2] = lb[2]
-
-
-def __common_pre_proc(blocks, layout_bboxes):
-    """不分语言的，对文本进行预处理."""
-    # __add_line_period(blocks, layout_bboxes)
-    __align_text_in_layout(blocks, layout_bboxes)
-    aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
-
-    return aligned_layout_bboxes
-
-
-def __pre_proc_zh_blocks(blocks, layout_bboxes):
-    """对中文文本进行分段预处理."""
-    pass
-
-
-def __pre_proc_en_blocks(blocks, layout_bboxes):
-    """对英文文本进行分段预处理."""
-    pass
-
-
-def __group_line_by_layout(blocks, layout_bboxes, lang='en'):
-    """每个layout内的行进行聚合."""
-    # 因为只是一个block一行目前, 一个block就是一个段落
-    lines_group = []
-
-    for lyout in layout_bboxes:
-        lines = [
-            line
-            for block in blocks
-            if is_in_layout(block['bbox'], lyout['layout_bbox'])
-            for line in block['lines']
-        ]
-        lines_group.append(lines)
-
-    return lines_group
-
-
-def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang='en', char_avg_len=10):
-    """
-    lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
-    1. 先计算每个group的左右边界。
-    2. 然后根据行末尾特征进行分段。
-        末尾特征：以句号等结束符结尾。并且距离右侧边界有一定距离。
-        且下一行开头不留空白。
-
-    """
-    list_info = []  # 这个layout最后是不是列表,记录每一个layout里是不是列表开头，列表结尾
-    layout_paras = []
-    right_tail_distance = 1.5 * char_avg_len
-
-    for lines in lines_group:
-        paras = []
-        total_lines = len(lines)
-        if total_lines == 0:
-            continue  # 0行无需处理
-        if total_lines == 1:  # 1行无法分段。
-            layout_paras.append([lines])
-            list_info.append([False, False])
-            continue
-
-        """在进入到真正的分段之前，要对文字块从统计维度进行对齐方式的探测，
-            对齐方式分为以下：
-            1. 左对齐的文本块(特点是左侧顶格，或者左侧不顶格但是右侧顶格的行数大于非顶格的行数，顶格的首字母有大写也有小写)
-                1) 右侧对齐的行，单独成一段
-                2) 中间对齐的行，按照字体/行高聚合成一段
-            2. 左对齐的列表块（其特点是左侧顶格的行数小于等于非顶格的行数，非定格首字母会有小写，顶格90%是大写。并且左侧顶格行数大于1，大于1是为了这种模式连续出现才能称之为列表）
-                这样的文本块，顶格的为一个段落开头，紧随其后非顶格的行属于这个段落。
-        """
-
-        text_segments, list_start_line = __detect_list_lines(
-            lines, new_layout_bbox, lang
-        )
-        """根据list_range，把lines分成几个部分
-
-        """
-
-        layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
-        layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
-        para = []  # 元素是line
-        layout_list_info = [
-            False,
-            False,
-        ]  # 这个layout最后是不是列表,记录每一个layout里是不是列表开头，列表结尾
-        for content_type, start, end in text_segments:
-            if content_type == 'list':
-                for i, line in enumerate(lines[start : end + 1]):
-                    line_x0 = line['bbox'][0]
-                    if line_x0 == layout_left:  # 列表开头
-                        if len(para) > 0:
-                            paras.append(para)
-                            para = []
-                        para.append(line)
-                    else:
-                        para.append(line)
-                if len(para) > 0:
-                    paras.append(para)
-                    para = []
-                if start == 0:
-                    layout_list_info[0] = True
-                if end == total_lines - 1:
-                    layout_list_info[1] = True
-            else:  # 是普通文本
-                for i, line in enumerate(lines[start : end + 1]):
-                    # 如果i有下一行，那么就要根据下一行位置综合判断是否要分段。如果i之后没有行，那么只需要判断i行自己的结尾特征。
-                    cur_line_type = line['spans'][-1]['type']
-                    next_line = lines[i + 1] if i < total_lines - 1 else None
-
-                    if cur_line_type in [TEXT, INLINE_EQUATION]:
-                        if line['bbox'][2] < layout_right - right_tail_distance:
-                            para.append(line)
-                            paras.append(para)
-                            para = []
-                        elif (
-                            line['bbox'][2] >= layout_right - right_tail_distance
-                            and next_line
-                            and next_line['bbox'][0] == layout_left
-                        ):  # 现在这行到了行尾沾满，下一行存在且顶格。
-                            para.append(line)
-                        else:
-                            para.append(line)
-                            paras.append(para)
-                            para = []
-                    else:  # 其他，图片、表格、行间公式，各自占一段
-                        if len(para) > 0:  # 先把之前的段落加入到结果中
-                            paras.append(para)
-                            para = []
-                        paras.append(
-                            [line]
-                        )  # 再把当前行加入到结果中。当前行为行间公式、图、表等。
-                        para = []
-
-                if len(para) > 0:
-                    paras.append(para)
-                    para = []
-
-        list_info.append(layout_list_info)
-        layout_paras.append(paras)
-        paras = []
-
-    return layout_paras, list_info
-
-
-def __connect_list_inter_layout(
-    layout_paras, new_layout_bbox, layout_list_info, page_num, lang
-):
-    """如果上个layout的最后一个段落是列表，下一个layout的第一个段落也是列表，那么将他们连接起来。 TODO
-    因为没有区分列表和段落，所以这个方法暂时不实现。
-    根据layout_list_info判断是不是列表。，下个layout的第一个段如果不是列表，那么看他们是否有几行都有相同的缩进。"""
-    if (
-        len(layout_paras) == 0 or len(layout_list_info) == 0
-    ):  # 0的时候最后的return 会出错
-        return layout_paras, [False, False]
-
-    for i in range(1, len(layout_paras)):
-        pre_layout_list_info = layout_list_info[i - 1]
-        next_layout_list_info = layout_list_info[i]
-        pre_last_para = layout_paras[i - 1][-1]
-        next_paras = layout_paras[i]
-
-        if (
-            pre_layout_list_info[1] and not next_layout_list_info[0]
-        ):  # 前一个是列表结尾，后一个是非列表开头，此时检测是否有相同的缩进
-            logger.info(f'连接page {page_num} 内的list')
-            # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
-            may_list_lines = []
-            for j in range(len(next_paras)):
-                line = next_paras[j]
-                if len(line) == 1:  # 只可能是一行，多行情况再需要分析了
-                    if (
-                        line[0]['bbox'][0]
-                        > __find_layout_bbox_by_line(line[0]['bbox'], new_layout_bbox)[
-                            0
-                        ]
-                    ):
-                        may_list_lines.append(line[0])
-                    else:
-                        break
-                else:
-                    break
-            # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
-            if (
-                len(may_list_lines) > 0
-                and len(set([x['bbox'][0] for x in may_list_lines])) == 1
-            ):
-                pre_last_para.extend(may_list_lines)
-                layout_paras[i] = layout_paras[i][len(may_list_lines) :]
-
-    return layout_paras, [
-        layout_list_info[0][0],
-        layout_list_info[-1][1],
-    ]  # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
-
-
-def __connect_list_inter_page(
-    pre_page_paras,
-    next_page_paras,
-    pre_page_layout_bbox,
-    next_page_layout_bbox,
-    pre_page_list_info,
-    next_page_list_info,
-    page_num,
-    lang,
-):
-    """如果上个layout的最后一个段落是列表，下一个layout的第一个段落也是列表，那么将他们连接起来。 TODO
-    因为没有区分列表和段落，所以这个方法暂时不实现。
-    根据layout_list_info判断是不是列表。，下个layout的第一个段如果不是列表，那么看他们是否有几行都有相同的缩进。"""
-    if (
-        len(pre_page_paras) == 0 or len(next_page_paras) == 0
-    ):  # 0的时候最后的return 会出错
-        return False
-
-    if (
-        pre_page_list_info[1] and not next_page_list_info[0]
-    ):  # 前一个是列表结尾，后一个是非列表开头，此时检测是否有相同的缩进
-        logger.info(f'连接page {page_num} 内的list')
-        # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
-        may_list_lines = []
-        for j in range(len(next_page_paras[0])):
-            line = next_page_paras[0][j]
-            if len(line) == 1:  # 只可能是一行，多行情况再需要分析了
-                if (
-                    line[0]['bbox'][0]
-                    > __find_layout_bbox_by_line(
-                        line[0]['bbox'], next_page_layout_bbox
-                    )[0]
-                ):
-                    may_list_lines.append(line[0])
-                else:
-                    break
-            else:
-                break
-        # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
-        if (
-            len(may_list_lines) > 0
-            and len(set([x['bbox'][0] for x in may_list_lines])) == 1
-        ):
-            pre_page_paras[-1].append(may_list_lines)
-            next_page_paras[0] = next_page_paras[0][len(may_list_lines) :]
-            return True
-
-    return False
-
-
-def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
-    """根据line找到所在的layout."""
-    for layout in layout_bboxes:
-        if is_in_layout(line_bbox, layout):
-            return layout
-    return None
-
-
-def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
-    """
-    layout之间进行分段。
-    主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
-    连接的条件需要同时满足：
-    1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。
-    2. 下一行开头不留空白。
-
-    """
-    connected_layout_paras = []
-    if len(layout_paras) == 0:
-        return connected_layout_paras
-
-    connected_layout_paras.append(layout_paras[0])
-    for i in range(1, len(layout_paras)):
-        try:
-            if (
-                len(layout_paras[i]) == 0 or len(layout_paras[i - 1]) == 0
-            ):  # TODO 考虑连接问题，
-                continue
-            pre_last_line = layout_paras[i - 1][-1][-1]
-            next_first_line = layout_paras[i][0][0]
-        except Exception:
-            logger.error(f'page layout {i} has no line')
-            continue
-        pre_last_line_text = ''.join(
-            [__get_span_text(span) for span in pre_last_line['spans']]
-        )
-        pre_last_line_type = pre_last_line['spans'][-1]['type']
-        next_first_line_text = ''.join(
-            [__get_span_text(span) for span in next_first_line['spans']]
-        )
-        next_first_line_type = next_first_line['spans'][0]['type']
-        if pre_last_line_type not in [
-            TEXT,
-            INLINE_EQUATION,
-        ] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
-            connected_layout_paras.append(layout_paras[i])
-            continue
-
-        pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[
-            2
-        ]
-        next_x0_min = __find_layout_bbox_by_line(
-            next_first_line['bbox'], new_layout_bbox
-        )[0]
-
-        pre_last_line_text = pre_last_line_text.strip()
-        next_first_line_text = next_first_line_text.strip()
-        if (
-            pre_last_line['bbox'][2] == pre_x2_max
-            and pre_last_line_text[-1] not in LINE_STOP_FLAG
-            and next_first_line['bbox'][0] == next_x0_min
-        ):  # 前面一行沾满了整个行，并且没有结尾符号.下一行没有空白开头。
-            """连接段落条件成立，将前一个layout的段落和后一个layout的段落连接。"""
-            connected_layout_paras[-1][-1].extend(layout_paras[i][0])
-            layout_paras[i].pop(
-                0
-            )  # 删除后一个layout的第一个段落， 因为他已经被合并到前一个layout的最后一个段落了。
-            if len(layout_paras[i]) == 0:
-                layout_paras.pop(i)
-            else:
-                connected_layout_paras.append(layout_paras[i])
-        else:
-            """连接段落条件不成立，将前一个layout的段落加入到结果中。"""
-            connected_layout_paras.append(layout_paras[i])
-
-    return connected_layout_paras
-
-
-def __connect_para_inter_page(
-    pre_page_paras,
-    next_page_paras,
-    pre_page_layout_bbox,
-    next_page_layout_bbox,
-    page_num,
-    lang,
-):
-    """
-    连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。
-    是否可以连接的条件：
-    1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。
-    2. 后一个页面的第一个段落第一行没有空白开头。
-    """
-    # 有的页面可能压根没有文字
-    if (
-        len(pre_page_paras) == 0
-        or len(next_page_paras) == 0
-        or len(pre_page_paras[0]) == 0
-        or len(next_page_paras[0]) == 0
-    ):  # TODO [[]]为什么出现在pre_page_paras里？
-        return False
-    pre_last_para = pre_page_paras[-1][-1]
-    next_first_para = next_page_paras[0][0]
-    pre_last_line = pre_last_para[-1]
-    next_first_line = next_first_para[0]
-    pre_last_line_text = ''.join(
-        [__get_span_text(span) for span in pre_last_line['spans']]
-    )
-    pre_last_line_type = pre_last_line['spans'][-1]['type']
-    next_first_line_text = ''.join(
-        [__get_span_text(span) for span in next_first_line['spans']]
-    )
-    next_first_line_type = next_first_line['spans'][0]['type']
-
-    if pre_last_line_type not in [
-        TEXT,
-        INLINE_EQUATION,
-    ] or next_first_line_type not in [
-        TEXT,
-        INLINE_EQUATION,
-    ]:  # TODO，真的要做好，要考虑跨table, image, 行间的情况
-        # 不是文本，不连接
-        return False
-
-    pre_x2_max = __find_layout_bbox_by_line(
-        pre_last_line['bbox'], pre_page_layout_bbox
-    )[2]
-    next_x0_min = __find_layout_bbox_by_line(
-        next_first_line['bbox'], next_page_layout_bbox
-    )[0]
-
-    pre_last_line_text = pre_last_line_text.strip()
-    next_first_line_text = next_first_line_text.strip()
-    if (
-        pre_last_line['bbox'][2] == pre_x2_max
-        and pre_last_line_text[-1] not in LINE_STOP_FLAG
-        and next_first_line['bbox'][0] == next_x0_min
-    ):  # 前面一行沾满了整个行，并且没有结尾符号.下一行没有空白开头。
-        """连接段落条件成立，将前一个layout的段落和后一个layout的段落连接。"""
-        pre_last_para.extend(next_first_para)
-        next_page_paras[0].pop(
-            0
-        )  # 删除后一个页面的第一个段落， 因为他已经被合并到前一个页面的最后一个段落了。
-        return True
-    else:
-        return False
-
-
-def find_consecutive_true_regions(input_array):
-    start_index = None  # 连续True区域的起始索引
-    regions = []  # 用于保存所有连续True区域的起始和结束索引
-
-    for i in range(len(input_array)):
-        # 如果我们找到了一个True值，并且当前并没有在连续True区域中
-        if input_array[i] and start_index is None:
-            start_index = i  # 记录连续True区域的起始索引
-
-        # 如果我们找到了一个False值，并且当前在连续True区域中
-        elif not input_array[i] and start_index is not None:
-            # 如果连续True区域长度大于1，那么将其添加到结果列表中
-            if i - start_index > 1:
-                regions.append((start_index, i - 1))
-            start_index = None  # 重置起始索引
-
-    # 如果最后一个元素是True，那么需要将最后一个连续True区域加入到结果列表中
-    if start_index is not None and len(input_array) - start_index > 1:
-        regions.append((start_index, len(input_array) - 1))
-
-    return regions
-
-
-def __connect_middle_align_text(
-    page_paras, new_layout_bbox, page_num, lang, debug_mode
-):
-    """
-    找出来中间对齐的连续单行文本，如果连续行高度相同，那么合并为一个段落。
-    一个line居中的条件是：
-    1. 水平中心点跨越layout的中心点。
-    2. 左右两侧都有空白
-    """
-
-    for layout_i, layout_para in enumerate(page_paras):
-        layout_box = new_layout_bbox[layout_i]
-        single_line_paras_tag = []
-        for i in range(len(layout_para)):
-            single_line_paras_tag.append(
-                len(layout_para[i]) == 1
-                and layout_para[i][0]['spans'][0]['type'] == TEXT
-            )
-
-        """找出来连续的单行文本，如果连续行高度相同，那么合并为一个段落。"""
-        consecutive_single_line_indices = find_consecutive_true_regions(
-            single_line_paras_tag
-        )
-        if len(consecutive_single_line_indices) > 0:
-            index_offset = 0
-            """检查这些行是否是高度相同的，居中的"""
-            for start, end in consecutive_single_line_indices:
-                start += index_offset
-                end += index_offset
-                line_hi = np.array(
-                    [
-                        line[0]['bbox'][3] - line[0]['bbox'][1]
-                        for line in layout_para[start : end + 1]
-                    ]
-                )
-                first_line_text = ''.join(
-                    [__get_span_text(span) for span in layout_para[start][0]['spans']]
-                )
-                if 'Table' in first_line_text or 'Figure' in first_line_text:
-                    pass
-                if debug_mode:
-                    logger.debug(line_hi.std())
-
-                if line_hi.std() < 2:
-                    """行高度相同，那么判断是否居中."""
-                    all_left_x0 = [
-                        line[0]['bbox'][0] for line in layout_para[start : end + 1]
-                    ]
-                    all_right_x1 = [
-                        line[0]['bbox'][2] for line in layout_para[start : end + 1]
-                    ]
-                    layout_center = (layout_box[0] + layout_box[2]) / 2
-                    if (
-                        all(
-                            [
-                                x0 < layout_center < x1
-                                for x0, x1 in zip(all_left_x0, all_right_x1)
-                            ]
-                        )
-                        and not all([x0 == layout_box[0] for x0 in all_left_x0])
-                        and not all([x1 == layout_box[2] for x1 in all_right_x1])
-                    ):
-                        merge_para = [l[0] for l in layout_para[start : end + 1]]  # noqa: E741
-                        para_text = ''.join(
-                            [
-                                __get_span_text(span)
-                                for line in merge_para
-                                for span in line['spans']
-                            ]
-                        )
-                        if debug_mode:
-                            logger.debug(para_text)
-                        layout_para[start : end + 1] = [merge_para]
-                        index_offset -= end - start
-
-    return
-
-
-def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
-    """找出来连续的单行文本，如果首行顶格，接下来的几个单行段落缩进对齐，那么合并为一个段落。"""
-
-    pass
-
-
-def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
-    """根据line和layout情况进行分段 先实现一个根据行末尾特征分段的简单方法。"""
-    """
-    算法思路：
-    1. 扫描layout里每一行，找出来行尾距离layout有边界有一定距离的行。
-    2. 从上述行中找到末尾是句号等可作为断行标志的行。
-    3. 参照上述行尾特征进行分段。
-    4. 图、表，目前独占一行，不考虑分段。
-    """
-    if page_num == 343:
-        pass
-    lines_group = __group_line_by_layout(blocks, layout_bboxes, lang)  # block内分段
-    layout_paras, layout_list_info = __split_para_in_layoutbox(
-        lines_group, new_layout_bbox, lang
-    )  # layout内分段
-    layout_paras2, page_list_info = __connect_list_inter_layout(
-        layout_paras, new_layout_bbox, layout_list_info, page_num, lang
-    )  # layout之间连接列表段落
-    connected_layout_paras = __connect_para_inter_layoutbox(
-        layout_paras2, new_layout_bbox, lang
-    )  # layout间链接段落
-
-    return connected_layout_paras, page_list_info
-
-
-def para_split(pdf_info_dict, debug_mode, lang='en'):
-    """根据line和layout情况进行分段."""
-    new_layout_of_pages = []  # 数组的数组，每个元素是一个页面的layoutS
-    all_page_list_info = []  # 保存每个页面开头和结尾是否是列表
-    for page_num, page in pdf_info_dict.items():
-        blocks = page['preproc_blocks']
-        layout_bboxes = page['layout_bboxes']
-        new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
-        new_layout_of_pages.append(new_layout_bbox)
-        splited_blocks, page_list_info = __do_split_page(
-            blocks, layout_bboxes, new_layout_bbox, page_num, lang
-        )
-        all_page_list_info.append(page_list_info)
-        page['para_blocks'] = splited_blocks
-
-    """连接页面与页面之间的可能合并的段落"""
-    pdf_infos = list(pdf_info_dict.values())
-    for page_num, page in enumerate(pdf_info_dict.values()):
-        if page_num == 0:
-            continue
-        pre_page_paras = pdf_infos[page_num - 1]['para_blocks']
-        next_page_paras = pdf_infos[page_num]['para_blocks']
-        pre_page_layout_bbox = new_layout_of_pages[page_num - 1]
-        next_page_layout_bbox = new_layout_of_pages[page_num]
-
-        is_conn = __connect_para_inter_page(
-            pre_page_paras,
-            next_page_paras,
-            pre_page_layout_bbox,
-            next_page_layout_bbox,
-            page_num,
-            lang,
-        )
-        if debug_mode:
-            if is_conn:
-                logger.info(f'连接了第{page_num-1}页和第{page_num}页的段落')
-
-        is_list_conn = __connect_list_inter_page(
-            pre_page_paras,
-            next_page_paras,
-            pre_page_layout_bbox,
-            next_page_layout_bbox,
-            all_page_list_info[page_num - 1],
-            all_page_list_info[page_num],
-            page_num,
-            lang,
-        )
-        if debug_mode:
-            if is_list_conn:
-                logger.info(f'连接了第{page_num-1}页和第{page_num}页的列表段落')
-
-    """接下来可能会漏掉一些特别的一些可以合并的内容，对他们进行段落连接
-    1. 正文中有时出现一个行顶格，接下来几行缩进的情况。
-    2. 居中的一些连续单行，如果高度相同，那么可能是一个段落。
-    """
-    for page_num, page in enumerate(pdf_info_dict.values()):
-        page_paras = page['para_blocks']
-        new_layout_bbox = new_layout_of_pages[page_num]
-        __connect_middle_align_text(
-            page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode
-        )
-        __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
--- a/magic_pdf/para/para_split_v2.py
+++ b/magic_pdf/para/para_split_v2.py
-import copy
-import re
-
-import numpy as np
-from loguru import logger
-from sklearn.cluster import DBSCAN
-
-from magic_pdf.config.constants import *  # noqa: F403
-from magic_pdf.config.ocr_content_type import BlockType, ContentType
-from magic_pdf.libs.boxbase import \
-    _is_in_or_part_overlap_with_area_ratio as is_in_layout
-
-LINE_STOP_FLAG = ['.', '!', '?', '。', '！', '？', '：', ':', ')', '）', ';']
-INLINE_EQUATION = ContentType.InlineEquation
-INTERLINE_EQUATION = ContentType.InterlineEquation
-TEXT = ContentType.Text
-debug_able = False
-
-
-def __get_span_text(span):
-    c = span.get('content', '')
-    if len(c) == 0:
-        c = span.get('image_path', '')
-
-    return c
-
-
-def __detect_list_lines(lines, new_layout_bboxes, lang):
-    global debug_able
-    """
-    探测是否包含了列表，并且把列表的行分开.
-    这样的段落特点是，顶格字母大写/数字，紧跟着几行缩进的。缩进的行首字母含小写的。
-    """
-
-    def find_repeating_patterns2(lst):
-        indices = []
-        ones_indices = []
-        i = 0
-        while i < len(lst):  # Loop through the entire list
-            if (
-                lst[i] == 1
-            ):  # If we encounter a '1', we might be at the start of a pattern
-                start = i
-                ones_in_this_interval = [i]
-                i += 1
-                # Traverse elements that are 1, 2 or 3, until we encounter something else
-                while i < len(lst) and lst[i] in [1, 2, 3]:
-                    if lst[i] == 1:
-                        ones_in_this_interval.append(i)
-                    i += 1
-                if len(ones_in_this_interval) > 1 or (
-                    start < len(lst) - 1
-                    and ones_in_this_interval
-                    and lst[start + 1] in [2, 3]
-                ):
-                    indices.append((start, i - 1))
-                    ones_indices.append(ones_in_this_interval)
-            else:
-                i += 1
-        return indices, ones_indices
-
-    def find_repeating_patterns(lst):
-        indices = []
-        ones_indices = []
-        i = 0
-        while i < len(lst) - 1:  # 确保余下元素至少有2个
-            if lst[i] == 1 and lst[i + 1] in [2, 3]:  # 额外检查以防止连续出现的1
-                start = i
-                ones_in_this_interval = [i]
-                i += 1
-                while i < len(lst) and lst[i] in [2, 3]:
-                    i += 1
-                # 验证下一个序列是否符合条件
-                if (
-                    i < len(lst) - 1
-                    and lst[i] == 1
-                    and lst[i + 1] in [2, 3]
-                    and lst[i - 1] in [2, 3]
-                ):
-                    while i < len(lst) and lst[i] in [1, 2, 3]:
-                        if lst[i] == 1:
-                            ones_in_this_interval.append(i)
-                        i += 1
-                    indices.append((start, i - 1))
-                    ones_indices.append(ones_in_this_interval)
-                else:
-                    i += 1
-            else:
-                i += 1
-        return indices, ones_indices
-
-    """===================="""
-
-    def split_indices(slen, index_array):
-        result = []
-        last_end = 0
-
-        for start, end in sorted(index_array):
-            if start > last_end:
-                # 前一个区间结束到下一个区间开始之间的部分标记为"text"
-                result.append(('text', last_end, start - 1))
-            # 区间内标记为"list"
-            result.append(('list', start, end))
-            last_end = end + 1
-
-        if last_end < slen:
-            # 如果最后一个区间结束后还有剩余的字符串，将其标记为"text"
-            result.append(('text', last_end, slen - 1))
-
-        return result
-
-    """===================="""
-
-    if lang != 'en':
-        return lines, None
-
-    total_lines = len(lines)
-    line_fea_encode = []
-    """
-    对每一行进行特征编码，编码规则如下：
-    1. 如果行顶格，且大写字母开头或者数字开头，编码为1
-    2. 如果顶格，其他非大写开头编码为4
-    3. 如果非顶格，首字符大写，编码为2
-    4. 如果非顶格，首字符非大写编码为3
-    """
-    if len(lines) > 0:
-        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
-    for l in lines:  # noqa: E741
-        span_text = __get_span_text(l['spans'][0])
-        if not span_text:
-            line_fea_encode.append(0)
-            continue
-        first_char = span_text[0]
-        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
-        if not layout:
-            line_fea_encode.append(0)
-        else:
-            #
-            if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
-                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                if not first_char.isalnum() or if_match_reference_list(span_text):
-                    line_fea_encode.append(1)
-                else:
-                    line_fea_encode.append(4)
-            else:
-                if first_char.isupper():
-                    line_fea_encode.append(2)
-                else:
-                    line_fea_encode.append(3)
-
-    # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
-
-    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
-    if len(list_indice) > 0:
-        if debug_able:
-            logger.info(f'发现了列表，列表行数：{list_indice}， {list_start_idx}')
-
-    # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
-
-    for start, end in list_indice:
-        for i in range(start, end + 1):
-            if i > 0:
-                if line_fea_encode[i] == 4:
-                    if debug_able:
-                        logger.info(f'列表行的第{i}行不是顶格的')
-                    break
-        else:
-            if debug_able:
-                logger.info(f'列表行的第{start}到第{end}行是列表')
-
-    return split_indices(total_lines, list_indice), list_start_idx
-
-
-def cluster_line_x(lines: list) -> dict:
-    """对一个block内所有lines的bbox的x0聚类."""
-    min_distance = 5
-    min_sample = 1
-    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
-    x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
-    x0_uniq_label = np.unique(x0_clusters.labels_)
-    # x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
-    x0_2_new_val = {}  # 存储旧值对应的新值映射
-    min_x0 = round(lines[0]['bbox'][0])
-    for label in x0_uniq_label:
-        if label == -1:
-            continue
-        x0_index_of_label = np.where(x0_clusters.labels_ == label)
-        x0_raw_val = x0_lst[x0_index_of_label][:, 0]
-        x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
-        x0_2_new_val.update(
-            {round(raw_val): round(x0_new_val) for raw_val in x0_raw_val}
-        )
-        if x0_new_val < min_x0:
-            min_x0 = x0_new_val
-    return x0_2_new_val, min_x0
-
-
-def if_match_reference_list(text: str) -> bool:
-    pattern = re.compile(r'^\d+\..*')
-    if pattern.match(text):
-        return True
-    else:
-        return False
-
-
-def __valign_lines(blocks, layout_bboxes):
-    """在一个layoutbox内对齐行的左侧和右侧。 扫描行的左侧和右侧，如果x0,
-    x1差距不超过一个阈值，就强行对齐到所处layout的左右两侧（和layout有一段距离）。
-    3是个经验值，TODO，计算得来，可以设置为1.5个正文字符。"""
-
-    min_distance = 3
-    min_sample = 2
-    new_layout_bboxes = []
-    # add bbox_fs for para split calculation
-    for block in blocks:
-        block['bbox_fs'] = copy.deepcopy(block['bbox'])
-    for layout_box in layout_bboxes:
-        blocks_in_layoutbox = [
-            b
-            for b in blocks
-            if b['type'] == BlockType.Text
-            and is_in_layout(b['bbox'], layout_box['layout_bbox'])
-        ]
-        if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]['lines']) == 0:
-            new_layout_bboxes.append(layout_box['layout_bbox'])
-            continue
-
-        x0_lst = np.array(
-            [
-                [line['bbox'][0], 0]
-                for block in blocks_in_layoutbox
-                for line in block['lines']
-            ]
-        )
-        x1_lst = np.array(
-            [
-                [line['bbox'][2], 0]
-                for block in blocks_in_layoutbox
-                for line in block['lines']
-            ]
-        )
-        x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
-        x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
-        x0_uniq_label = np.unique(x0_clusters.labels_)
-        x1_uniq_label = np.unique(x1_clusters.labels_)
-
-        x0_2_new_val = {}  # 存储旧值对应的新值映射
-        x1_2_new_val = {}
-        for label in x0_uniq_label:
-            if label == -1:
-                continue
-            x0_index_of_label = np.where(x0_clusters.labels_ == label)
-            x0_raw_val = x0_lst[x0_index_of_label][:, 0]
-            x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
-            x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val})
-        for label in x1_uniq_label:
-            if label == -1:
-                continue
-            x1_index_of_label = np.where(x1_clusters.labels_ == label)
-            x1_raw_val = x1_lst[x1_index_of_label][:, 0]
-            x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0])
-            x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val})
-
-        for block in blocks_in_layoutbox:
-            for line in block['lines']:
-                x0, x1 = line['bbox'][0], line['bbox'][2]
-                if x0 in x0_2_new_val:
-                    line['bbox'][0] = int(x0_2_new_val[x0])
-
-                if x1 in x1_2_new_val:
-                    line['bbox'][2] = int(x1_2_new_val[x1])
-            # 其余对不齐的保持不动
-
-        # 由于修改了block里的line长度，现在需要重新计算block的bbox
-        for block in blocks_in_layoutbox:
-            if len(block['lines']) > 0:
-                block['bbox_fs'] = [
-                    min([line['bbox'][0] for line in block['lines']]),
-                    min([line['bbox'][1] for line in block['lines']]),
-                    max([line['bbox'][2] for line in block['lines']]),
-                    max([line['bbox'][3] for line in block['lines']]),
-                ]
-        """新计算layout的bbox，因为block的bbox变了。"""
-        layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
-        layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
-        layout_x1 = max([block['bbox_fs'][2] for block in blocks_in_layoutbox])
-        layout_y1 = max([block['bbox_fs'][3] for block in blocks_in_layoutbox])
-        new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
-
-    return new_layout_bboxes
-
-
-def __align_text_in_layout(blocks, layout_bboxes):
-    """由于ocr出来的line，有时候会在前后有一段空白，这个时候需要对文本进行对齐，超出的部分被layout左右侧截断。"""
-    for layout in layout_bboxes:
-        lb = layout['layout_bbox']
-        blocks_in_layoutbox = [
-            block
-            for block in blocks
-            if block['type'] == BlockType.Text and is_in_layout(block['bbox'], lb)
-        ]
-        if len(blocks_in_layoutbox) == 0:
-            continue
-
-        for block in blocks_in_layoutbox:
-            for line in block.get('lines', []):
-                x0, x1 = line['bbox'][0], line['bbox'][2]
-                if x0 < lb[0]:
-                    line['bbox'][0] = lb[0]
-                if x1 > lb[2]:
-                    line['bbox'][2] = lb[2]
-
-
-def __common_pre_proc(blocks, layout_bboxes):
-    """不分语言的，对文本进行预处理."""
-    # __add_line_period(blocks, layout_bboxes)
-    __align_text_in_layout(blocks, layout_bboxes)
-    aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
-
-    return aligned_layout_bboxes
-
-
-def __pre_proc_zh_blocks(blocks, layout_bboxes):
-    """对中文文本进行分段预处理."""
-    pass
-
-
-def __pre_proc_en_blocks(blocks, layout_bboxes):
-    """对英文文本进行分段预处理."""
-    pass
-
-
-def __group_line_by_layout(blocks, layout_bboxes):
-    """每个layout内的行进行聚合."""
-    # 因为只是一个block一行目前, 一个block就是一个段落
-    blocks_group = []
-    for lyout in layout_bboxes:
-        blocks_in_layout = [
-            block
-            for block in blocks
-            if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])
-        ]
-        blocks_group.append(blocks_in_layout)
-    return blocks_group
-
-
-def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang='en'):
-    """
-    lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
-    1. 先计算每个group的左右边界。
-    2. 然后根据行末尾特征进行分段。
-        末尾特征：以句号等结束符结尾。并且距离右侧边界有一定距离。
-        且下一行开头不留空白。
-
-    """
-    list_info = []  # 这个layout最后是不是列表,记录每一个layout里是不是列表开头，列表结尾
-    for blocks in blocks_group:
-        is_start_list = None
-        is_end_list = None
-        if len(blocks) == 0:
-            list_info.append([False, False])
-            continue
-        if blocks[0]['type'] != BlockType.Text and blocks[-1]['type'] != BlockType.Text:
-            list_info.append([False, False])
-            continue
-        if blocks[0]['type'] != BlockType.Text:
-            is_start_list = False
-        if blocks[-1]['type'] != BlockType.Text:
-            is_end_list = False
-
-        lines = [
-            line
-            for block in blocks
-            if block['type'] == BlockType.Text
-            for line in block['lines']
-        ]
-        total_lines = len(lines)
-        if total_lines == 1 or total_lines == 0:
-            list_info.append([False, False])
-            continue
-        """在进入到真正的分段之前，要对文字块从统计维度进行对齐方式的探测，
-                    对齐方式分为以下：
-                    1. 左对齐的文本块(特点是左侧顶格，或者左侧不顶格但是右侧顶格的行数大于非顶格的行数，顶格的首字母有大写也有小写)
-                        1) 右侧对齐的行，单独成一段
-                        2) 中间对齐的行，按照字体/行高聚合成一段
-                    2. 左对齐的列表块（其特点是左侧顶格的行数小于等于非顶格的行数，非定格首字母会有小写，顶格90%是大写。并且左侧顶格行数大于1，大于1是为了这种模式连续出现才能称之为列表）
-                        这样的文本块，顶格的为一个段落开头，紧随其后非顶格的行属于这个段落。
-        """
-        text_segments, list_start_line = __detect_list_lines(
-            lines, new_layout_bbox, lang
-        )
-        """根据list_range，把lines分成几个部分
-
-        """
-        for list_start in list_start_line:
-            if len(list_start) > 1:
-                for i in range(0, len(list_start)):
-                    index = list_start[i] - 1
-                    if index >= 0:
-                        if 'content' in lines[index]['spans'][-1] and lines[index][
-                            'spans'
-                        ][-1].get('type', '') not in [
-                            ContentType.InlineEquation,
-                            ContentType.InterlineEquation,
-                        ]:
-                            lines[index]['spans'][-1]['content'] += '\n\n'
-        layout_list_info = [
-            False,
-            False,
-        ]  # 这个layout最后是不是列表,记录每一个layout里是不是列表开头，列表结尾
-        for content_type, start, end in text_segments:
-            if content_type == 'list':
-                if start == 0 and is_start_list is None:
-                    layout_list_info[0] = True
-                if end == total_lines - 1 and is_end_list is None:
-                    layout_list_info[1] = True
-
-        list_info.append(layout_list_info)
-    return list_info
-
-
-def __split_para_lines(lines: list, text_blocks: list) -> list:
-    text_paras = []
-    other_paras = []
-    text_lines = []
-    for line in lines:
-        spans_types = [span['type'] for span in line]
-        if ContentType.Table in spans_types:
-            other_paras.append([line])
-            continue
-        if ContentType.Image in spans_types:
-            other_paras.append([line])
-            continue
-        if ContentType.InterlineEquation in spans_types:
-            other_paras.append([line])
-            continue
-        text_lines.append(line)
-
-    for block in text_blocks:
-        block_bbox = block['bbox']
-        para = []
-        for line in text_lines:
-            bbox = line['bbox']
-            if is_in_layout(bbox, block_bbox):
-                para.append(line)
-        if len(para) > 0:
-            text_paras.append(para)
-    paras = other_paras.extend(text_paras)
-    paras_sorted = sorted(paras, key=lambda x: x[0]['bbox'][1])
-    return paras_sorted
-
-
-def __connect_list_inter_layout(
-    blocks_group, new_layout_bbox, layout_list_info, page_num, lang
-):
-    global debug_able
-    """
-    如果上个layout的最后一个段落是列表，下一个layout的第一个段落也是列表，那么将他们连接起来。 TODO 因为没有区分列表和段落，所以这个方法暂时不实现。
-    根据layout_list_info判断是不是列表。，下个layout的第一个段如果不是列表，那么看他们是否有几行都有相同的缩进。
-    """
-    if len(blocks_group) == 0 or len(blocks_group) == 0:  # 0的时候最后的return 会出错
-        return blocks_group, [False, False]
-
-    for i in range(1, len(blocks_group)):
-        if len(blocks_group[i]) == 0 or len(blocks_group[i - 1]) == 0:
-            continue
-        pre_layout_list_info = layout_list_info[i - 1]
-        next_layout_list_info = layout_list_info[i]
-        pre_last_para = blocks_group[i - 1][-1].get('lines', [])
-        next_paras = blocks_group[i]
-        next_first_para = next_paras[0]
-
-        if (
-            pre_layout_list_info[1]
-            and not next_layout_list_info[0]
-            and next_first_para['type'] == BlockType.Text
-        ):  # 前一个是列表结尾，后一个是非列表开头，此时检测是否有相同的缩进
-            if debug_able:
-                logger.info(f'连接page {page_num} 内的list')
-            # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
-            may_list_lines = []
-            lines = next_first_para.get('lines', [])
-
-            for line in lines:
-                if (
-                    line['bbox'][0]
-                    > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]
-                ):
-                    may_list_lines.append(line)
-                else:
-                    break
-            # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
-            if (
-                len(may_list_lines) > 0
-                and len(set([x['bbox'][0] for x in may_list_lines])) == 1
-            ):
-                pre_last_para.extend(may_list_lines)
-                next_first_para['lines'] = next_first_para['lines'][
-                    len(may_list_lines) :
-                ]
-
-    return blocks_group, [
-        layout_list_info[0][0],
-        layout_list_info[-1][1],
-    ]  # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
-
-
-def __connect_list_inter_page(
-    pre_page_paras,
-    next_page_paras,
-    pre_page_layout_bbox,
-    next_page_layout_bbox,
-    pre_page_list_info,
-    next_page_list_info,
-    page_num,
-    lang,
-):
-    """如果上个layout的最后一个段落是列表，下一个layout的第一个段落也是列表，那么将他们连接起来。 TODO
-    因为没有区分列表和段落，所以这个方法暂时不实现。
-    根据layout_list_info判断是不是列表。，下个layout的第一个段如果不是列表，那么看他们是否有几行都有相同的缩进。"""
-    if (
-        len(pre_page_paras) == 0 or len(next_page_paras) == 0
-    ):  # 0的时候最后的return 会出错
-        return False
-    if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
-        return False
-    if (
-        pre_page_paras[-1][-1]['type'] != BlockType.Text
-        or next_page_paras[0][0]['type'] != BlockType.Text
-    ):
-        return False
-    if (
-        pre_page_list_info[1] and not next_page_list_info[0]
-    ):  # 前一个是列表结尾，后一个是非列表开头，此时检测是否有相同的缩进
-        if debug_able:
-            logger.info(f'连接page {page_num} 内的list')
-        # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
-        may_list_lines = []
-        next_page_first_para = next_page_paras[0][0]
-        if next_page_first_para['type'] == BlockType.Text:
-            lines = next_page_first_para['lines']
-            for line in lines:
-                if (
-                    line['bbox'][0]
-                    > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]
-                ):
-                    may_list_lines.append(line)
-                else:
-                    break
-        # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
-        if (
-            len(may_list_lines) > 0
-            and len(set([x['bbox'][0] for x in may_list_lines])) == 1
-        ):
-            # pre_page_paras[-1].append(may_list_lines)
-            # 下一页合并到上一页最后一段，打一个cross_page的标签
-            for line in may_list_lines:
-                for span in line['spans']:
-                    span[CROSS_PAGE] = True  # noqa: F405
-            pre_page_paras[-1][-1]['lines'].extend(may_list_lines)
-            next_page_first_para['lines'] = next_page_first_para['lines'][
-                len(may_list_lines) :
-            ]
-            return True
-
-    return False
-
-
-def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
-    """根据line找到所在的layout."""
-    for layout in layout_bboxes:
-        if is_in_layout(line_bbox, layout):
-            return layout
-    return None
-
-
-def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
-    """
-    layout之间进行分段。
-    主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
-    连接的条件需要同时满足：
-    1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。
-    2. 下一行开头不留空白。
-
-    """
-    connected_layout_blocks = []
-    if len(blocks_group) == 0:
-        return connected_layout_blocks
-
-    connected_layout_blocks.append(blocks_group[0])
-    for i in range(1, len(blocks_group)):
-        try:
-            if len(blocks_group[i]) == 0:
-                continue
-            if len(blocks_group[i - 1]) == 0:  # TODO 考虑连接问题，
-                connected_layout_blocks.append(blocks_group[i])
-                continue
-            # text类型的段才需要考虑layout间的合并
-            if (
-                blocks_group[i - 1][-1]['type'] != BlockType.Text
-                or blocks_group[i][0]['type'] != BlockType.Text
-            ):
-                connected_layout_blocks.append(blocks_group[i])
-                continue
-            if (
-                len(blocks_group[i - 1][-1]['lines']) == 0
-                or len(blocks_group[i][0]['lines']) == 0
-            ):
-                connected_layout_blocks.append(blocks_group[i])
-                continue
-            pre_last_line = blocks_group[i - 1][-1]['lines'][-1]
-            next_first_line = blocks_group[i][0]['lines'][0]
-        except Exception:
-            logger.error(f'page layout {i} has no line')
-            continue
-        pre_last_line_text = ''.join(
-            [__get_span_text(span) for span in pre_last_line['spans']]
-        )
-        pre_last_line_type = pre_last_line['spans'][-1]['type']
-        next_first_line_text = ''.join(
-            [__get_span_text(span) for span in next_first_line['spans']]
-        )
-        next_first_line_type = next_first_line['spans'][0]['type']
-        if pre_last_line_type not in [
-            TEXT,
-            INLINE_EQUATION,
-        ] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
-            connected_layout_blocks.append(blocks_group[i])
-            continue
-        pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
-        next_layout = __find_layout_bbox_by_line(
-            next_first_line['bbox'], new_layout_bbox
-        )
-
-        pre_x2_max = pre_layout[2] if pre_layout else -1
-        next_x0_min = next_layout[0] if next_layout else -1
-
-        pre_last_line_text = pre_last_line_text.strip()
-        next_first_line_text = next_first_line_text.strip()
-        if (
-            pre_last_line['bbox'][2] == pre_x2_max
-            and pre_last_line_text
-            and pre_last_line_text[-1] not in LINE_STOP_FLAG
-            and next_first_line['bbox'][0] == next_x0_min
-        ):  # 前面一行沾满了整个行，并且没有结尾符号.下一行没有空白开头。
-            """连接段落条件成立，将前一个layout的段落和后一个layout的段落连接。"""
-            connected_layout_blocks[-1][-1]['lines'].extend(blocks_group[i][0]['lines'])
-            blocks_group[i][0][
-                'lines'
-            ] = []  # 删除后一个layout第一个段落中的lines，因为他已经被合并到前一个layout的最后一个段落了
-            blocks_group[i][0][LINES_DELETED] = True  # noqa: F405
-            # if len(layout_paras[i]) == 0:
-            #     layout_paras.pop(i)
-            # else:
-            #     connected_layout_paras.append(layout_paras[i])
-            connected_layout_blocks.append(blocks_group[i])
-        else:
-            """连接段落条件不成立，将前一个layout的段落加入到结果中。"""
-            connected_layout_blocks.append(blocks_group[i])
-    return connected_layout_blocks
-
-
-def __connect_para_inter_page(
-    pre_page_paras,
-    next_page_paras,
-    pre_page_layout_bbox,
-    next_page_layout_bbox,
-    page_num,
-    lang,
-):
-    """
-    连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。
-    是否可以连接的条件：
-    1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。
-    2. 后一个页面的第一个段落第一行没有空白开头。
-    """
-    # 有的页面可能压根没有文字
-    if (
-        len(pre_page_paras) == 0
-        or len(next_page_paras) == 0
-        or len(pre_page_paras[0]) == 0
-        or len(next_page_paras[0]) == 0
-    ):  # TODO [[]]为什么出现在pre_page_paras里？
-        return False
-    pre_last_block = pre_page_paras[-1][-1]
-    next_first_block = next_page_paras[0][0]
-    if (
-        pre_last_block['type'] != BlockType.Text
-        or next_first_block['type'] != BlockType.Text
-    ):
-        return False
-    if len(pre_last_block['lines']) == 0 or len(next_first_block['lines']) == 0:
-        return False
-    pre_last_para = pre_last_block['lines']
-    next_first_para = next_first_block['lines']
-    pre_last_line = pre_last_para[-1]
-    next_first_line = next_first_para[0]
-    pre_last_line_text = ''.join(
-        [__get_span_text(span) for span in pre_last_line['spans']]
-    )
-    pre_last_line_type = pre_last_line['spans'][-1]['type']
-    next_first_line_text = ''.join(
-        [__get_span_text(span) for span in next_first_line['spans']]
-    )
-    next_first_line_type = next_first_line['spans'][0]['type']
-
-    if pre_last_line_type not in [
-        TEXT,
-        INLINE_EQUATION,
-    ] or next_first_line_type not in [
-        TEXT,
-        INLINE_EQUATION,
-    ]:  # TODO，真的要做好，要考虑跨table, image, 行间的情况
-        # 不是文本，不连接
-        return False
-
-    pre_x2_max_bbox = __find_layout_bbox_by_line(
-        pre_last_line['bbox'], pre_page_layout_bbox
-    )
-    if not pre_x2_max_bbox:
-        return False
-    next_x0_min_bbox = __find_layout_bbox_by_line(
-        next_first_line['bbox'], next_page_layout_bbox
-    )
-    if not next_x0_min_bbox:
-        return False
-
-    pre_x2_max = pre_x2_max_bbox[2]
-    next_x0_min = next_x0_min_bbox[0]
-
-    pre_last_line_text = pre_last_line_text.strip()
-    next_first_line_text = next_first_line_text.strip()
-    if (
-        pre_last_line['bbox'][2] == pre_x2_max
-        and pre_last_line_text[-1] not in LINE_STOP_FLAG
-        and next_first_line['bbox'][0] == next_x0_min
-    ):  # 前面一行沾满了整个行，并且没有结尾符号.下一行没有空白开头。
-        """连接段落条件成立，将前一个layout的段落和后一个layout的段落连接。"""
-        # 下一页合并到上一页最后一段，打一个cross_page的标签
-        for line in next_first_para:
-            for span in line['spans']:
-                span[CROSS_PAGE] = True  # noqa: F405
-        pre_last_para.extend(next_first_para)
-
-        # next_page_paras[0].pop(0)  # 删除后一个页面的第一个段落， 因为他已经被合并到前一个页面的最后一个段落了。
-        next_page_paras[0][0]['lines'] = []
-        next_page_paras[0][0][LINES_DELETED] = True  # noqa: F405
-        return True
-    else:
-        return False
-
-
-def find_consecutive_true_regions(input_array):
-    start_index = None  # 连续True区域的起始索引
-    regions = []  # 用于保存所有连续True区域的起始和结束索引
-
-    for i in range(len(input_array)):
-        # 如果我们找到了一个True值，并且当前并没有在连续True区域中
-        if input_array[i] and start_index is None:
-            start_index = i  # 记录连续True区域的起始索引
-
-        # 如果我们找到了一个False值，并且当前在连续True区域中
-        elif not input_array[i] and start_index is not None:
-            # 如果连续True区域长度大于1，那么将其添加到结果列表中
-            if i - start_index > 1:
-                regions.append((start_index, i - 1))
-            start_index = None  # 重置起始索引
-
-    # 如果最后一个元素是True，那么需要将最后一个连续True区域加入到结果列表中
-    if start_index is not None and len(input_array) - start_index > 1:
-        regions.append((start_index, len(input_array) - 1))
-
-    return regions
-
-
-def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
-    global debug_able
-    """
-    找出来中间对齐的连续单行文本，如果连续行高度相同，那么合并为一个段落。
-    一个line居中的条件是：
-    1. 水平中心点跨越layout的中心点。
-    2. 左右两侧都有空白
-    """
-
-    for layout_i, layout_para in enumerate(page_paras):
-        layout_box = new_layout_bbox[layout_i]
-        single_line_paras_tag = []
-        for i in range(len(layout_para)):
-            # single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
-            single_line_paras_tag.append(
-                layout_para[i]['type'] == BlockType.Text
-                and len(layout_para[i]['lines']) == 1
-            )
-        """找出来连续的单行文本，如果连续行高度相同，那么合并为一个段落。"""
-        consecutive_single_line_indices = find_consecutive_true_regions(
-            single_line_paras_tag
-        )
-        if len(consecutive_single_line_indices) > 0:
-            """检查这些行是否是高度相同的，居中的."""
-            for start, end in consecutive_single_line_indices:
-                # start += index_offset
-                # end += index_offset
-                line_hi = np.array(
-                    [
-                        block['lines'][0]['bbox'][3] - block['lines'][0]['bbox'][1]
-                        for block in layout_para[start : end + 1]
-                    ]
-                )
-                first_line_text = ''.join(
-                    [
-                        __get_span_text(span)
-                        for span in layout_para[start]['lines'][0]['spans']
-                    ]
-                )
-                if 'Table' in first_line_text or 'Figure' in first_line_text:
-                    pass
-                if debug_able:
-                    logger.info(line_hi.std())
-
-                if line_hi.std() < 2:
-                    """行高度相同，那么判断是否居中."""
-                    all_left_x0 = [
-                        block['lines'][0]['bbox'][0]
-                        for block in layout_para[start : end + 1]
-                    ]
-                    all_right_x1 = [
-                        block['lines'][0]['bbox'][2]
-                        for block in layout_para[start : end + 1]
-                    ]
-                    layout_center = (layout_box[0] + layout_box[2]) / 2
-                    if (
-                        all(
-                            [
-                                x0 < layout_center < x1
-                                for x0, x1 in zip(all_left_x0, all_right_x1)
-                            ]
-                        )
-                        and not all([x0 == layout_box[0] for x0 in all_left_x0])
-                        and not all([x1 == layout_box[2] for x1 in all_right_x1])
-                    ):
-                        merge_para = [
-                            block['lines'][0] for block in layout_para[start : end + 1]
-                        ]
-                        para_text = ''.join(
-                            [
-                                __get_span_text(span)
-                                for line in merge_para
-                                for span in line['spans']
-                            ]
-                        )
-                        if debug_able:
-                            logger.info(para_text)
-                        layout_para[start]['lines'] = merge_para
-                        for i_para in range(start + 1, end + 1):
-                            layout_para[i_para]['lines'] = []
-                            layout_para[i_para][LINES_DELETED] = True  # noqa: F405
-                        # layout_para[start:end + 1] = [merge_para]
-
-                        # index_offset -= end - start
-
-    return
-
-
-def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
-    """找出来连续的单行文本，如果首行顶格，接下来的几个单行段落缩进对齐，那么合并为一个段落。"""
-
-    pass
-
-
-def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
-    """根据line和layout情况进行分段 先实现一个根据行末尾特征分段的简单方法。"""
-    """
-    算法思路：
-    1. 扫描layout里每一行，找出来行尾距离layout有边界有一定距离的行。
-    2. 从上述行中找到末尾是句号等可作为断行标志的行。
-    3. 参照上述行尾特征进行分段。
-    4. 图、表，目前独占一行，不考虑分段。
-    """
-    blocks_group = __group_line_by_layout(blocks, layout_bboxes)  # block内分段
-    layout_list_info = __split_para_in_layoutbox(
-        blocks_group, new_layout_bbox, lang
-    )  # layout内分段
-    blocks_group, page_list_info = __connect_list_inter_layout(
-        blocks_group, new_layout_bbox, layout_list_info, page_num, lang
-    )  # layout之间连接列表段落
-    connected_layout_blocks = __connect_para_inter_layoutbox(
-        blocks_group, new_layout_bbox
-    )  # layout间链接段落
-
-    return connected_layout_blocks, page_list_info
-
-
-def para_split(pdf_info_dict, debug_mode, lang='en'):
-    global debug_able
-    debug_able = debug_mode
-    new_layout_of_pages = []  # 数组的数组，每个元素是一个页面的layoutS
-    all_page_list_info = []  # 保存每个页面开头和结尾是否是列表
-    for page_num, page in pdf_info_dict.items():
-        blocks = copy.deepcopy(page['preproc_blocks'])
-        layout_bboxes = page['layout_bboxes']
-        new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
-        new_layout_of_pages.append(new_layout_bbox)
-        splited_blocks, page_list_info = __do_split_page(
-            blocks, layout_bboxes, new_layout_bbox, page_num, lang
-        )
-        all_page_list_info.append(page_list_info)
-        page['para_blocks'] = splited_blocks
-
-    """连接页面与页面之间的可能合并的段落"""
-    pdf_infos = list(pdf_info_dict.values())
-    for page_num, page in enumerate(pdf_info_dict.values()):
-        if page_num == 0:
-            continue
-        pre_page_paras = pdf_infos[page_num - 1]['para_blocks']
-        next_page_paras = pdf_infos[page_num]['para_blocks']
-        pre_page_layout_bbox = new_layout_of_pages[page_num - 1]
-        next_page_layout_bbox = new_layout_of_pages[page_num]
-
-        is_conn = __connect_para_inter_page(
-            pre_page_paras,
-            next_page_paras,
-            pre_page_layout_bbox,
-            next_page_layout_bbox,
-            page_num,
-            lang,
-        )
-        if debug_able:
-            if is_conn:
-                logger.info(f'连接了第{page_num - 1}页和第{page_num}页的段落')
-
-        is_list_conn = __connect_list_inter_page(
-            pre_page_paras,
-            next_page_paras,
-            pre_page_layout_bbox,
-            next_page_layout_bbox,
-            all_page_list_info[page_num - 1],
-            all_page_list_info[page_num],
-            page_num,
-            lang,
-        )
-        if debug_able:
-            if is_list_conn:
-                logger.info(f'连接了第{page_num - 1}页和第{page_num}页的列表段落')
-
-    """接下来可能会漏掉一些特别的一些可以合并的内容，对他们进行段落连接
-    1. 正文中有时出现一个行顶格，接下来几行缩进的情况。
-    2. 居中的一些连续单行，如果高度相同，那么可能是一个段落。
-    """
-    for page_num, page in enumerate(pdf_info_dict.values()):
-        page_paras = page['para_blocks']
-        new_layout_bbox = new_layout_of_pages[page_num]
-        __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang)
-        __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
-
-    # layout展平
-    for page_num, page in enumerate(pdf_info_dict.values()):
-        page_paras = page['para_blocks']
-        page_blocks = [block for layout in page_paras for block in layout]
-        page['para_blocks'] = page_blocks
--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
                    first_span = first_line['spans'][0]
                    if len(first_span['content']) > 0:
                        span_start_with_num = first_span['content'][0].isdigit()
+                        span_start_with_big_char = first_span['content'][0].isupper()
                        if (
-                            abs(block2['bbox_fs'][2] - last_line['bbox'][2])
-                            < line_height
+                            # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
+                            abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
+                            # 上一个block的最后一个span不是以特定符号结尾
                            and not last_span['content'].endswith(LINE_STOP_FLAG)
                            # 两个block宽度差距超过2倍也不合并
                            and abs(block1_weight - block2_weight) < min_block_weight
+                            # the next block's first character must NOT be a digit
                            and not span_start_with_num
+                            # the next block's first character must NOT be an uppercase letter
+                            and not span_start_with_big_char
                        ):
                            if block1['page_num'] != block2['page_num']:
                                for line in block1['lines']:

--- a/magic_pdf/para/raw_processor.py
+++ b/magic_pdf/para/raw_processor.py
-class RawBlockProcessor:
-    def __init__(self) -> None:
-        self.y_tolerance = 2
-        self.pdf_dic = {}
-
-    def __span_flags_decomposer(self, span_flags):
-        """
-        Make font flags human readable.
-
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-
-        span_flags : int
-            span flags
-
-        Returns
-        -------
-        l : dict
-            decomposed flags
-        """
-
-        l = {
-            "is_superscript": False,
-            "is_italic": False,
-            "is_serifed": False,
-            "is_sans_serifed": False,
-            "is_monospaced": False,
-            "is_proportional": False,
-            "is_bold": False,
-        }
-
-        if span_flags & 2**0:
-            l["is_superscript"] = True  # 表示上标
-
-        if span_flags & 2**1:
-            l["is_italic"] = True  # 表示斜体
-
-        if span_flags & 2**2:
-            l["is_serifed"] = True  # 表示衬线字体
-        else:
-            l["is_sans_serifed"] = True  # 表示非衬线字体
-
-        if span_flags & 2**3:
-            l["is_monospaced"] = True  # 表示等宽字体
-        else:
-            l["is_proportional"] = True  # 表示比例字体
-
-        if span_flags & 2**4:
-            l["is_bold"] = True  # 表示粗体
-
-        return l
-
-    def __make_new_lines(self, raw_lines):
-        """
-        This function makes new lines.
-
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-
-        raw_lines : list
-            raw lines
-
-        Returns
-        -------
-        new_lines : list
-            new lines
-        """
-        new_lines = []
-        new_line = None
-
-        for raw_line in raw_lines:
-            raw_line_bbox = raw_line["bbox"]
-            raw_line_spans = raw_line["spans"]
-            raw_line_text = "".join([span["text"] for span in raw_line_spans])
-            raw_line_dir = raw_line.get("dir", None)
-
-            decomposed_line_spans = []
-            for span in raw_line_spans:
-                raw_flags = span["flags"]
-                decomposed_flags = self.__span_flags_decomposer(raw_flags)
-                span["decomposed_flags"] = decomposed_flags
-                decomposed_line_spans.append(span)
-
-            if new_line is None:
-                new_line = {
-                    "bbox": raw_line_bbox,
-                    "text": raw_line_text,
-                    "dir": raw_line_dir if raw_line_dir else (0, 0),
-                    "spans": decomposed_line_spans,
-                }
-            else:
-                if (
-                    abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
-                    and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
-                ):
-                    new_line["bbox"] = (
-                        min(new_line["bbox"][0], raw_line_bbox[0]),  # left
-                        new_line["bbox"][1],  # top
-                        max(new_line["bbox"][2], raw_line_bbox[2]),  # right
-                        raw_line_bbox[3],  # bottom
-                    )
-                    new_line["text"] += " " + raw_line_text
-                    new_line["spans"].extend(raw_line_spans)
-                    new_line["dir"] = (
-                        new_line["dir"][0] + raw_line_dir[0],
-                        new_line["dir"][1] + raw_line_dir[1],
-                    )
-                else:
-                    new_lines.append(new_line)
-                    new_line = {
-                        "bbox": raw_line_bbox,
-                        "text": raw_line_text,
-                        "dir": raw_line_dir if raw_line_dir else (0, 0),
-                        "spans": raw_line_spans,
-                    }
-        if new_line:
-            new_lines.append(new_line)
-
-        return new_lines
-
-    def __make_new_block(self, raw_block):
-        """
-        This function makes a new block.
-
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        ----------
-        raw_block : dict
-            a raw block
-
-        Returns
-        -------
-        new_block : dict
-
-        Schema of new_block:
-        {
-            "block_id": "block_1",
-            "bbox": [0, 0, 100, 100],
-            "text": "This is a block.",
-            "lines": [
-                {
-                    "bbox": [0, 0, 100, 100],
-                    "text": "This is a line.",
-                    "spans": [
-                        {
-                            "text": "This is a span.",
-                            "font": "Times New Roman",
-                            "size": 12,
-                            "color": "#000000",
-                        }
-                    ],
-                }
-            ],
-        }
-        """
-        new_block = {}
-
-        block_id = raw_block["number"]
-        block_bbox = raw_block["bbox"]
-        block_text = " ".join(span["text"] for line in raw_block["lines"] for span in line["spans"])
-        raw_lines = raw_block["lines"]
-        block_lines = self.__make_new_lines(raw_lines)
-
-        new_block["block_id"] = block_id
-        new_block["bbox"] = block_bbox
-        new_block["text"] = block_text
-        new_block["lines"] = block_lines
-
-        return new_block
-
-    def batch_process_blocks(self, pdf_dic):
-        """
-        This function processes the blocks in batch.
-
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        ----------
-        blocks : list
-            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json.
-
-        Returns
-        -------
-        result_dict : dict
-            result dictionary
-        """
-
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "preproc_blocks" in blocks.keys():
-                    input_blocks = blocks["preproc_blocks"]
-                    for raw_block in input_blocks:
-                        new_block = self.__make_new_block(raw_block)
-                        para_blocks.append(new_block)
-
-                blocks["para_blocks"] = para_blocks
-
-        return pdf_dic
-
--- a/magic_pdf/para/stats.py
+++ b/magic_pdf/para/stats.py
-from collections import Counter
-import numpy as np
-
-from magic_pdf.para.commons import *
-
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-class BlockStatisticsCalculator:
-    def __init__(self) -> None:
-        pass
-
-    def __calc_stats_of_new_lines(self, new_lines):
-        """
-        This function calculates the paragraph metrics
-
-        Parameters
-        ----------
-        combined_lines : list
-            combined lines
-
-        Returns
-        -------
-        X0 : float
-            Median of x0 values, which represents the left average boundary of the block
-        X1 : float
-            Median of x1 values, which represents the right average boundary of the block
-        avg_char_width : float
-            Average of char widths, which represents the average char width of the block
-        avg_char_height : float
-            Average of line heights, which represents the average line height of the block
-
-        """
-        x0_values = []
-        x1_values = []
-        char_widths = []
-        char_heights = []
-
-        block_font_types = []
-        block_font_sizes = []
-        block_directions = []
-
-        if len(new_lines) > 0:
-            for i, line in enumerate(new_lines):
-                line_bbox = line["bbox"]
-                line_text = line["text"]
-                line_spans = line["spans"]
-
-                num_chars = len([ch for ch in line_text if not ch.isspace()])
-
-                x0_values.append(line_bbox[0])
-                x1_values.append(line_bbox[2])
-
-                if num_chars > 0:
-                    char_width = (line_bbox[2] - line_bbox[0]) / num_chars
-                    char_widths.append(char_width)
-
-                for span in line_spans:
-                    block_font_types.append(span["font"])
-                    block_font_sizes.append(span["size"])
-
-                if "dir" in line:
-                    block_directions.append(line["dir"])
-
-                # line_font_types = [span["font"] for span in line_spans]
-                char_heights = [span["size"] for span in line_spans]
-
-        X0 = np.median(x0_values) if x0_values else 0
-        X1 = np.median(x1_values) if x1_values else 0
-        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
-        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
-
-        # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
-
-        max_span_length = 0
-        max_span_font_type = None
-        for line in new_lines:
-            line_spans = line["spans"]
-            for span in line_spans:
-                span_length = span["bbox"][2] - span["bbox"][0]
-                if span_length > max_span_length:
-                    max_span_length = span_length
-                    max_span_font_type = span["font"]
-
-        max_freq_font_type = max_span_font_type
-
-        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
-
-        avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0
-        avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0
-
-        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
-
-        return (
-            X0,
-            X1,
-            avg_char_width,
-            avg_char_height,
-            max_freq_font_type,
-            avg_font_size,
-            (avg_dir_horizontal, avg_dir_vertical),
-            median_font_size,
-        )
-
-    def __make_new_block(self, input_block):
-        new_block = {}
-
-        raw_lines = input_block["lines"]
-        stats = self.__calc_stats_of_new_lines(raw_lines)
-
-        block_id = input_block["block_id"]
-        block_bbox = input_block["bbox"]
-        block_text = input_block["text"]
-        block_lines = raw_lines
-        block_avg_left_boundary = stats[0]
-        block_avg_right_boundary = stats[1]
-        block_avg_char_width = stats[2]
-        block_avg_char_height = stats[3]
-        block_font_type = stats[4]
-        block_font_size = stats[5]
-        block_direction = stats[6]
-        block_median_font_size = stats[7]
-
-        new_block["block_id"] = block_id
-        new_block["bbox"] = block_bbox
-        new_block["text"] = block_text
-        new_block["dir"] = block_direction
-        new_block["X0"] = block_avg_left_boundary
-        new_block["X1"] = block_avg_right_boundary
-        new_block["avg_char_width"] = block_avg_char_width
-        new_block["avg_char_height"] = block_avg_char_height
-        new_block["block_font_type"] = block_font_type
-        new_block["block_font_size"] = block_font_size
-        new_block["lines"] = block_lines
-        new_block["median_font_size"] = block_median_font_size
-
-        return new_block
-
-    def batch_process_blocks(self, pdf_dic):
-        """
-        This function processes the blocks in batch.
-
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        ----------
-        blocks : list
-            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json
-
-        Returns
-        -------
-        result_dict : dict
-            result dictionary
-        """
-
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in blocks.keys():
-                    input_blocks = blocks["para_blocks"]
-                    for input_block in input_blocks:
-                        new_block = self.__make_new_block(input_block)
-                        para_blocks.append(new_block)
-
-                blocks["para_blocks"] = para_blocks
-
-        return pdf_dic
-
-
-class DocStatisticsCalculator:
-    def __init__(self) -> None:
-        pass
-
-    def calc_stats_of_doc(self, pdf_dict):
-        """
-        This function computes the statistics of the document
-
-        Parameters
-        ----------
-        result_dict : dict
-            result dictionary
-
-        Returns
-        -------
-        statistics : dict
-            statistics of the document
-        """
-
-        total_text_length = 0
-        total_num_blocks = 0
-
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "para_blocks" in blocks.keys():
-                    para_blocks = blocks["para_blocks"]
-                    for para_block in para_blocks:
-                        total_text_length += len(para_block["text"])
-                        total_num_blocks += 1
-
-        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
-
-        font_list = []
-
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "para_blocks" in blocks.keys():
-                    input_blocks = blocks["para_blocks"]
-                    for input_block in input_blocks:
-                        block_text_length = len(input_block.get("text", ""))
-                        if block_text_length < avg_text_length * 0.5:
-                            continue
-                        block_font_type = safe_get(input_block, "block_font_type", "")
-                        block_font_size = safe_get(input_block, "block_font_size", 0)
-                        font_list.append((block_font_type, block_font_size))
-
-        font_counter = Counter(font_list)
-        most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
-        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
-
-        statistics = {
-            "num_pages": 0,
-            "num_blocks": 0,
-            "num_paras": 0,
-            "num_titles": 0,
-            "num_header_blocks": 0,
-            "num_footer_blocks": 0,
-            "num_watermark_blocks": 0,
-            "num_vertical_margin_note_blocks": 0,
-            "most_common_font_type": most_common_font[0][0],
-            "most_common_font_size": most_common_font[0][1],
-            "number_of_most_common_font": most_common_font[1],
-            "second_most_common_font_type": second_most_common_font[0][0],
-            "second_most_common_font_size": second_most_common_font[0][1],
-            "number_of_second_most_common_font": second_most_common_font[1],
-            "avg_text_length": avg_text_length,
-        }
-
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                blocks = pdf_dict[page_id]["para_blocks"]
-                statistics["num_pages"] += 1
-                for block_id, block_data in enumerate(blocks):
-                    statistics["num_blocks"] += 1
-
-                    if "paras" in block_data.keys():
-                        statistics["num_paras"] += len(block_data["paras"])
-
-                    for line in block_data["lines"]:
-                        if line.get("is_title", 0):
-                            statistics["num_titles"] += 1
-
-                    if block_data.get("is_header", 0):
-                        statistics["num_header_blocks"] += 1
-                    if block_data.get("is_footer", 0):
-                        statistics["num_footer_blocks"] += 1
-                    if block_data.get("is_watermark", 0):
-                        statistics["num_watermark_blocks"] += 1
-                    if block_data.get("is_vertical_margin_note", 0):
-                        statistics["num_vertical_margin_note_blocks"] += 1
-
-        pdf_dict["statistics"] = statistics
-
-        return pdf_dict
-
-
--- a/magic_pdf/para/title_processor.py
+++ b/magic_pdf/para/title_processor.py
-import os
-import re
-import numpy as np
-
-from magic_pdf.libs.nlp_utils import NLPModels
-
-from magic_pdf.para.commons import *
-
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-class TitleProcessor:
-    def __init__(self, *doc_statistics) -> None:
-        if len(doc_statistics) > 0:
-            self.doc_statistics = doc_statistics[0]
-
-        self.nlp_model = NLPModels()
-        self.MAX_TITLE_LEVEL = 3
-        self.numbered_title_pattern = r"""
-            ^                                 # 行首
-            (                                 # 开始捕获组
-                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
-                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
-                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
-                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
-                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
-                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
-                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
-                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
-                |[\（|\(][一二三四五六七八九十百千]+[\）|\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
-                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
-                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
-                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
-                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
-                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
-                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
-                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
-                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
-            )                                 # 结束捕获组
-            .+                                # 标题的其余部分
-        """
-
-    def _is_potential_title(
-        self,
-        curr_line,
-        prev_line,
-        prev_line_is_title,
-        next_line,
-        avg_char_width,
-        avg_char_height,
-        median_font_size,
-    ):
-        """
-        This function checks if the line is a potential title.
-
-        Parameters
-        ----------
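-        curr_line : dict
-            current line
-        prev_line : dict or None
-            previous line, None if the current line is the first line of the block
-        prev_line_is_title : bool
-            whether the previous line was detected as a title (currently not used in the decision)
-        next_line : dict or None
-            next line, None if the current line is the last line of the block
-        avg_char_width : float
-            average of char widths
-        avg_char_height : float
-            average of line heights
-        median_font_size : float
-            median font size, used to compute the vertical spacing threshold
-
-        Returns
-        -------
-        tuple of (bool, bool)
-            (is_title, is_author_or_org_list) for the current line.
-            Note: a line whose text is empty returns a bare False instead of a tuple.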
-        """
-
-        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
-            """
-            This function checks if the line is centered on the page
-
-            Parameters
-            ----------
-            line_bbox : list
-                bbox of the line
-            page_bbox : list
-                bbox of the page
-            avg_char_width : float
-                average of char widths
-
-            Returns
-            -------
-            bool
-                True if the line is centered on the page, False otherwise.
-            """
-            horizontal_ratio = 0.5
-            horizontal_thres = horizontal_ratio * avg_char_width
-
-            x0, _, x1, _ = line_bbox
-            _, _, page_x1, _ = page_bbox
-
-            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
-
-        def __is_bold_font_line(line):
-            """
-            Check if every span of the line is non-blank text in a bold font.
-            """
-
-            def _is_bold_span(span):
-                # if span text is empty or only contains space, return False
-                if not span["text"].strip():
-                    return False
-
-                return bool(span["flags"] & 2**4)  # Check if the font is bold
-
-            for span in line["spans"]:
-                if not _is_bold_span(span):
-                    return False
-
-            return True
-
-        def __is_italic_font_line(line):
-            """
-            Check if every span of the line uses an italic font.
-            """
-
-            def __is_italic_span(span):
-                return bool(span["flags"] & 2**1)  # Check if the font is italic
-
-            for span in line["spans"]:
-                if not __is_italic_span(span):
-                    return False
-
-            return True
-
-        def __is_punctuation_heavy(line_text):
-            """
-            Check if the line contains a high ratio of punctuation marks, which may indicate
-            that the line is not a title.
-
-            Parameters:
-            line_text (str): Text of the line.
-
-            Returns:
-            bool: True if the line is heavy with punctuation, False otherwise.
-            """
-            # Pattern for common title format like "X.Y. Title"
-            pattern = r"\b\d+\.\d+\..*\b"
-
-            # If the line matches the title format, return False
-            if re.match(pattern, line_text.strip()):
-                return False
-
-            # Find all punctuation marks in the line
-            punctuation_marks = re.findall(r"[^\w\s]", line_text)
-            number_of_punctuation_marks = len(punctuation_marks)
-
-            text_length = len(line_text)
-
-            if text_length == 0:
-                return False
-
-            punctuation_ratio = number_of_punctuation_marks / text_length
-            if punctuation_ratio >= 0.1:
-                return True
-
-            return False
-
-        def __has_mixed_font_styles(spans, strict_mode=False):
-            """
-            This function checks if the line has mixed font styles, the strict mode will compare the font types
-
-            Parameters
-            ----------
-            spans : list
-                spans of the line
-            strict_mode : bool
-                True for strict mode, the font types will be fully compared
-                False for non-strict mode, the font types are considered mixed only when they share no common prefix
-
-            Returns
-            -------
-            bool
-                True if the line has mixed font styles, False otherwise.
-            """
-            if strict_mode:
-                font_styles = set()
-                for span in spans:
-                    font_style = span["font"].lower()
-                    font_styles.add(font_style)
-
-                return len(font_styles) > 1
-
-            else:  # non-strict mode
-                font_styles = []
-                for span in spans:
-                    font_style = span["font"].lower()
-                    font_styles.append(font_style)
-
-                if len(font_styles) > 1:
-                    longest_common_prefix = os.path.commonprefix(font_styles)
-                    if len(longest_common_prefix) > 0:
-                        return False
-                    else:
-                        return True
-                else:
-                    return False
-
-        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
-            """
-            This function checks if the current line has a different font type from the previous and next lines
-
-            Parameters
-            ----------
-            curr_line_font_type : str
-                font type of the current line
-            prev_line_font_type : str
-                font type of the previous line
-            next_line_font_type : str
-                font type of the next line
-
-            Returns
-            -------
-            bool
-                True if the current line has a different font type from the previous and next lines, False otherwise.
-            """
-            return all(
-                curr_line_font_type != other_font_type.lower()
-                for other_font_type in [prev_line_font_type, next_line_font_type]
-                if other_font_type is not None
-            )
-
-        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
-            """
-            This function checks if the current line has a larger font size than the previous and next lines
-
-            Parameters
-            ----------
-            curr_line_font_size : float
-                font size of the current line
-            prev_line_font_size : float
-                font size of the previous line
-            next_line_font_size : float
-                font size of the next line
-
-            Returns
-            -------
-            bool
-                True if the current line has a larger font size than the previous and next lines, False otherwise.
-            """
-            return all(
-                curr_line_font_size > other_font_size * 1.2
-                for other_font_size in [prev_line_font_size, next_line_font_size]
-                if other_font_size is not None
-            )
-
-        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
-            """
-            This function checks if the current line is similar to the previous line
-
-            Parameters
-            ----------
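-            curr_line_font_type : str
-                font type of the current line
-            prev_line_font_type : str or None
-                font type of the previous line
-            curr_line_font_size : float
-                font size of the current line
-            prev_line_font_size : float or None
-                font size of the previous line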
-
-            Returns
-            -------
-            bool
-                True if the current line is similar to the previous line, False otherwise.
-            """
-
-            if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
-                return True
-            else:
-                return False
-
-        def __is_same_font_type_of_docAvg(curr_line_font_type):
-            """
-            This function checks if the current line has the same font type as the document average font type
-
-            Parameters
-            ----------
-            curr_line_font_type : str
-                font type of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line has the same font type as the document average font type, False otherwise.
-            """
-            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
-            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
-
-            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
-
-        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
-            """
-            This function checks if the current line has a large enough font size
-
-            Parameters
-            ----------
-            curr_line_font_size : float
-                font size of the current line
-            ratio : float
-                ratio of the current line font size to the document average font size
-
-            Returns
-            -------
-            bool
-                True if the current line has a large enough font size, False otherwise.
-            """
-            doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
-            doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
-            doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
-
-            return curr_line_font_size >= doc_avg_font_size * ratio
-
-        def __is_sufficient_spacing_above_and_below(
-            curr_line_bbox,
-            prev_line_bbox,
-            next_line_bbox,
-            avg_char_height,
-            median_font_size,
-        ):
-            """
-            This function checks if the current line has sufficient spacing above and below
-
-            Parameters
-            ----------
-            curr_line_bbox : list
-                bbox of the current line
-            prev_line_bbox : list
-                bbox of the previous line
-            next_line_bbox : list
-                bbox of the next line
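-            avg_char_height : float
-                average of line heights (not used by this check)
-            median_font_size : float
-                median font size; the spacing threshold is 1.25 times this value
-
-            Returns
-            -------
-            tuple of (bool, bool)
-                (sufficient spacing above, sufficient spacing below); a side without a neighboring line counts as sufficient.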
-            """
-            vertical_ratio = 1.25
-            vertical_thres = vertical_ratio * median_font_size
-
-            _, y0, _, y1 = curr_line_bbox
-
-            sufficient_spacing_above = False
-            if prev_line_bbox:
-                vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
-                sufficient_spacing_above = vertical_spacing_above > vertical_thres
-            else:
-                sufficient_spacing_above = True
-
-            sufficient_spacing_below = False
-            if next_line_bbox:
-                vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
-                sufficient_spacing_below = vertical_spacing_below > vertical_thres
-            else:
-                sufficient_spacing_below = True
-
-            return (sufficient_spacing_above, sufficient_spacing_below)
-
-        def __is_word_list_line_by_rules(curr_line_text):
-            """
-            This function checks if the current line is a word list
-
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line is a name list, False otherwise.
-            """
-            # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[，,;；\s]|$)"
-            name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
-
-            compiled_pattern = re.compile(name_list_pattern)
-
-            if compiled_pattern.search(curr_line_text):
-                return True
-            else:
-                return False
-
-        # """
-        def __get_text_catgr_by_nlp(curr_line_text):
-            """
-            This function returns the entity category that the nlp model, such as spacy, detects for the current line
-
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-
-            Returns
-            -------
-            result
-                value returned by nlp_model.detect_entity_catgr_using_nlp; the caller compares it against "PERSON", "GPE" and "ORG".
-            """
-
-            result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
-
-            return result
-
-        # """
-
-        def __is_numbered_title(curr_line_text):
-            """
-            This function checks if the current line is a numbered list
-
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line is a numbered list, False otherwise.
-            """
-
-            compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
-
-            if compiled_pattern.search(curr_line_text):
-                return True
-            else:
-                return False
-
-        def __is_end_with_ending_puncs(line_text):
-            """
-            This function checks if the current line ends with a ending punctuation mark
-
-            Parameters
-            ----------
-            line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line ends with a punctuation mark, False otherwise.
-            """
-            end_puncs = [".", "?", "!", "。", "？", "！", "…"]
-
-            line_text = line_text.rstrip()
-            if line_text[-1] in end_puncs:
-                return True
-
-            return False
-
-        def __contains_only_no_meaning_symbols(line_text):
-            """
-            This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
-            Situation contains:
-            1. Only have punctuation marks
-            2. Only have other non-meaning symbols
-
-            Parameters
-            ----------
-            line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line contains only symbols that have no meaning, False otherwise.
-            """
-
-            punctuation_marks = re.findall(r"[^\w\s]", line_text)  # find all punctuation marks
-            number_of_punctuation_marks = len(punctuation_marks)
-
-            text_length = len(line_text)
-
-            if text_length == 0:
-                return False
-
-            punctuation_ratio = number_of_punctuation_marks / text_length
-            if punctuation_ratio >= 0.9:
-                return True
-
-            return False
-
-        def __is_equation(line_text):
-            """
-            This function checks if the current line is an equation.
-
-            Parameters
-            ----------
-            line_text : str
-
-            Returns
-            -------
-            bool
-                True if the current line is an equation, False otherwise.
-            """
-            equation_reg = r"\$.*?\\overline.*?\$"  # to match interline equations
-
-            if re.search(equation_reg, line_text):
-                return True
-            else:
-                return False
-
-        def __is_title_by_len(text, max_length=200):
-            """
-            This function checks if the current line is a title by length.
-
-            Parameters
-            ----------
-            text : str
-                text of the current line
-
-            max_length : int
-                max length of the title
-
-            Returns
-            -------
-            bool
-                True if the current line is a title, False otherwise.
-
-            """
-            text = text.strip()
-            return len(text) <= max_length
-
-        def __compute_line_font_type_and_size(curr_line):
-            """
-            This function computes the font type and font size of the line.
-
-            Parameters
-            ----------
-            line : dict
-                line
-
-            Returns
-            -------
-            font_type : str
-                font type of the line
-            font_size : float
-                font size of the line
-            """
-            spans = curr_line["spans"]
-            max_accumulated_length = 0
-            max_span_font_size = curr_line["spans"][0]["size"]  # default value, float type
-            max_span_font_type = curr_line["spans"][0]["font"].lower()  # default value, string type
-            for span in spans:
-                if span["text"].isspace():
-                    continue
-                span_length = span["bbox"][2] - span["bbox"][0]
-                if span_length > max_accumulated_length:
-                    max_accumulated_length = span_length
-                    max_span_font_size = span["size"]
-                    max_span_font_type = span["font"].lower()
-
-            return max_span_font_type, max_span_font_size
-
-        """
-        Title detecting main Process.
-        """
-
-        """
-        Basic features about the current line.
-        """
-        curr_line_bbox = curr_line["bbox"]
-        curr_line_text = curr_line["text"]
-        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
-
-        if len(curr_line_text.strip()) == 0:  # skip empty lines
-            return False
-
-        prev_line_bbox = prev_line["bbox"] if prev_line else None
-        if prev_line:
-            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
-        else:
-            prev_line_font_type, prev_line_font_size = None, None
-
-        next_line_bbox = next_line["bbox"] if next_line else None
-        if next_line:
-            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
-        else:
-            next_line_font_type, next_line_font_size = None, None
-
-        """
-        Aggregated features about the current line.
-        """
-        is_italc_font = __is_italic_font_line(curr_line)
-        is_bold_font = __is_bold_font_line(curr_line)
-
-        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
-        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
-        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
-
-        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
-
-        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
-
-        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
-        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
-
-        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
-
-        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
-        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
-
-        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
-            curr_line_font_size, prev_line_font_size, next_line_font_size
-        )
-
-        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
-            curr_line_font_type, prev_line_font_type, next_line_font_type
-        )
-
-        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
-        )
-
-        is_similar_to_pre_line = __is_similar_to_pre_line(
-            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
-        )
-
-        """
-        Further aggregated features about the current line.
-        
-        Attention:
-            Features that start with __ are for internal use.
-        """
-
-        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
-        )
-        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
-        is_a_left_inline_title = (
-            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
-        )
-
-        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
-        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
-
-        is_title_by_check_pre_and_next_line = (
-            (prev_line is not None or next_line is not None)
-            and has_sufficient_spaces_above
-            and has_sufficient_spaces_below
-            and is_potential_title_font
-        )
-
-        is_numbered_title = __is_numbered_title(curr_line_text) and (
-            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
-        )
-
-        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
-
-        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
-
-        is_equation = __is_equation(curr_line_text)
-
-        is_title_by_len = __is_title_by_len(curr_line_text)
-
-        """
-        Decide if the line is a title.
-        """
-        # is_title = False
-        # if prev_line_is_title:
-
-        is_title = (
-            is_not_end_with_ending_puncs  # not end with ending punctuation marks
-            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
-            and is_title_by_len  # is a title by length, default max length is 200
-            and not is_equation  # an interline equation should never be a title
-            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
-            and (
-                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
-                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                or (
-                    is_much_larger_font_than_doc_avg
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-                or (
-                    is_font_size_little_less_than_doc_avg
-                    and is_bold_font
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-            )  # not the same font type as the document average font type, which includes the most common font type and the second most common font type
-            and (
-                (
-                    not is_person_or_org_list_line_by_nlp
-                    and (
-                        is_much_larger_font_than_doc_avg
-                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
-                    )
-                )
-                or (
-                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
-                    and not is_a_left_inline_title
-                    and not is_punctuation_heavy
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-                or (
-                    is_person_or_org_list_line_by_nlp
-                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                )
-                or (is_numbered_title and not is_a_left_inline_title)
-            )
-        )
-        # ) or (is_similar_to_pre_line and prev_line_is_title)
-
-        is_name_or_org_list_to_be_removed = (
-            (is_person_or_org_list_line_by_nlp)
-            and is_punctuation_heavy
-            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-        ) and not is_title
-
-        if is_name_or_org_list_to_be_removed:
-            is_author_or_org_list = True
-            # print curr_line_text to check
-            # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
-        else:
-            is_author_or_org_list = False
-        """
-        # print reason why the line is a title
-        if is_title:
-            print_green("This line is a title.")
-            print_green("↓" * 10)
-            print()
-            print("curr_line_text: ", curr_line_text)
-            print()
-
-        # print reason why the line is not a title
-        line_text = curr_line_text.strip()
-        test_text = "Career/Personal Life"
-        text_content_condition = line_text == test_text
-        
-        if not is_title and text_content_condition: # Print specific line
-        # if not is_title: # Print each line
-            print_red("This line is not a title.")
-            print_red("↓" * 10)
-
-            print()
-            print("curr_line_text: ", curr_line_text)
-            print()
-
-            if is_not_end_with_ending_puncs:
-                print_green(f"is_not_end_with_ending_puncs")
-            else:
-                print_red(f"is_end_with_ending_puncs")
-
-            if is_not_only_no_meaning_symbols:
-                print_green(f"is_not_only_no_meaning_symbols")
-            else:
-                print_red(f"is_only_no_meaning_symbols")
-
-            if is_title_by_len:
-                print_green(f"is_title_by_len: {is_title_by_len}")
-            else:
-                print_red(f"is_not_title_by_len: {is_title_by_len}")
-
-            if is_equation:
-                print_red(f"is_equation")
-            else:
-                print_green(f"is_not_equation")
-
-            if is_potential_title_font:
-                print_green(f"is_potential_title_font")
-            else:
-                print_red(f"is_not_potential_title_font")
-
-            if is_punctuation_heavy:
-                print_red("is_punctuation_heavy")
-            else:
-                print_green("is_not_punctuation_heavy")
-
-            if is_bold_font:
-                print_green(f"is_bold_font")
-            else:
-                print_red(f"is_not_bold_font")
-
-            if is_font_size_not_less_than_doc_avg:
-                print_green(f"is_larger_font_than_doc_avg")
-            else:
-                print_red(f"is_not_larger_font_than_doc_avg")
-
-            if is_much_larger_font_than_doc_avg:
-                print_green(f"is_much_larger_font_than_doc_avg")
-            else:
-                print_red(f"is_not_much_larger_font_than_doc_avg")
-
-            if is_not_same_font_type_of_docAvg:
-                print_green(f"is_not_same_font_type_of_docAvg")
-            else:
-                print_red(f"is_same_font_type_of_docAvg")
-
-            if is_word_list_line_by_rules:
-                print_red("is_word_list_line_by_rules")
-            else:
-                print_green("is_not_name_list_by_rules")
-
-            if is_person_or_org_list_line_by_nlp:
-                print_red("is_person_or_org_list_line_by_nlp")
-            else:
-                print_green("is_not_person_or_org_list_line_by_nlp")
-
-            if not is_numbered_title:
-                print_red("is_not_numbered_title")
-            else:
-                print_green("is_numbered_title")
-
-            if is_a_left_inline_title:
-                print_red("is_a_left_inline_title")
-            else:
-                print_green("is_not_a_left_inline_title")
-
-            if not is_title_by_check_prev_line:
-                print_red("is_not_title_by_check_prev_line")
-            else:
-                print_green("is_title_by_check_prev_line")
-
-            if not is_title_by_check_next_line:
-                print_red("is_not_title_by_check_next_line")
-            else:
-                print_green("is_title_by_check_next_line")
-
-            if not is_title_by_check_pre_and_next_line:
-                print_red("is_not_title_by_check_pre_and_next_line")
-            else:
-                print_green("is_title_by_check_pre_and_next_line")
-
-        # print_green("Common features:")
-        # print_green("↓" * 10)
-
-        # print(f"    curr_line_font_type: {curr_line_font_type}")
-        # print(f"    curr_line_font_size: {curr_line_font_size}")
-        # print()
-
-        """
-
-        return is_title, is_author_or_org_list
-
-    def _detect_block_title(self, input_block):
-        """
-        Use the method '_is_potential_title' to detect the title lines of a paragraph block.
-        The keys 'is_title' and 'is_author_or_org_list' of each line are set to True when detected and to False otherwise.
-        """
-
-        raw_lines = input_block["lines"]
-
-        prev_line_is_title_flag = False
-
-        for i, curr_line in enumerate(raw_lines):
-            prev_line = raw_lines[i - 1] if i > 0 else None
-            next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
-
-            blk_avg_char_width = input_block["avg_char_width"]
-            blk_avg_char_height = input_block["avg_char_height"]
-            blk_media_font_size = input_block["median_font_size"]
-
-            is_title, is_author_or_org_list = self._is_potential_title(
-                curr_line,
-                prev_line,
-                prev_line_is_title_flag,
-                next_line,
-                blk_avg_char_width,
-                blk_avg_char_height,
-                blk_media_font_size,
-            )
-
-            if is_title:
-                curr_line["is_title"] = is_title
-                prev_line_is_title_flag = True
-            else:
-                curr_line["is_title"] = False
-                prev_line_is_title_flag = False
-
-            if is_author_or_org_list:
-                curr_line["is_author_or_org_list"] = is_author_or_org_list
-            else:
-                curr_line["is_author_or_org_list"] = False
-
-        return input_block
-
-    def batch_process_blocks_detect_titles(self, pdf_dic):
-        """
-        This function batch process the blocks to detect titles.
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            result dictionary
-
-        Returns
-        -------
-        pdf_dict : dict
-            result dictionary
-        """
-        num_titles = 0
-
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in blocks.keys():
-                    para_blocks = blocks["para_blocks"]
-
-                    all_single_line_blocks = []
-                    for block in para_blocks:
-                        if len(block["lines"]) == 1:
-                            all_single_line_blocks.append(block)
-
-                    new_para_blocks = []
-                    if not len(all_single_line_blocks) == len(para_blocks):  # Not all blocks are single line blocks.
-                        for para_block in para_blocks:
-                            new_block = self._detect_block_title(para_block)
-                            new_para_blocks.append(new_block)
-                            num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
-                    else:  # All blocks are single line blocks.
-                        for para_block in para_blocks:
-                            new_para_blocks.append(para_block)
-                            num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
-                    para_blocks = new_para_blocks
-
-                blocks["para_blocks"] = para_blocks
-
-                for para_block in para_blocks:
-                    all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
-                    para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
-                    if (
-                        all_titles and para_text_len < 200
-                    ):  # total length of the paragraph is less than 200, more than this should not be a title
-                        para_block["is_block_title"] = 1
-                    else:
-                        para_block["is_block_title"] = 0
-
-                    all_name_or_org_list_to_be_removed = all(
-                        safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
-                    )
-                    if all_name_or_org_list_to_be_removed and page_id == "page_0":
-                        para_block["is_block_an_author_or_org_list"] = 1
-                    else:
-                        para_block["is_block_an_author_or_org_list"] = 0
-
-        pdf_dic["statistics"]["num_titles"] = num_titles
-
-        return pdf_dic
-
-    def __determine_size_based_level(self, title_blocks):
-        """
-        This function determines the title level based on the font size of the title.
-
-        Parameters
-        ----------
-        title_blocks : list
-
-        Returns
-        -------
-        title_blocks : list
-        """
-
-        font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
-
-        # Use the mean and std of font sizes to remove extreme values
-        mean_font_size = np.mean(font_sizes)
-        std_font_size = np.std(font_sizes)
-        min_extreme_font_size = mean_font_size - std_font_size  # type: ignore
-        max_extreme_font_size = mean_font_size + std_font_size  # type: ignore
-
-        # Compute the threshold for title level
-        middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
-        if middle_font_sizes.size > 0:
-            middle_mean_font_size = np.mean(middle_font_sizes)
-            level_threshold = middle_mean_font_size
-        else:
-            level_threshold = mean_font_size
-
-        for tb in title_blocks:
-            title_block = tb["block"]
-            title_font_size = safe_get(title_block, "block_font_size", 0)
-
-            current_level = 1  # Initialize title level, the biggest level is 1
-
-            # print(f"Before adjustment by font size, {current_level}")
-            if title_font_size >= max_extreme_font_size:
-                current_level = 1
-            elif title_font_size <= min_extreme_font_size:
-                current_level = 3
-            elif float(title_font_size) >= float(level_threshold):
-                current_level = 2
-            else:
-                current_level = 3
-            # print(f"After adjustment by font size, {current_level}")
-
-            title_block["block_title_level"] = current_level
-
-        return title_blocks
-
-    def batch_process_blocks_recog_title_level(self, pdf_dic):
-        title_blocks = []
-
-        # Collect all titles
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = blocks.get("para_blocks", [])
-                for block in para_blocks:
-                    if block.get("is_block_title"):
-                        title_obj = {"page_id": page_id, "block": block}
-                        title_blocks.append(title_obj)
-
-        # Determine title level
-        if title_blocks:
-            # Determine title level based on font size
-            title_blocks = self.__determine_size_based_level(title_blocks)
-
-        return pdf_dic
--- a/magic_pdf/pdf_parse_union_core.py
+++ b/magic_pdf/pdf_parse_union_core.py
-import time
-
-from loguru import logger
-
-from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.config.ocr_content_type import ContentType
-from magic_pdf.layout.layout_sort import (LAYOUT_UNPROC, get_bboxes_layout,
-                                          get_columns_cnt_of_layout)
-from magic_pdf.libs.commons import fitz, get_delta_time
-from magic_pdf.libs.convert_utils import dict_to_list
-from magic_pdf.libs.hash_utils import compute_md5
-from magic_pdf.libs.local_math import float_equal
-from magic_pdf.model.magic_model import MagicModel
-from magic_pdf.para.para_split_v2 import para_split
-from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
-from magic_pdf.pre_proc.construct_page_dict import \
-    ocr_construct_page_component_v2
-from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
-from magic_pdf.pre_proc.equations_replace import (
-    combine_chars_to_pymudict, remove_chars_in_text_blocks,
-    replace_equations_in_textblock)
-from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
-    ocr_prepare_bboxes_for_layout_split
-from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
-                                               fix_block_spans,
-                                               fix_discarded_block,
-                                               sort_blocks_by_layout)
-from magic_pdf.pre_proc.ocr_span_list_modify import (
-    get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
-    remove_overlaps_min_spans)
-from magic_pdf.pre_proc.resolve_bbox_conflict import \
-    check_useful_block_horizontal_overlap
-
-
-def remove_horizontal_overlap_block_which_smaller(all_bboxes):
-    useful_blocks = []
-    for bbox in all_bboxes:
-        useful_blocks.append({'bbox': bbox[:4]})
-    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
-        check_useful_block_horizontal_overlap(useful_blocks)
-    )
-    if is_useful_block_horz_overlap:
-        logger.warning(
-            f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
-        )
-        for bbox in all_bboxes.copy():
-            if smaller_bbox == bbox[:4]:
-                all_bboxes.remove(bbox)
-
-    return is_useful_block_horz_overlap, all_bboxes
-
-
-def __replace_STX_ETX(text_str: str):
-    """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
-    Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
-
-        Args:
-            text_str (str): raw text
-
-        Returns:
-            _type_: replaced text
-    """
-    if text_str:
-        s = text_str.replace('\u0002', "'")
-        s = s.replace('\u0003', "'")
-        return s
-    return text_str
-
-
-def txt_spans_extract(pdf_page, inline_equations, interline_equations):
-    text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
-    char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
-        'blocks'
-    ]
-    text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
-    text_blocks = replace_equations_in_textblock(
-        text_blocks, inline_equations, interline_equations
-    )
-    text_blocks = remove_citation_marker(text_blocks)
-    text_blocks = remove_chars_in_text_blocks(text_blocks)
-    spans = []
-    for v in text_blocks:
-        for line in v['lines']:
-            for span in line['spans']:
-                bbox = span['bbox']
-                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
-                    continue
-                if span.get('type') not in (
-                    ContentType.InlineEquation,
-                    ContentType.InterlineEquation,
-                ):
-                    spans.append(
-                        {
-                            'bbox': list(span['bbox']),
-                            'content': __replace_STX_ETX(span['text']),
-                            'type': ContentType.Text,
-                            'score': 1.0,
-                        }
-                    )
-    return spans
-
-
-def replace_text_span(pymu_spans, ocr_spans):
-    return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
-
-
-def parse_page_core(
-    pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
-):
-    need_drop = False
-    drop_reason = []
-
-    """从magic_model对象中获取后面会用到的区块信息"""
-    img_blocks = magic_model.get_imgs(page_id)
-    table_blocks = magic_model.get_tables(page_id)
-    discarded_blocks = magic_model.get_discarded(page_id)
-    text_blocks = magic_model.get_text_blocks(page_id)
-    title_blocks = magic_model.get_title_blocks(page_id)
-    inline_equations, interline_equations, interline_equation_blocks = (
-        magic_model.get_equations(page_id)
-    )
-
-    page_w, page_h = magic_model.get_page_size(page_id)
-
-    spans = magic_model.get_all_spans(page_id)
-
-    """根据parse_mode，构造spans"""
-    if parse_mode == 'txt':
-        """ocr 中文本类的 span 用 pymu spans 替换！"""
-        pymu_spans = txt_spans_extract(
-            pdf_docs[page_id], inline_equations, interline_equations
-        )
-        spans = replace_text_span(pymu_spans, spans)
-    elif parse_mode == 'ocr':
-        pass
-    else:
-        raise Exception('parse_mode must be txt or ocr')
-
-    """删除重叠spans中置信度较低的那些"""
-    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    """删除重叠spans中较小的那些"""
-    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
-    """对image和table截图"""
-    spans = ocr_cut_image_and_table(
-        spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter
-    )
-
-    """将所有区块的bbox整理到一起"""
-    # interline_equation_blocks参数不够准，后面切换到interline_equations上
-    interline_equation_blocks = []
-    if len(interline_equation_blocks) > 0:
-        all_bboxes, all_discarded_blocks, drop_reasons = (
-            ocr_prepare_bboxes_for_layout_split(
-                img_blocks,
-                table_blocks,
-                discarded_blocks,
-                text_blocks,
-                title_blocks,
-                interline_equation_blocks,
-                page_w,
-                page_h,
-            )
-        )
-    else:
-        all_bboxes, all_discarded_blocks, drop_reasons = (
-            ocr_prepare_bboxes_for_layout_split(
-                img_blocks,
-                table_blocks,
-                discarded_blocks,
-                text_blocks,
-                title_blocks,
-                interline_equations,
-                page_w,
-                page_h,
-            )
-        )
-
-    if len(drop_reasons) > 0:
-        need_drop = True
-        drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
-
-    """先处理不需要排版的discarded_blocks"""
-    discarded_block_with_spans, spans = fill_spans_in_blocks(
-        all_discarded_blocks, spans, 0.4
-    )
-    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
-
-    """如果当前页面没有bbox则跳过"""
-    if len(all_bboxes) == 0:
-        logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
-        return ocr_construct_page_component_v2(
-            [],
-            [],
-            page_id,
-            page_w,
-            page_h,
-            [],
-            [],
-            [],
-            interline_equations,
-            fix_discarded_blocks,
-            need_drop,
-            drop_reason,
-        )
-
-    """在切分之前，先检查一下bbox是否有左右重叠的情况，如果有，那么就认为这个pdf暂时没有能力处理好，这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
-
-    while True:  # 循环检查左右重叠的情况，如果存在就删除掉较小的那个bbox，直到不存在左右重叠的情况
-        is_useful_block_horz_overlap, all_bboxes = (
-            remove_horizontal_overlap_block_which_smaller(all_bboxes)
-        )
-        if is_useful_block_horz_overlap:
-            need_drop = True
-            drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP)
-        else:
-            break
-
-    """根据区块信息计算layout"""
-    page_boundry = [0, 0, page_w, page_h]
-    layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
-
-    if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
-        logger.warning(
-            f'skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}'
-        )
-        need_drop = True
-        drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT)
-
-    """以下去掉复杂的布局和超过2列的布局"""
-    if any(
-        [lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes]
-    ):  # 复杂的布局
-        logger.warning(
-            f'skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}'
-        )
-        need_drop = True
-        drop_reason.append(DropReason.COMPLICATED_LAYOUT)
-
-    layout_column_width = get_columns_cnt_of_layout(layout_tree)
-    if layout_column_width > 2:  # 去掉超过2列的布局pdf
-        logger.warning(
-            f'skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}'
-        )
-        need_drop = True
-        drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS)
-
-    """根据layout顺序，对当前页面所有需要留下的block进行排序"""
-    sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
-
-    """将span填入排好序的blocks中"""
-    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)
-
-    """对block进行fix操作"""
-    fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
-
-    """获取QA需要外置的list"""
-    images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
-
-    """构造pdf_info_dict"""
-    page_info = ocr_construct_page_component_v2(
-        fix_blocks,
-        layout_bboxes,
-        page_id,
-        page_w,
-        page_h,
-        layout_tree,
-        images,
-        tables,
-        interline_equations,
-        fix_discarded_blocks,
-        need_drop,
-        drop_reason,
-    )
-    return page_info
-
-
-def pdf_parse_union(
-    pdf_bytes,
-    model_list,
-    imageWriter,
-    parse_mode,
-    start_page_id=0,
-    end_page_id=None,
-    debug_mode=False,
-):
-    pdf_bytes_md5 = compute_md5(pdf_bytes)
-    pdf_docs = fitz.open('pdf', pdf_bytes)
-
-    """初始化空的pdf_info_dict"""
-    pdf_info_dict = {}
-
-    """用model_list和docs对象初始化magic_model"""
-    magic_model = MagicModel(model_list, pdf_docs)
-
-    """根据输入的起始范围解析pdf"""
-    # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
-    end_page_id = (
-        end_page_id
-        if end_page_id is not None and end_page_id >= 0
-        else len(pdf_docs) - 1
-    )
-
-    if end_page_id > len(pdf_docs) - 1:
-        logger.warning('end_page_id is out of range, use pdf_docs length')
-        end_page_id = len(pdf_docs) - 1
-
-    """初始化启动时间"""
-    start_time = time.time()
-
-    for page_id, page in enumerate(pdf_docs):
-        """debug时输出每页解析的耗时."""
-        if debug_mode:
-            time_now = time.time()
-            logger.info(
-                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
-            )
-            start_time = time_now
-
-        """解析pdf中的每一页"""
-        if start_page_id <= page_id <= end_page_id:
-            page_info = parse_page_core(
-                pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
-            )
-        else:
-            page_w = page.rect.width
-            page_h = page.rect.height
-            page_info = ocr_construct_page_component_v2(
-                [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
-            )
-        pdf_info_dict[f'page_{page_id}'] = page_info
-
-    """分段"""
-    para_split(pdf_info_dict, debug_mode=debug_mode)
-
-    """dict转list"""
-    pdf_info_list = dict_to_list(pdf_info_dict)
-    new_pdf_info_dict = {
-        'pdf_info': pdf_info_list,
-    }
-
-    return new_pdf_info_dict
-
-
-if __name__ == '__main__':
-    pass
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -5,19 +5,18 @@ import time
 from typing import List

 import torch
+import fitz
 from loguru import logger

-from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
-from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-from magic_pdf.libs.local_math import float_equal
+
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel

@@ -34,13 +33,11 @@ except ImportError:
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton

 from magic_pdf.para.para_split_v3 import para_split
-from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
+
 from magic_pdf.pre_proc.construct_page_dict import \
    ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
-from magic_pdf.pre_proc.equations_replace import (
-    combine_chars_to_pymudict, remove_chars_in_text_blocks,
-    replace_equations_in_textblock)
+
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
    ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
@@ -49,26 +46,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
 from magic_pdf.pre_proc.ocr_span_list_modify import (
    get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
    remove_overlaps_min_spans)
-from magic_pdf.pre_proc.resolve_bbox_conflict import \
-    check_useful_block_horizontal_overlap
-
-
-def remove_horizontal_overlap_block_which_smaller(all_bboxes):
-    useful_blocks = []
-    for bbox in all_bboxes:
-        useful_blocks.append({'bbox': bbox[:4]})
-    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
-        check_useful_block_horizontal_overlap(useful_blocks)
-    )
-    if is_useful_block_horz_overlap:
-        logger.warning(
-            f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
-        )  # noqa: E501
-        for bbox in all_bboxes.copy():
-            if smaller_bbox == bbox[:4]:
-                all_bboxes.remove(bbox)
-
-    return is_useful_block_horz_overlap, all_bboxes


 def __replace_STX_ETX(text_str: str):
@@ -89,28 +66,25 @@ def __replace_STX_ETX(text_str: str):


 def chars_to_content(span):
-        # # 先给chars按char['bbox']的x坐标排序
-        # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
-
+    # 检查span中的char是否为空
+    if len(span['chars']) == 0:
+        span['content'] = ''
+    else:
        # 先给chars按char['bbox']的中心点的x坐标排序
        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
-        content = ''

        # 求char的平均宽度
-        if len(span['chars']) == 0:
-            span['content'] = content
-            del span['chars']
-            return
-        else:
        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
        char_avg_width = char_width_sum / len(span['chars'])

+        content = ''
        for char in span['chars']:
            # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
            if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
                content += ' '
            content += char['c']
        span['content'] = __replace_STX_ETX(content)
+
    del span['chars']


@@ -128,8 +102,13 @@ def fill_char_in_spans(spans, all_chars):
                span['chars'].append(char)
                break

+    empty_spans = []
+
    for span in spans:
        chars_to_content(span)
+        if len(span['content']) == 0:
+            empty_spans.append(span)
+    return empty_spans


 # 使用鲁棒性更强的中心点坐标判断
@@ -162,48 +141,79 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):

 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):

+    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+
+    # @todo: 拿到char之后把倾斜角度较大的先删一遍
+    all_pymu_chars = []
+    for block in text_blocks_raw:
+        for line in block['lines']:
+            for span in line['spans']:
+                all_pymu_chars.extend(span['chars'])
+
+    # 计算所有span的高度的中位数
+    span_height_list = []
+    for span in spans:
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            continue
+        span_height = span['bbox'][3] - span['bbox'][1]
+        span['height'] = span_height
+        span_height_list.append(span_height)
+    if len(span_height_list) == 0:
+        return spans
+    else:
+        median_span_height = statistics.median(span_height_list)
+
    useful_spans = []
    unuseful_spans = []
+    # 纵向span的两个特征：1. 高度超过多个line 2. 高宽比超过某个值
+    vertical_spans = []
    for span in spans:
-        for block in all_bboxes:
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            continue
+        for block in all_bboxes + all_discarded_blocks:
            if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
                continue
-            else:
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
+                if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
+                    vertical_spans.append(span)
+                elif block in all_bboxes:
                    useful_spans.append(span)
-                    break
-        for block in all_discarded_blocks:
-            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
+                else:
                    unuseful_spans.append(span)
-                break

-    text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+                del span['height']

-    # @todo: 拿到char之后把倾斜角度较大的先删一遍
-    all_pymu_chars = []
+                break
+
+    """垂直的span框直接用pymu的line进行填充"""
+    if len(vertical_spans) > 0:
+        text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+        all_pymu_lines = []
        for block in text_blocks:
            for line in block['lines']:
-            for span in line['spans']:
-                all_pymu_chars.extend(span['chars'])
+                all_pymu_lines.append(line)

-    new_spans = []
+        for pymu_line in all_pymu_lines:
+            for span in vertical_spans:
+                if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
+                    for pymu_span in pymu_line['spans']:
+                        span['content'] += pymu_span['text']
+                    break

-    for span in useful_spans:
-        if span['type'] in [ContentType.Text]:
-            span['chars'] = []
-            new_spans.append(span)
+        for span in vertical_spans:
+            if len(span['content']) == 0:
+                spans.remove(span)
+
+    """水平的span框如果没有char则用ocr进行填充"""
+    new_spans = []

-    for span in unuseful_spans:
+    for span in useful_spans + unuseful_spans:
        if span['type'] in [ContentType.Text]:
            span['chars'] = []
            new_spans.append(span)

-    fill_char_in_spans(new_spans, all_pymu_chars)
+    empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)

-    empty_spans = []
-    for span in new_spans:
-        if len(span['content']) == 0:
-            empty_spans.append(span)
    if len(empty_spans) > 0:

        # 初始化ocr模型
@@ -216,52 +226,18 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
        )

        for span in empty_spans:
-            spans.remove(span)
-            # 对span的bbox截图
+            # 对span的bbox截图再ocr
            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
            ocr_res = ocr_model.ocr(span_img, det=False)
-            # logger.info(f"ocr_res: {ocr_res}")
-            # logger.info(f"empty_span: {span}")
            if ocr_res and len(ocr_res) > 0:
                if len(ocr_res[0]) > 0:
                    ocr_text, ocr_score = ocr_res[0][0]
                    if ocr_score > 0.5 and len(ocr_text) > 0:
                        span['content'] = ocr_text
-                            spans.append(span)
-
-    return spans
-
+                        span['score'] = ocr_score
+                    else:
+                        spans.remove(span)

-def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
-    text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
-    char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
-        'blocks'
-    ]
-    text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
-    text_blocks = replace_equations_in_textblock(
-        text_blocks, inline_equations, interline_equations
-    )
-    text_blocks = remove_citation_marker(text_blocks)
-    text_blocks = remove_chars_in_text_blocks(text_blocks)
-    spans = []
-    for v in text_blocks:
-        for line in v['lines']:
-            for span in line['spans']:
-                bbox = span['bbox']
-                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
-                    continue
-                if span.get('type') not in (
-                    ContentType.InlineEquation,
-                    ContentType.InterlineEquation,
-                ):
-                    spans.append(
-                        {
-                            'bbox': list(span['bbox']),
-                            'content': __replace_STX_ETX(span['text']),
-                            'type': ContentType.Text,
-                            'score': 1.0,
-                        }
-                    )
    return spans


@@ -682,6 +658,23 @@ def parse_page_core(
    """顺便删除大水印并保留abandon的span"""
    spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)

+    """删除重叠spans中置信度较低的那些"""
+    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
+    """删除重叠spans中较小的那些"""
+    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
+
+    """根据parse_mode，构造spans，主要是文本类的字符填充"""
+    if parse_mode == SupportedPdfParseMethod.TXT:
+
+        """使用新版本的混合ocr方案"""
+        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
+
+    elif parse_mode == SupportedPdfParseMethod.OCR:
+        pass
+    else:
+        raise Exception('parse_mode must be txt or ocr')
+
+
    """先处理不需要排版的discarded_blocks"""
    discarded_block_with_spans, spans = fill_spans_in_blocks(
        all_discarded_blocks, spans, 0.4
@@ -706,26 +699,6 @@ def parse_page_core(
            drop_reason,
        )

-    """删除重叠spans中置信度较低的那些"""
-    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    """删除重叠spans中较小的那些"""
-    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
-
-    """根据parse_mode，构造spans，主要是文本类的字符填充"""
-    if parse_mode == SupportedPdfParseMethod.TXT:
-
-        """之前的公式替换方案"""
-        # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
-        # spans = replace_text_span(pymu_spans, spans)
-
-        """ocr 中文本类的 span 用 pymu spans 替换！"""
-        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
-
-    elif parse_mode == SupportedPdfParseMethod.OCR:
-        pass
-    else:
-        raise Exception('parse_mode must be txt or ocr')
-
    """对image和table截图"""
    spans = ocr_cut_image_and_table(
        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
@@ -811,7 +784,7 @@ def pdf_parse_union(
        if debug_mode:
            time_now = time.time()
            logger.info(
-                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
+                f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
            )
            start_time = time_now


--- a/magic_pdf/post_proc/__init__.py
+++ b/magic_pdf/post_proc/__init__.py
--- a/magic_pdf/post_proc/detect_para.py
+++ b/magic_pdf/post_proc/detect_para.py
-import os
-import sys
-import json
-import re
-import math
-import unicodedata
-from collections import Counter
-
-
-import numpy as np
-from termcolor import cprint
-
-
-from magic_pdf.libs.commons import fitz
-from magic_pdf.libs.nlp_utils import NLPModels
-
-
-# Force UTF-8 on stdout so the non-ASCII messages printed by this module do
-# not fail to encode on consoles with a narrower default encoding.
-# ``reconfigure`` is not available on every stream object; when stdout has
-# been replaced by an object without it, the call is skipped instead of
-# raising AttributeError at import time.
-if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-
-
-def open_pdf(pdf_path):
-    """
-    Open a pdf document through ``fitz.open``.
-
-    Parameters
-    ----------
-    pdf_path : object
-        value handed unchanged to ``fitz.open`` (presumably a file path)
-
-    Returns
-    -------
-    object
-        whatever ``fitz.open`` returns for ``pdf_path``
-
-    Raises
-    ------
-    Exception
-        any exception raised by ``fitz.open``; it is printed first and then
-        re-raised
-    """
-    try:
-        return fitz.open(pdf_path)  # type: ignore
-    except Exception as err:
-        print(f"无法打开PDF文件：{pdf_path}。原因是：{err}")
-        raise err
-
-
-def print_green_on_red(text):
-    """Print ``text`` in bold green on a red background, ending with two newlines."""
-    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
-
-
-def print_green(text):
-    """Print an empty line, then ``text`` in bold green, ending with two newlines."""
-    print()
-    cprint(text, color="green", attrs=["bold"], end="\n\n")
-
-
-def print_red(text):
-    """Print an empty line, then ``text`` in bold red, ending with two newlines."""
-    print()
-    cprint(text, color="red", attrs=["bold"], end="\n\n")
-
-
-def print_yellow(text):
-    """Print an empty line, then ``text`` in bold yellow, ending with two newlines."""
-    print()
-    cprint(text, color="yellow", attrs=["bold"], end="\n\n")
-
-
-def safe_get(dict_obj, key, default):
-    """
-    Look up ``key`` in ``dict_obj`` and fall back to ``default``.
-
-    Unlike ``dict_obj.get(key, default)``, the fallback is also used when the
-    key is present but its stored value is ``None``.
-
-    Parameters
-    ----------
-    dict_obj : dict
-        mapping to read from
-    key : object
-        key to look up
-    default : object
-        value returned when the key is missing or maps to ``None``
-
-    Returns
-    -------
-    object
-        the stored value, or ``default``
-    """
-    found = dict_obj.get(key)
-    return default if found is None else found
-
-
-def is_bbox_overlap(bbox1, bbox2):
-    """
-    Tell whether two bboxes overlap; bboxes that merely touch count as overlapping.
-
-    Parameters
-    ----------
-    bbox1 : list
-        first bbox as (x0, y0, x1, y1)
-    bbox2 : list
-        second bbox as (x0, y0, x1, y1)
-
-    Returns
-    -------
-    bool
-        False when one bbox lies strictly beside, above or below the other,
-        True otherwise
-    """
-    left_a, top_a, right_a, bottom_a = bbox1
-    left_b, top_b, right_b, bottom_b = bbox2
-
-    apart_horizontally = left_a > right_b or left_b > right_a
-    apart_vertically = top_a > bottom_b or top_b > bottom_a
-    return not (apart_horizontally or apart_vertically)
-
-
-def is_in_bbox(bbox1, bbox2):
-    """
-    Tell whether ``bbox1`` lies inside ``bbox2``; shared edges count as inside.
-
-    Parameters
-    ----------
-    bbox1 : list
-        candidate inner bbox as (x0, y0, x1, y1)
-    bbox2 : list
-        candidate outer bbox as (x0, y0, x1, y1)
-
-    Returns
-    -------
-    bool
-        True if no edge of bbox1 extends beyond bbox2, else False
-    """
-    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
-    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2
-
-    starts_inside = inner_x0 >= outer_x0 and inner_y0 >= outer_y0
-    ends_inside = inner_x1 <= outer_x1 and inner_y1 <= outer_y1
-    return starts_inside and ends_inside
-
-
-def calculate_para_bbox(lines):
-    """
-    Compute the smallest bbox that encloses the bboxes of all given lines.
-
-    Parameters
-    ----------
-    lines : list
-        lines of the paragraph; each one provides a "bbox" of (x0, y0, x1, y1)
-
-    Returns
-    -------
-    para_bbox : list
-        [x0, y0, x1, y1] enclosing every line bbox
-
-    Raises
-    ------
-    ValueError
-        if ``lines`` is empty, because ``min`` is applied to an empty sequence
-    """
-    line_bboxes = [line["bbox"] for line in lines]
-    left = min(bbox[0] for bbox in line_bboxes)
-    top = min(bbox[1] for bbox in line_bboxes)
-    right = max(bbox[2] for bbox in line_bboxes)
-    bottom = max(bbox[3] for bbox in line_bboxes)
-    return [left, top, right, bottom]
-
-
-def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    Tell whether the right edge of a line lines up with that of its neighbors.
-
-    Two right edges line up when they differ by less than half of
-    ``avg_char_width``. A missing neighbor is treated as the bbox (0, 0, 0, 0).
-
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line, or a falsy value when there is none
-    next_line_bbox : list
-        bbox of the next line, or a falsy value when there is none
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 compares with the previous line, 1 with the next line, 2 with both
-
-    Returns
-    -------
-    bool
-        True if the line is right aligned with the selected neighbors,
-        False otherwise (including for any other ``direction`` value).
-    """
-    tolerance = 0.5 * avg_char_width
-    no_neighbor = (0, 0, 0, 0)
-
-    _, _, right_edge, _ = curr_line_bbox
-    _, _, prev_right, _ = prev_line_bbox or no_neighbor
-    _, _, next_right, _ = next_line_bbox or no_neighbor
-
-    def _lines_up_with(other_edge):
-        return abs(right_edge - other_edge) < tolerance
-
-    if direction == 0:
-        return _lines_up_with(prev_right)
-    if direction == 1:
-        return _lines_up_with(next_right)
-    if direction == 2:
-        return _lines_up_with(prev_right) and _lines_up_with(next_right)
-    return False
-
-
-def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    Tell whether the left edge of a line lines up with that of its neighbors.
-
-    Two left edges line up when they differ by less than half of
-    ``avg_char_width``. A missing neighbor is treated as the bbox (0, 0, 0, 0).
-
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line, or a falsy value when there is none
-    next_line_bbox : list
-        bbox of the next line, or a falsy value when there is none
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 compares with the previous line, 1 with the next line, 2 with both
-
-    Returns
-    -------
-    bool
-        True if the line is left aligned with the selected neighbors,
-        False otherwise (including for any other ``direction`` value).
-    """
-    tolerance = 0.5 * avg_char_width
-    no_neighbor = (0, 0, 0, 0)
-
-    left_edge, _, _, _ = curr_line_bbox
-    prev_left, _, _, _ = prev_line_bbox or no_neighbor
-    next_left, _, _, _ = next_line_bbox or no_neighbor
-
-    def _lines_up_with(other_edge):
-        return abs(left_edge - other_edge) < tolerance
-
-    if direction == 0:
-        return _lines_up_with(prev_left)
-    if direction == 1:
-        return _lines_up_with(next_left)
-    if direction == 2:
-        return _lines_up_with(prev_left) and _lines_up_with(next_left)
-    return False
-
-
-def end_with_punctuation(line_text):
-    """
-    Tell whether the last non-whitespace character of a line ends a sentence.
-
-    Recognized sentence-ending marks are ".", "?", "!" and their Chinese
-    counterparts "。", "？", "！". Empty or whitespace-only text yields False.
-    """
-    sentence_enders = (".", "?", "!", "。", "？", "！")
-
-    # Walk backwards to skip trailing whitespace.
-    trailing_char = next((ch for ch in reversed(line_text) if not ch.isspace()), None)
-
-    return trailing_char is not None and trailing_char in sentence_enders
-
-
-def is_nested_list(lst):
-    """Return True when ``lst`` is a list with at least one list among its items."""
-    return isinstance(lst, list) and any(isinstance(item, list) for item in lst)
-
-
-class DenseSingleLineBlockException(Exception):
-    """
-    Exception type for dense single line-block.
-
-    The message is kept on ``self.message`` and is also what ``str()`` and
-    ``repr()`` of the exception return.
-    """
-
-    def __init__(self, message="DenseSingleLineBlockException"):
-        super().__init__(message)
-        self.message = message
-
-    def __str__(self):
-        return f"{self.message}"
-
-    # repr() deliberately shows the bare message as well.
-    __repr__ = __str__
-
-
-class TitleDetectionException(Exception):
-    """
-    Exception type for title detection.
-
-    The message is kept on ``self.message`` and is also what ``str()`` and
-    ``repr()`` of the exception return.
-    """
-
-    def __init__(self, message="TitleDetectionException"):
-        super().__init__(message)
-        self.message = message
-
-    def __str__(self):
-        return f"{self.message}"
-
-    # repr() deliberately shows the bare message as well.
-    __repr__ = __str__
-
-
-class TitleLevelException(Exception):
-    """
-    Exception type for title level.
-
-    The message is kept on ``self.message`` and is also what ``str()`` and
-    ``repr()`` of the exception return.
-    """
-
-    def __init__(self, message="TitleLevelException"):
-        super().__init__(message)
-        self.message = message
-
-    def __str__(self):
-        return f"{self.message}"
-
-    # repr() deliberately shows the bare message as well.
-    __repr__ = __str__
-
-
-class ParaSplitException(Exception):
-    """
-    Exception type for paragraph splitting.
-
-    The message is kept on ``self.message`` and is also what ``str()`` and
-    ``repr()`` of the exception return.
-    """
-
-    def __init__(self, message="ParaSplitException"):
-        super().__init__(message)
-        self.message = message
-
-    def __str__(self):
-        return f"{self.message}"
-
-    # repr() deliberately shows the bare message as well.
-    __repr__ = __str__
-
-
-class ParaMergeException(Exception):
-    """
-    Exception type for paragraph merging.
-
-    The message is kept on ``self.message`` and is also what ``str()`` and
-    ``repr()`` of the exception return.
-    """
-
-    def __init__(self, message="ParaMergeException"):
-        super().__init__(message)
-        self.message = message
-
-    def __str__(self):
-        return f"{self.message}"
-
-    # repr() deliberately shows the bare message as well.
-    __repr__ = __str__
-
-
-class DiscardByException:
-    """
-    Decide, per exception type, whether a pdf file is to be discarded.
-
-    Every ``discard_by_*`` method returns an error message when the pdf is to
-    be discarded and ``None`` when it is to be kept.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
-        """
-        Discard a pdf in which too many pages consist of single-line blocks.
-
-        A page counts as exceptional when more than 90% of its
-        "preproc_blocks" contain exactly one line. The pdf is discarded when
-        more than 10% of its pages are exceptional.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary; only entries whose key starts with "page_" are
-            inspected
-        exception : DenseSingleLineBlockException
-            exception whose ``message`` is returned on discard
-
-        Returns
-        -------
-        error_message : str or None
-            ``exception.message`` if the pdf is discarded, otherwise None
-        """
-        total_pages = 0
-        exceptional_pages = 0
-
-        for page_id, page in pdf_dic.items():
-            if not page_id.startswith("page_"):
-                continue
-            total_pages += 1
-
-            if "preproc_blocks" not in page:
-                continue
-            preproc_blocks = page["preproc_blocks"]
-
-            single_line_count = sum(1 for block in preproc_blocks if len(block["lines"]) == 1)
-            if len(preproc_blocks) > 0 and single_line_count / len(preproc_blocks) > 0.9:
-                exceptional_pages += 1
-
-        if total_pages == 0:
-            return None
-
-        # The page ratio threshold is low on purpose: as soon as a small share
-        # of the pages is exceptional, the whole pdf is discarded.
-        return exception.message if exceptional_pages / total_pages > 0.1 else None
-
-    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
-        """
-        Decide on discarding for a title detection exception.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (not inspected)
-        exception : TitleDetectionException
-            the raised exception (not inspected)
-
-        Returns
-        -------
-        error_message : None
-            always None, i.e. the pdf is never discarded for this exception
-        """
-        return None
-
-    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
-        """
-        Decide on discarding for a title level exception.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (not inspected)
-        exception : TitleLevelException
-            the raised exception (not inspected)
-
-        Returns
-        -------
-        error_message : None
-            always None, i.e. the pdf is never discarded for this exception
-        """
-        return None
-
-    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
-        """
-        Decide on discarding for a paragraph splitting exception.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (not inspected)
-        exception : ParaSplitException
-            the raised exception (not inspected)
-
-        Returns
-        -------
-        error_message : None
-            always None, i.e. the pdf is never discarded for this exception
-        """
-        return None
-
-    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
-        """
-        Decide on discarding for a paragraph merging exception.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (not inspected)
-        exception : ParaMergeException
-            the raised exception (not inspected)
-
-        Returns
-        -------
-        error_message : None
-            always None, i.e. the pdf is never discarded for this exception
-        """
-        return None
-
-
-class LayoutFilterProcessor:
-    """
-    Mark every paragraph block with whether it lies inside a layout bbox.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def batch_process_blocks(self, pdf_dict):
-        """
-        Set the "in_layout" flag on the paragraph blocks of every page.
-
-        Only entries whose key starts with "page_" and that provide both
-        "layout_bboxes" and "para_blocks" (neither being None) are processed.
-        A block gets ``in_layout = 1`` when its bbox lies inside at least one
-        layout bbox of the page, and ``in_layout = 0`` otherwise.
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            pdf dictionary; the paragraph blocks are updated in place
-
-        Returns
-        -------
-        pdf_dict : dict
-            the same pdf dictionary
-        """
-        for page_id, blocks in pdf_dict.items():
-            if not page_id.startswith("page_"):
-                continue
-            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
-                continue
-
-            layout_bbox_objs = blocks["layout_bboxes"]
-            para_blocks = blocks["para_blocks"]
-            if layout_bbox_objs is None or para_blocks is None:
-                continue
-
-            # Snap each layout bbox outwards to integer coordinates: the
-            # top-left corner is floored and the bottom-right corner is ceiled,
-            # so rounding can only enlarge the box and never cuts off text
-            # lying on its border.
-            layout_bboxes = [
-                [math.floor(x0), math.floor(y0), math.ceil(x1), math.ceil(y1)]
-                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
-            ]
-
-            for para_block in para_blocks:
-                para_bbox = para_block["bbox"]
-                # The flag is decided once over all layout bboxes; resetting it
-                # per layout bbox would let only the last one decide.
-                inside_any_layout = any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes)
-                para_block["in_layout"] = 1 if inside_any_layout else 0
-
-        return pdf_dict
-
-
-class RawBlockProcessor:
-    """
-    Build the "para_blocks" of every page from its raw "preproc_blocks".
-
-    Raw lines whose top and bottom edges both lie within ``y_tolerance`` of
-    the line being built are merged into it, and every span receives a
-    readable "decomposed_flags" dict.
-    """
-
-    def __init__(self) -> None:
-        self.y_tolerance = 2
-        self.pdf_dic = {}
-
-    def __span_flags_decomposer(self, span_flags):
-        """
-        Make font flags human readable.
-
-        Parameters
-        ----------
-        span_flags : int
-            span flags; bits 0 to 4 are read as superscript, italic, serifed,
-            monospaced and bold
-
-        Returns
-        -------
-        decomposed : dict
-            one boolean per style; "is_sans_serifed" is the negation of
-            "is_serifed" and "is_proportional" the negation of "is_monospaced"
-        """
-        is_serifed = bool(span_flags & 2**2)
-        is_monospaced = bool(span_flags & 2**3)
-
-        return {
-            "is_superscript": bool(span_flags & 2**0),
-            "is_italic": bool(span_flags & 2**1),
-            "is_serifed": is_serifed,
-            "is_sans_serifed": not is_serifed,
-            "is_monospaced": is_monospaced,
-            "is_proportional": not is_monospaced,
-            "is_bold": bool(span_flags & 2**4),
-        }
-
-    def __make_new_lines(self, raw_lines):
-        """
-        Merge raw lines that share a text row into new lines.
-
-        Every span of the raw lines is annotated in place with a
-        "decomposed_flags" entry. The "spans" list of the raw lines themselves
-        is never modified: each new line owns a fresh list.
-
-        Parameters
-        ----------
-        raw_lines : list
-            raw lines, each with "bbox", "spans" and optionally "dir"
-
-        Returns
-        -------
-        new_lines : list
-            new lines with "bbox", "text", "dir" and "spans"; the "dir" of a
-            merged line is the component-wise sum of the merged directions
-        """
-        new_lines = []
-        new_line = None
-
-        for raw_line in raw_lines:
-            raw_line_bbox = raw_line["bbox"]
-            raw_line_spans = raw_line["spans"]
-            raw_line_text = "".join(span["text"] for span in raw_line_spans)
-            # A missing or empty direction counts as the zero vector, so the
-            # merge below can always add the components.
-            raw_line_dir = raw_line.get("dir") or (0, 0)
-
-            # Collect the annotated spans into a fresh list so that extending
-            # a merged line never grows the caller's own "spans" list.
-            line_spans = []
-            for span in raw_line_spans:
-                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])
-                line_spans.append(span)
-
-            on_same_row = (
-                new_line is not None
-                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
-                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
-            )
-
-            if on_same_row:
-                new_line["bbox"] = (
-                    min(new_line["bbox"][0], raw_line_bbox[0]),  # left
-                    new_line["bbox"][1],  # top
-                    max(new_line["bbox"][2], raw_line_bbox[2]),  # right
-                    raw_line_bbox[3],  # bottom
-                )
-                new_line["text"] += raw_line_text
-                new_line["spans"].extend(line_spans)
-                new_line["dir"] = (
-                    new_line["dir"][0] + raw_line_dir[0],
-                    new_line["dir"][1] + raw_line_dir[1],
-                )
-                continue
-
-            # The raw line starts a new row: flush the line built so far.
-            if new_line is not None:
-                new_lines.append(new_line)
-            new_line = {
-                "bbox": raw_line_bbox,
-                "text": raw_line_text,
-                "dir": raw_line_dir,
-                "spans": line_spans,
-            }
-
-        if new_line:
-            new_lines.append(new_line)
-
-        return new_lines
-
-    def __make_new_block(self, raw_block):
-        """
-        Build a new block from a raw block.
-
-        Parameters
-        ----------
-        raw_block : dict
-            a raw block with "number", "bbox" and "lines"
-
-        Returns
-        -------
-        new_block : dict
-            block with "block_id", "bbox", "text" (the concatenated span
-            texts) and "lines" (the merged lines)
-        """
-        raw_lines = raw_block["lines"]
-        block_text = "".join(span["text"] for line in raw_lines for span in line["spans"])
-
-        return {
-            "block_id": raw_block["number"],
-            "bbox": raw_block["bbox"],
-            "text": block_text,
-            "lines": self.__make_new_lines(raw_lines),
-        }
-
-    def batch_process_blocks(self, pdf_dic):
-        """
-        Create "para_blocks" for every page of the pdf dictionary.
-
-        For each entry whose key starts with "page_", the blocks found under
-        "preproc_blocks" are converted and stored under "para_blocks". A page
-        without "preproc_blocks" gets an empty "para_blocks" list.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary; updated in place
-
-        Returns
-        -------
-        pdf_dic : dict
-            the same pdf dictionary
-        """
-        for page_id, blocks in pdf_dic.items():
-            if not page_id.startswith("page_"):
-                continue
-
-            raw_blocks = blocks["preproc_blocks"] if "preproc_blocks" in blocks else []
-            blocks["para_blocks"] = [self.__make_new_block(raw_block) for raw_block in raw_blocks]
-
-        return pdf_dic
-
-
-class BlockStatisticsCalculator:
-    """
-    This class calculates the statistics of the block.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def __calc_stats_of_new_lines(self, new_lines):
-        """
-        Compute layout and font statistics over the lines of one block.
-
-        Parameters
-        ----------
-        new_lines : list
-            lines of the block; each line provides "bbox", "text" and "spans"
-            (each span providing "bbox", "font" and "size") and optionally "dir"
-
-        Returns
-        -------
-        X0 : float
-            Median of the line x0 values (0 without lines)
-        X1 : float
-            Median of the line x1 values (0 without lines)
-        avg_char_width : float
-            Average over the lines of line width divided by the number of
-            non-whitespace chars (0 when no line has such chars)
-        avg_char_height : float
-            Average span size over all spans of the block (0 without spans)
-        font_type : str or None
-            Font of the widest span of the block (None without spans of
-            positive width)
-        avg_font_size : float or None
-            Average span size over all spans of the block (None without spans)
-        direction : tuple
-            Component-wise average of the line "dir" values ((0, 0) when no
-            line has a "dir")
-        median_font_size : float or None
-            Median span size over all spans of the block (None without spans)
-        """
-        x0_values = []
-        x1_values = []
-        char_widths = []
-        span_sizes = []
-        directions = []
-
-        widest_span_width = 0
-        widest_span_font = None
-
-        for line in new_lines:
-            line_bbox = line["bbox"]
-            x0_values.append(line_bbox[0])
-            x1_values.append(line_bbox[2])
-
-            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
-            if num_chars > 0:
-                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)
-
-            if "dir" in line:
-                directions.append(line["dir"])
-
-            for span in line["spans"]:
-                # Sizes are accumulated over every line, so the height average
-                # covers the whole block rather than only its last line.
-                span_sizes.append(span["size"])
-
-                span_width = span["bbox"][2] - span["bbox"][0]
-                if span_width > widest_span_width:
-                    widest_span_width = span_width
-                    widest_span_font = span["font"]
-
-        X0 = np.median(x0_values) if x0_values else 0
-        X1 = np.median(x1_values) if x1_values else 0
-        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
-        avg_char_height = sum(span_sizes) / len(span_sizes) if span_sizes else 0
-        avg_font_size = sum(span_sizes) / len(span_sizes) if span_sizes else None
-        median_font_size = float(np.median(span_sizes)) if span_sizes else None
-
-        if directions:
-            avg_dir_horizontal = sum(direction[0] for direction in directions) / len(directions)
-            avg_dir_vertical = sum(direction[1] for direction in directions) / len(directions)
-        else:
-            avg_dir_horizontal = 0
-            avg_dir_vertical = 0
-
-        return (
-            X0,
-            X1,
-            avg_char_width,
-            avg_char_height,
-            widest_span_font,
-            avg_font_size,
-            (avg_dir_horizontal, avg_dir_vertical),
-            median_font_size,
-        )
-
-    def __make_new_block(self, input_block):
-        """
-        Build a new block that carries the statistics of ``input_block``.
-
-        Parameters
-        ----------
-        input_block : dict
-            block with "block_id", "bbox", "text" and "lines"
-
-        Returns
-        -------
-        new_block : dict
-            block with the identifying fields and lines of ``input_block``
-            plus its computed statistics
-        """
-        block_lines = input_block["lines"]
-        (
-            avg_left_boundary,
-            avg_right_boundary,
-            avg_char_width,
-            avg_char_height,
-            font_type,
-            font_size,
-            direction,
-            median_font_size,
-        ) = self.__calc_stats_of_new_lines(block_lines)
-
-        return {
-            "block_id": input_block["block_id"],
-            "bbox": input_block["bbox"],
-            "text": input_block["text"],
-            "dir": direction,
-            "X0": avg_left_boundary,
-            "X1": avg_right_boundary,
-            "avg_char_width": avg_char_width,
-            "avg_char_height": avg_char_height,
-            "block_font_type": font_type,
-            "block_font_size": font_size,
-            "lines": block_lines,
-            "median_font_size": median_font_size,
-        }
-
-    def batch_process_blocks(self, pdf_dic):
-        """
-        Attach statistics to the "para_blocks" of every page.
-
-        For each entry whose key starts with "page_", every block found under
-        "para_blocks" is replaced by a new block that also carries its
-        statistics. A page without "para_blocks" gets an empty list.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary; updated in place
-
-        Returns
-        -------
-        pdf_dic : dict
-            the same pdf dictionary
-        """
-        for page_id, blocks in pdf_dic.items():
-            if not page_id.startswith("page_"):
-                continue
-
-            input_blocks = blocks["para_blocks"] if "para_blocks" in blocks else []
-            blocks["para_blocks"] = [self.__make_new_block(input_block) for input_block in input_blocks]
-
-        return pdf_dic
-
-
-class DocStatisticsCalculator:
-    """
-    This class calculates the statistics of the document.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def calc_stats_of_doc(self, pdf_dict):
-        """
-        Compute document-level statistics and store them under "statistics".
-
-        Only entries whose key starts with "page_" are treated as pages. A
-        page whose "para_blocks" entry is missing or None counts as a page
-        without blocks.
-
-        The two most common (font type, font size) pairs are determined over
-        the blocks whose text is at least half as long as the average block
-        text.
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            pdf dictionary; updated in place
-
-        Returns
-        -------
-        pdf_dict : dict
-            the same pdf dictionary, with the "statistics" entry set
-        """
-        # Resolve the blocks of every page once, so that all passes below
-        # agree on how a missing or None "para_blocks" entry is handled.
-        blocks_per_page = [
-            page.get("para_blocks") or []
-            for page_id, page in pdf_dict.items()
-            if page_id.startswith("page_")
-        ]
-        all_blocks = [block for page_blocks in blocks_per_page for block in page_blocks]
-
-        total_text_length = sum(len(block["text"]) for block in all_blocks)
-        avg_text_length = total_text_length / len(all_blocks) if all_blocks else 0
-
-        font_list = []
-        for block in all_blocks:
-            # Blocks much shorter than average do not vote for the fonts.
-            if len(block.get("text", "")) < avg_text_length * 0.5:
-                continue
-            block_font_type = safe_get(block, "block_font_type", "")
-            block_font_size = safe_get(block, "block_font_size", 0)
-            font_list.append((block_font_type, block_font_size))
-
-        font_counter = Counter(font_list)
-        no_font = (("", 0), 0)
-        most_common_font = font_counter.most_common(1)[0] if font_list else no_font
-        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else no_font
-
-        statistics = {
-            "num_pages": 0,
-            "num_blocks": 0,
-            "num_paras": 0,
-            "num_titles": 0,
-            "num_header_blocks": 0,
-            "num_footer_blocks": 0,
-            "num_watermark_blocks": 0,
-            "num_vertical_margin_note_blocks": 0,
-            "most_common_font_type": most_common_font[0][0],
-            "most_common_font_size": most_common_font[0][1],
-            "number_of_most_common_font": most_common_font[1],
-            "second_most_common_font_type": second_most_common_font[0][0],
-            "second_most_common_font_size": second_most_common_font[0][1],
-            "number_of_second_most_common_font": second_most_common_font[1],
-            "avg_text_length": avg_text_length,
-        }
-
-        for page_blocks in blocks_per_page:
-            statistics["num_pages"] += 1
-            for block_data in page_blocks:
-                statistics["num_blocks"] += 1
-
-                if "paras" in block_data:
-                    statistics["num_paras"] += len(block_data["paras"])
-
-                for line in block_data["lines"]:
-                    if line.get("is_title", 0):
-                        statistics["num_titles"] += 1
-
-                if block_data.get("is_header", 0):
-                    statistics["num_header_blocks"] += 1
-                if block_data.get("is_footer", 0):
-                    statistics["num_footer_blocks"] += 1
-                if block_data.get("is_watermark", 0):
-                    statistics["num_watermark_blocks"] += 1
-                if block_data.get("is_vertical_margin_note", 0):
-                    statistics["num_vertical_margin_note_blocks"] += 1
-
-        pdf_dict["statistics"] = statistics
-
-        return pdf_dict
-
-
-class TitleProcessor:
-    """
-    This class processes the title.
-    """
-
-    def __init__(self, *doc_statistics) -> None:
-        if len(doc_statistics) > 0:
-            self.doc_statistics = doc_statistics[0]
-
-        self.nlp_model = NLPModels()
-        self.MAX_TITLE_LEVEL = 3
-        self.numbered_title_pattern = r"""
-            ^                                 # 行首
-            (                                 # 开始捕获组
-                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
-                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
-                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
-                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
-                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
-                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
-                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
-                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
-                |[\（|\(][一二三四五六七八九十百千]+[\）|\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
-                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
-                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
-                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
-                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
-                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
-                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
-                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
-                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
-            )                                 # 结束捕获组
-            .+                                # 标题的其余部分
-        """
-
-    def _is_potential_title(
-        self,
-        curr_line,
-        prev_line,
-        prev_line_is_title,
-        next_line,
-        avg_char_width,
-        avg_char_height,
-        median_font_size,
-    ):
-        """
-        This function checks if the line is a potential title.
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line
-        next_line : dict
-            next line
-        avg_char_width : float
-            average of char widths
-        avg_char_height : float
-            average of line heights
-
-        Returns
-        -------
-        bool
-            True if the line is a potential title, False otherwise.
-        """
-
-        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
-            """
-            This function checks if the line is centered on the page
-
-            Parameters
-            ----------
-            line_bbox : list
-                bbox of the line
-            page_bbox : list
-                bbox of the page
-            avg_char_width : float
-                average of char widths
-
-            Returns
-            -------
-            bool
-                True if the line is centered on the page, False otherwise.
-            """
-            horizontal_ratio = 0.5
-            horizontal_thres = horizontal_ratio * avg_char_width
-
-            x0, _, x1, _ = line_bbox
-            _, _, page_x1, _ = page_bbox
-
-            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
-
-        def __is_bold_font_line(line):
-            """
-            Check if a line contains any bold font style.
-            """
-
-            def _is_bold_span(span):
-                # if span text is empty or only contains space, return False
-                if not span["text"].strip():
-                    return False
-
-                return bool(span["flags"] & 2**4)  # Check if the font is bold
-
-            for span in line["spans"]:
-                if not _is_bold_span(span):
-                    return False
-
-            return True
-
-        def __is_italic_font_line(line):
-            """
-            Check if a line contains any italic font style.
-            """
-
-            def __is_italic_span(span):
-                return bool(span["flags"] & 2**1)  # Check if the font is italic
-
-            for span in line["spans"]:
-                if not __is_italic_span(span):
-                    return False
-
-            return True
-
-        def __is_punctuation_heavy(line_text):
-            """
-            Check if the line contains a high ratio of punctuation marks, which may indicate
-            that the line is not a title.
-
-            Parameters:
-            line_text (str): Text of the line.
-
-            Returns:
-            bool: True if the line is heavy with punctuation, False otherwise.
-            """
-            # Pattern for common title format like "X.Y. Title"
-            pattern = r"\b\d+\.\d+\..*\b"
-
-            # If the line matches the title format, return False
-            if re.match(pattern, line_text.strip()):
-                return False
-
-            # Find all punctuation marks in the line
-            punctuation_marks = re.findall(r"[^\w\s]", line_text)
-            number_of_punctuation_marks = len(punctuation_marks)
-
-            text_length = len(line_text)
-
-            if text_length == 0:
-                return False
-
-            punctuation_ratio = number_of_punctuation_marks / text_length
-            if punctuation_ratio >= 0.1:
-                return True
-
-            return False
-
-        def __has_mixed_font_styles(spans, strict_mode=False):
-            """
-            This function checks if the line has mixed font styles, the strict mode will compare the font types
-
-            Parameters
-            ----------
-            spans : list
-                spans of the line
-            strict_mode : bool
-                True for strict mode, the font types will be fully compared
-                False for non-strict mode, the font types will be compared by the most longest common prefix
-
-            Returns
-            -------
-            bool
-                True if the line has mixed font styles, False otherwise.
-            """
-            if strict_mode:
-                font_styles = set()
-                for span in spans:
-                    font_style = span["font"].lower()
-                    font_styles.add(font_style)
-
-                return len(font_styles) > 1
-
-            else:  # non-strict mode
-                font_styles = []
-                for span in spans:
-                    font_style = span["font"].lower()
-                    font_styles.append(font_style)
-
-                if len(font_styles) > 1:
-                    longest_common_prefix = os.path.commonprefix(font_styles)
-                    if len(longest_common_prefix) > 0:
-                        return False
-                    else:
-                        return True
-                else:
-                    return False
-
-        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
-            """
-            This function checks if the current line has a different font type from the previous and next lines
-
-            Parameters
-            ----------
-            curr_line_font_type : str
-                font type of the current line
-            prev_line_font_type : str
-                font type of the previous line
-            next_line_font_type : str
-                font type of the next line
-
-            Returns
-            -------
-            bool
-                True if the current line has a different font type from the previous and next lines, False otherwise.
-            """
-            return all(
-                curr_line_font_type != other_font_type.lower()
-                for other_font_type in [prev_line_font_type, next_line_font_type]
-                if other_font_type is not None
-            )
-
-        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
-            """
-            This function checks if the current line has a larger font size than the previous and next lines
-
-            Parameters
-            ----------
-            curr_line_font_size : float
-                font size of the current line
-            prev_line_font_size : float
-                font size of the previous line
-            next_line_font_size : float
-                font size of the next line
-
-            Returns
-            -------
-            bool
-                True if the current line has a larger font size than the previous and next lines, False otherwise.
-            """
-            return all(
-                curr_line_font_size > other_font_size * 1.2
-                for other_font_size in [prev_line_font_size, next_line_font_size]
-                if other_font_size is not None
-            )
-
-        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
-            """
-            This function checks if the current line is similar to the previous line
-
-            Parameters
-            ----------
-            curr_line : dict
-                current line
-            prev_line : dict
-                previous line
-
-            Returns
-            -------
-            bool
-                True if the current line is similar to the previous line, False otherwise.
-            """
-
-            if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
-                return True
-            else:
-                return False
-
-        def __is_same_font_type_of_docAvg(curr_line_font_type):
-            """
-            This function checks if the current line has the same font type as the document average font type
-
-            Parameters
-            ----------
-            curr_line_font_type : str
-                font type of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line has the same font type as the document average font type, False otherwise.
-            """
-            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
-            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
-
-            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
-
-        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
-            """
-            This function checks if the current line has a large enough font size
-
-            Parameters
-            ----------
-            curr_line_font_size : float
-                font size of the current line
-            ratio : float
-                ratio of the current line font size to the document average font size
-
-            Returns
-            -------
-            bool
-                True if the current line has a large enough font size, False otherwise.
-            """
-            doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
-            doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
-            doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
-
-            return curr_line_font_size >= doc_avg_font_size * ratio
-
-        def __is_sufficient_spacing_above_and_below(
-            curr_line_bbox,
-            prev_line_bbox,
-            next_line_bbox,
-            avg_char_height,
-            median_font_size,
-        ):
-            """
-            This function checks if the current line has sufficient spacing above and below
-
-            Parameters
-            ----------
-            curr_line_bbox : list
-                bbox of the current line
-            prev_line_bbox : list
-                bbox of the previous line
-            next_line_bbox : list
-                bbox of the next line
-            avg_char_width : float
-                average of char widths
-            avg_char_height : float
-                average of line heights
-
-            Returns
-            -------
-            bool
-                True if the current line has sufficient spacing above and below, False otherwise.
-            """
-            vertical_ratio = 1.25
-            vertical_thres = vertical_ratio * median_font_size
-
-            _, y0, _, y1 = curr_line_bbox
-
-            sufficient_spacing_above = False
-            if prev_line_bbox:
-                vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
-                sufficient_spacing_above = vertical_spacing_above > vertical_thres
-            else:
-                sufficient_spacing_above = True
-
-            sufficient_spacing_below = False
-            if next_line_bbox:
-                vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
-                sufficient_spacing_below = vertical_spacing_below > vertical_thres
-            else:
-                sufficient_spacing_below = True
-
-            return (sufficient_spacing_above, sufficient_spacing_below)
-
-        def __is_word_list_line_by_rules(curr_line_text):
-            """
-            This function checks if the current line is a word list
-
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line is a name list, False otherwise.
-            """
-            # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[，,;；\s]|$)"
-            name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
-
-            compiled_pattern = re.compile(name_list_pattern)
-
-            if compiled_pattern.search(curr_line_text):
-                return True
-            else:
-                return False
-
-        def __get_text_catgr_by_nlp(curr_line_text):
-            """
-            This function checks if the current line is a name list using nlp model, such as spacy
-
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line is a name list, False otherwise.
-            """
-
-            result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
-
-            return result
-
-        def __is_numbered_title(curr_line_text):
-            """
-            This function checks if the current line is a numbered list
-
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line is a numbered list, False otherwise.
-            """
-
-            compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
-
-            if compiled_pattern.search(curr_line_text):
-                return True
-            else:
-                return False
-
-        def __is_end_with_ending_puncs(line_text):
-            """
-            This function checks if the current line ends with a ending punctuation mark
-
-            Parameters
-            ----------
-            line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line ends with a punctuation mark, False otherwise.
-            """
-            end_puncs = [".", "?", "!", "。", "？", "！", "…"]
-
-            line_text = line_text.rstrip()
-            if line_text[-1] in end_puncs:
-                return True
-
-            return False
-
-        def __contains_only_no_meaning_symbols(line_text):
-            """
-            This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
-            Situation contains:
-            1. Only have punctuation marks
-            2. Only have other non-meaning symbols
-
-            Parameters
-            ----------
-            line_text : str
-                text of the current line
-
-            Returns
-            -------
-            bool
-                True if the current line contains only symbols that have no meaning, False otherwise.
-            """
-
-            punctuation_marks = re.findall(r"[^\w\s]", line_text)  # find all punctuation marks
-            number_of_punctuation_marks = len(punctuation_marks)
-
-            text_length = len(line_text)
-
-            if text_length == 0:
-                return False
-
-            punctuation_ratio = number_of_punctuation_marks / text_length
-            if punctuation_ratio >= 0.9:
-                return True
-
-            return False
-
-        def __is_equation(line_text):
-            """
-            This function checks if the current line is an equation.
-
-            Parameters
-            ----------
-            line_text : str
-
-            Returns
-            -------
-            bool
-                True if the current line is an equation, False otherwise.
-            """
-            equation_reg = r"\$.*?\\overline.*?\$"  # to match interline equations
-
-            if re.search(equation_reg, line_text):
-                return True
-            else:
-                return False
-
-        def __is_title_by_len(text, max_length=200):
-            """
-            This function checks if the current line is a title by length.
-
-            Parameters
-            ----------
-            text : str
-                text of the current line
-
-            max_length : int
-                max length of the title
-
-            Returns
-            -------
-            bool
-                True if the current line is a title, False otherwise.
-
-            """
-            text = text.strip()
-            return len(text) <= max_length
-
-        def __compute_line_font_type_and_size(curr_line):
-            """
-            This function computes the font type and font size of the line.
-
-            Parameters
-            ----------
-            line : dict
-                line
-
-            Returns
-            -------
-            font_type : str
-                font type of the line
-            font_size : float
-                font size of the line
-            """
-            spans = curr_line["spans"]
-            max_accumulated_length = 0
-            max_span_font_size = curr_line["spans"][0]["size"]  # default value, float type
-            max_span_font_type = curr_line["spans"][0]["font"].lower()  # default value, string type
-            for span in spans:
-                if span["text"].isspace():
-                    continue
-                span_length = span["bbox"][2] - span["bbox"][0]
-                if span_length > max_accumulated_length:
-                    max_accumulated_length = span_length
-                    max_span_font_size = span["size"]
-                    max_span_font_type = span["font"].lower()
-
-            return max_span_font_type, max_span_font_size
-
-        def __is_a_consistent_sub_title(pre_line, curr_line):
-            """
-            This function checks if the current line is a consistent sub title.
-
-            Parameters
-            ----------
-            pre_line : dict
-                previous line
-            curr_line : dict
-                current line
-
-            Returns
-            -------
-            bool
-                True if the current line is a consistent sub title, False otherwise.
-            """
-            if pre_line is None:
-                return False
-
-            start_letter_of_pre_line = pre_line["text"][0]
-            start_letter_of_curr_line = curr_line["text"][0]
-
-            has_same_prefix_digit = (
-                start_letter_of_pre_line.isdigit()
-                and start_letter_of_curr_line.isdigit()
-                and start_letter_of_pre_line == start_letter_of_curr_line
-            )
-
-            # prefix text of curr_line satisfies the following title format: x.x
-            prefix_text_pattern = r"^\d+\.\d+"
-            has_subtitle_format = re.match(prefix_text_pattern, curr_line["text"])
-
-            if has_same_prefix_digit or has_subtitle_format:
-                return True
-
-        """
-        Title detecting main Process.
-        """
-
-        """
-        Basic features about the current line.
-        """
-        curr_line_bbox = curr_line["bbox"]
-        curr_line_text = curr_line["text"]
-        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
-
-        if len(curr_line_text.strip()) == 0:  # skip empty lines
-            return False, False
-
-        prev_line_bbox = prev_line["bbox"] if prev_line else None
-        if prev_line:
-            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
-        else:
-            prev_line_font_type, prev_line_font_size = None, None
-
-        next_line_bbox = next_line["bbox"] if next_line else None
-        if next_line:
-            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
-        else:
-            next_line_font_type, next_line_font_size = None, None
-
-        """
-        Aggregated features about the current line.
-        """
-        is_italc_font = __is_italic_font_line(curr_line)
-        is_bold_font = __is_bold_font_line(curr_line)
-
-        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
-        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
-        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
-
-        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
-
-        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
-
-        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
-        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
-
-        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
-
-        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
-        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
-
-        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
-            curr_line_font_size, prev_line_font_size, next_line_font_size
-        )
-
-        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
-            curr_line_font_type, prev_line_font_type, next_line_font_type
-        )
-
-        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
-        )
-
-        is_similar_to_pre_line = __is_similar_to_pre_line(
-            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
-        )
-
-        is_consis_sub_title = __is_a_consistent_sub_title(prev_line, curr_line)
-
-        """
-        Further aggregated features about the current line.
-        
-        Attention:
-            Features that start with __ are for internal use.
-        """
-
-        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
-        )
-        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
-        is_a_left_inline_title = (
-            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
-        )
-
-        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
-        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
-
-        is_title_by_check_pre_and_next_line = (
-            (prev_line is not None or next_line is not None)
-            and has_sufficient_spaces_above
-            and has_sufficient_spaces_below
-            and is_potential_title_font
-        )
-
-        is_numbered_title = __is_numbered_title(curr_line_text) and (
-            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
-        )
-
-        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
-
-        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
-
-        is_equation = __is_equation(curr_line_text)
-
-        is_title_by_len = __is_title_by_len(curr_line_text)
-
-        """
-        Decide if the line is a title.
-        """
-
-        is_title = (
-            is_not_end_with_ending_puncs  # not end with ending punctuation marks
-            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
-            and is_title_by_len  # is a title by length, default max length is 200
-            and not is_equation  # an interline equation should never be a title
-            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
-            and (
-                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
-                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                or (
-                    is_much_larger_font_than_doc_avg
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-                or (
-                    is_font_size_little_less_than_doc_avg
-                    and is_bold_font
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-            )  # Consider the following situations: bold font, much larger font than doc avg, not same font type as doc avg, sufficient spacing above and below
-            and (
-                (
-                    not is_person_or_org_list_line_by_nlp
-                    and (
-                        is_much_larger_font_than_doc_avg
-                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
-                    )
-                )
-                or (
-                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
-                    and not is_a_left_inline_title
-                    and not is_punctuation_heavy
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-                or (
-                    is_person_or_org_list_line_by_nlp
-                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                )
-                or (is_numbered_title and not is_a_left_inline_title)
-            )  # Exclude the following situations: person/org list
-        )
-        # ) or (prev_line_is_title and is_consis_sub_title)
-
-        is_name_or_org_list_to_be_removed = (
-            (is_person_or_org_list_line_by_nlp)
-            and is_punctuation_heavy
-            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-        ) and not is_title
-
-        if is_name_or_org_list_to_be_removed:
-            is_author_or_org_list = True
-        else:
-            is_author_or_org_list = False
-
-        # return is_title, is_author_or_org_list
-
-        """
-        # print reason why the line is a title
-        if is_title:
-            print_green("This line is a title.")
-            print_green("↓" * 10)
-            print()
-            print("curr_line_text: ", curr_line_text)
-            print()
-
-        # print reason why the line is not a title
-        line_text = curr_line_text.strip()
-        test_text = "Career/Personal Life"
-        text_content_condition = line_text == test_text
-        
-        if not is_title and text_content_condition: # Print specific line
-        # if not is_title: # Print each line
-            print_red("This line is not a title.")
-            print_red("↓" * 10)
-
-            print()
-            print("curr_line_text: ", curr_line_text)
-            print()
-
-            if is_not_end_with_ending_puncs:
-                print_green(f"is_not_end_with_ending_puncs")
-            else:
-                print_red(f"is_end_with_ending_puncs")
-
-            if is_not_only_no_meaning_symbols:
-                print_green(f"is_not_only_no_meaning_symbols")
-            else:
-                print_red(f"is_only_no_meaning_symbols")
-
-            if is_title_by_len:
-                print_green(f"is_title_by_len: {is_title_by_len}")
-            else:
-                print_red(f"is_not_title_by_len: {is_title_by_len}")
-
-            if is_equation:
-                print_red(f"is_equation")
-            else:
-                print_green(f"is_not_equation")
-
-            if is_potential_title_font:
-                print_green(f"is_potential_title_font")
-            else:
-                print_red(f"is_not_potential_title_font")
-
-            if is_punctuation_heavy:
-                print_red("is_punctuation_heavy")
-            else:
-                print_green("is_not_punctuation_heavy")
-
-            if is_bold_font:
-                print_green(f"is_bold_font")
-            else:
-                print_red(f"is_not_bold_font")
-
-            if is_font_size_not_less_than_doc_avg:
-                print_green(f"is_larger_font_than_doc_avg")
-            else:
-                print_red(f"is_not_larger_font_than_doc_avg")
-
-            if is_much_larger_font_than_doc_avg:
-                print_green(f"is_much_larger_font_than_doc_avg")
-            else:
-                print_red(f"is_not_much_larger_font_than_doc_avg")
-
-            if is_not_same_font_type_of_docAvg:
-                print_green(f"is_not_same_font_type_of_docAvg")
-            else:
-                print_red(f"is_same_font_type_of_docAvg")
-
-            if is_word_list_line_by_rules:
-                print_red("is_word_list_line_by_rules")
-            else:
-                print_green("is_not_name_list_by_rules")
-
-            if is_person_or_org_list_line_by_nlp:
-                print_red("is_person_or_org_list_line_by_nlp")
-            else:
-                print_green("is_not_person_or_org_list_line_by_nlp")
-
-            if not is_numbered_title:
-                print_red("is_not_numbered_title")
-            else:
-                print_green("is_numbered_title")
-
-            if is_a_left_inline_title:
-                print_red("is_a_left_inline_title")
-            else:
-                print_green("is_not_a_left_inline_title")
-
-            if not is_title_by_check_prev_line:
-                print_red("is_not_title_by_check_prev_line")
-            else:
-                print_green("is_title_by_check_prev_line")
-
-            if not is_title_by_check_next_line:
-                print_red("is_not_title_by_check_next_line")
-            else:
-                print_green("is_title_by_check_next_line")
-
-            if not is_title_by_check_pre_and_next_line:
-                print_red("is_not_title_by_check_pre_and_next_line")
-            else:
-                print_green("is_title_by_check_pre_and_next_line")
-
-        # print_green("Common features:")
-        # print_green("↓" * 10)
-
-        # print(f"    curr_line_font_type: {curr_line_font_type}")
-        # print(f"    curr_line_font_size: {curr_line_font_size}")
-        # print()
-
-        """
-
-        return is_title, is_author_or_org_list
-
-    def _detect_title(self, input_block):
-        """
-        Use the functions 'is_potential_title' to detect titles of each paragraph block.
-        If a line is a title, then the value of key 'is_title' of the line will be set to True.
-        """
-
-        raw_lines = input_block["lines"]
-
-        prev_line_is_title_flag = False
-
-        for i, curr_line in enumerate(raw_lines):
-            prev_line = raw_lines[i - 1] if i > 0 else None
-            next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
-
-            blk_avg_char_width = input_block["avg_char_width"]
-            blk_avg_char_height = input_block["avg_char_height"]
-            blk_media_font_size = input_block["median_font_size"]
-
-            is_title, is_author_or_org_list = self._is_potential_title(
-                curr_line,
-                prev_line,
-                prev_line_is_title_flag,
-                next_line,
-                blk_avg_char_width,
-                blk_avg_char_height,
-                blk_media_font_size,
-            )
-
-            if is_title:
-                curr_line["is_title"] = is_title
-                prev_line_is_title_flag = True
-            else:
-                curr_line["is_title"] = False
-                prev_line_is_title_flag = False
-
-            # print(f"curr_line['text']: {curr_line['text']}")
-            # print(f"curr_line['is_title']: {curr_line['is_title']}")
-            # print(f"prev_line['text']: {prev_line['text'] if prev_line else None}")
-            # print(f"prev_line_is_title_flag: {prev_line_is_title_flag}")
-            # print()
-
-            if is_author_or_org_list:
-                curr_line["is_author_or_org_list"] = is_author_or_org_list
-            else:
-                curr_line["is_author_or_org_list"] = False
-
-        return input_block
-
-    def batch_detect_titles(self, pdf_dic):
-        """
-        Detect title lines on every page and flag title / author-list blocks.
-
-        Only entries of `pdf_dic` whose key starts with "page_" are processed.
-        For each such page the "para_blocks" list is (re)written, every block
-        receives the integer flags "is_block_title" and
-        "is_block_an_author_or_org_list", and the total number of title lines
-        is stored in pdf_dic["statistics"]["num_titles"].
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            result dictionary; it must already hold a "statistics" entry that
-            supports item assignment
-
-        Returns
-        -------
-        pdf_dic : dict
-            the same dictionary object, updated in place
-        """
-        num_titles = 0
-
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                # Pages without a "para_blocks" key end up with an empty list (see assignment below).
-                para_blocks = []
-                if "para_blocks" in blocks.keys():
-                    para_blocks = blocks["para_blocks"]
-
-                    all_single_line_blocks = []
-                    for block in para_blocks:
-                        if len(block["lines"]) == 1:
-                            all_single_line_blocks.append(block)
-
-                    new_para_blocks = []
-                    if not len(all_single_line_blocks) == len(para_blocks):  # Not all blocks are single line blocks.
-                        for para_block in para_blocks:
-                            new_block = self._detect_title(para_block)
-                            new_para_blocks.append(new_block)
-                            # "is_title" values are summed, so each truthy flag counts as one title line.
-                            num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
-                    else:  # All blocks are single line blocks.
-                        # Title detection is skipped here; only "is_title" flags already present on the lines are counted.
-                        for para_block in para_blocks:
-                            new_para_blocks.append(para_block)
-                            num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
-                    para_blocks = new_para_blocks
-
-                blocks["para_blocks"] = para_blocks
-
-                # safe_get is defined elsewhere; presumably a lookup that falls back to the given default -- TODO confirm.
-                for para_block in para_blocks:
-                    # NOTE(review): all() over an empty "lines" list is True, so a block without lines
-                    # is flagged as a title (and, on page_0, as an author/org list) -- confirm this is intended.
-                    all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
-                    para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
-                    if (
-                        all_titles and para_text_len < 200
-                    ):  # total length of the paragraph is less than 200, more than this should not be a title
-                        para_block["is_block_title"] = 1
-                    else:
-                        para_block["is_block_title"] = 0
-
-                    all_name_or_org_list_to_be_removed = all(
-                        safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
-                    )
-                    # Author / organisation lists are only flagged on the page keyed "page_0".
-                    if all_name_or_org_list_to_be_removed and page_id == "page_0":
-                        para_block["is_block_an_author_or_org_list"] = 1
-                    else:
-                        para_block["is_block_an_author_or_org_list"] = 0
-
-        pdf_dic["statistics"]["num_titles"] = num_titles
-
-        return pdf_dic
-
-    def _recog_title_level(self, title_blocks):
-        """
-        Assign a title level (1, 2 or 3) to each title block from its font size.
-
-        The font sizes of all given blocks are summarised by their mean and
-        standard deviation. Sizes at or above mean + std get level 1, sizes at
-        or below mean - std get level 3, and the remaining sizes get level 2
-        when they reach the mean of the sizes strictly inside that band
-        (falling back to the overall mean when the band is empty), else 3.
-        The level is written to block["block_title_level"].
-
-        Parameters
-        ----------
-        title_blocks : list
-            list of dicts, each holding the title block under the key "block"
-
-        Returns
-        -------
-        title_blocks : list
-            the same list; the contained blocks are updated in place
-        """
-
-        # Blocks without "block_font_size" contribute a size of 0 (safe_get default; helper defined elsewhere).
-        font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
-
-        # Use the mean and std of font sizes to remove extreme values
-        mean_font_size = np.mean(font_sizes)
-        std_font_size = np.std(font_sizes)
-        min_extreme_font_size = mean_font_size - std_font_size  # type: ignore
-        max_extreme_font_size = mean_font_size + std_font_size  # type: ignore
-
-        # Compute the threshold for title level
-        middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
-        if middle_font_sizes.size > 0:
-            middle_mean_font_size = np.mean(middle_font_sizes)
-            level_threshold = middle_mean_font_size
-        else:
-            level_threshold = mean_font_size
-
-        for tb in title_blocks:
-            title_block = tb["block"]
-            title_font_size = safe_get(title_block, "block_font_size", 0)
-
-            current_level = 1  # Initialize title level, the biggest level is 1
-
-            # When every title has the same font size the std is 0, so the first
-            # branch matches and all titles receive level 1.
-            if title_font_size >= max_extreme_font_size:
-                current_level = 1
-            elif title_font_size <= min_extreme_font_size:
-                current_level = 3
-            elif float(title_font_size) >= float(level_threshold):
-                current_level = 2
-            else:
-                current_level = 3
-
-            title_block["block_title_level"] = current_level
-
-        return title_blocks
-
-    def batch_recog_title_level(self, pdf_dic):
-        """
-        Collect every block flagged as a title and assign title levels to them.
-
-        Blocks are taken from the "para_blocks" list of each entry whose key
-        starts with "page_"; a block counts as a title when its
-        "is_block_title" value is truthy. The collected blocks are passed to
-        `_recog_title_level` as {"page_id": ..., "block": ...} dicts.
-
-        Parameters
-        ----------
-        pdf_dic : dict
-            result dictionary
-
-        Returns
-        -------
-        pdf_dic : dict
-            the same dictionary object that was passed in
-        """
-        title_blocks = []
-
-        # Collect all titles
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = blocks.get("para_blocks", [])
-                for block in para_blocks:
-                    if block.get("is_block_title"):
-                        # The block dict itself (not a copy) is stored, so later changes to it are visible through pdf_dic.
-                        title_obj = {"page_id": page_id, "block": block}
-                        title_blocks.append(title_obj)
-
-        # Determine title level
-        if title_blocks:
-            # Determine title level based on font size
-            # The returned list is not used afterwards; only pdf_dic is returned.
-            title_blocks = self._recog_title_level(title_blocks)
-
-        return pdf_dic
-
-
-class BlockTerminationProcessor:
-    """
-    Splits the lines of each paragraph block into paragraphs.
-
-    The public entry point is `batch_process_blocks`; the remaining methods are
-    layout heuristics working on line bounding boxes and first-span font data.
-    """
-
-    def __init__(self) -> None:
-        # The processor keeps no state.
-        pass
-
-    def _is_consistent_lines(
-        self,
-        curr_line,
-        prev_line,
-        next_line,
-        consistent_direction,  # 0 for prev, 1 for next, 2 for both
-    ):
-        """
-        Check whether the current line uses the same font as its neighbours.
-
-        Only the first span of each line is inspected: its "size" must be equal
-        and its lower-cased "font" name must be equal.
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line (may be None)
-        next_line : dict
-            next line (may be None)
-        consistent_direction : int
-            0 for prev, 1 for next, 2 for both
-
-        Returns
-        -------
-        bool
-            True if the line is consistent with the requested neighbour(s).
-            False when a requested neighbour is missing or when
-            consistent_direction is not 0, 1 or 2.
-        """
-
-        curr_line_font_size = curr_line["spans"][0]["size"]
-        curr_line_font_type = curr_line["spans"][0]["font"].lower()
-
-        if consistent_direction == 0:
-            if prev_line:
-                prev_line_font_size = prev_line["spans"][0]["size"]
-                prev_line_font_type = prev_line["spans"][0]["font"].lower()
-                return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
-            else:
-                return False
-
-        elif consistent_direction == 1:
-            if next_line:
-                next_line_font_size = next_line["spans"][0]["size"]
-                next_line_font_type = next_line["spans"][0]["font"].lower()
-                return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
-            else:
-                return False
-
-        elif consistent_direction == 2:
-            if prev_line and next_line:
-                prev_line_font_size = prev_line["spans"][0]["size"]
-                prev_line_font_type = prev_line["spans"][0]["font"].lower()
-                next_line_font_size = next_line["spans"][0]["size"]
-                next_line_font_type = next_line["spans"][0]["font"].lower()
-                return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
-                    curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
-                )
-            else:
-                return False
-
-        else:
-            return False
-
-    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
-        """
-        Evaluate the layout heuristic this class calls a "regular line".
-
-        The result is truthy when at least one of the following holds:
-        the vertical distance to the previous or next line exceeds
-        0.5 * avg_line_height; neither horizontal edge of the line lies within
-        0.5 * avg_char_width of X0 / X1; or the previous line's right edge is
-        more than avg_char_width away from X1.
-
-        Parameters
-        ----------
-        curr_line_bbox : list
-            bbox of the current line, unpacked as (x0, y0, x1, y1)
-        prev_line_bbox : list
-            bbox of the previous line (may be None)
-        next_line_bbox : list
-            bbox of the next line (may be None)
-        avg_char_width : float
-            average of char widths
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_line_height : float
-            average of line heights
-
-        Returns
-        -------
-        bool or None
-            Truthy if the line satisfies the heuristic. The value is the result
-            of an `or` chain, so it can be None (or another falsy bbox value)
-            instead of False when prev_line_bbox is falsy.
-        """
-        horizontal_ratio = 0.5
-        vertical_ratio = 0.5
-        horizontal_thres = horizontal_ratio * avg_char_width
-        vertical_thres = vertical_ratio * avg_line_height
-
-        x0, y0, x1, y1 = curr_line_bbox
-
-        x0_near_X0 = abs(x0 - X0) < horizontal_thres
-        x1_near_X1 = abs(x1 - X1) < horizontal_thres
-
-        prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
-
-        sufficient_spacing_above = False
-        if prev_line_bbox:
-            # Distance between the y1 edges of the current and previous line, so it
-            # includes the current line's own height.
-            # NOTE(review): confirm y1-to-y1 (rather than y0 - prev y1) is the intended measure.
-            vertical_spacing_above = y1 - prev_line_bbox[3]
-            sufficient_spacing_above = vertical_spacing_above > vertical_thres
-
-        sufficient_spacing_below = False
-        if next_line_bbox:
-            # Distance between the y0 edges of the next and current line (same remark as above).
-            vertical_spacing_below = next_line_bbox[1] - y0
-            sufficient_spacing_below = vertical_spacing_below > vertical_thres
-
-        # NOTE(review): every clause describes a deviation from the surrounding layout,
-        # which seems at odds with the name "regular" -- confirm the intended meaning with callers.
-        return (
-            (sufficient_spacing_above or sufficient_spacing_below)
-            or (not x0_near_X0 and not x1_near_X1)
-            or prev_line_is_end_of_para
-        )
-
-    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
-        """
-        Check whether the line is a possible start of a paragraph.
-
-        Besides the decision, a confidence score (starting at 0.5 and increased
-        per satisfied cue) and the names of the satisfied cues are returned.
-        The confidence score does not influence the decision itself.
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line (may be None)
-        next_line : dict
-            next line (may be None)
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_char_width : float
-            average of char widths
-        avg_font_size : float
-            average font size; scales the vertical-spacing threshold and is also
-            passed to `_is_regular_line` in its avg_line_height position
-
-        Returns
-        -------
-        tuple
-            (is_start_of_para, start_confidence, decision_path) where
-            is_start_of_para is truthy if the line may start a paragraph,
-            start_confidence is a float and decision_path is a list of str.
-        """
-        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
-        decision_path = []  # Record the decision path
-
-        curr_line_bbox = curr_line["bbox"]
-        prev_line_bbox = prev_line["bbox"] if prev_line else None
-        next_line_bbox = next_line["bbox"] if next_line else None
-
-        indent_ratio = 1
-
-        vertical_ratio = 1.5
-        vertical_thres = vertical_ratio * avg_font_size
-
-        left_horizontal_ratio = 0.5
-        left_horizontal_thres = left_horizontal_ratio * avg_char_width
-
-        right_horizontal_ratio = 2.5
-        right_horizontal_thres = right_horizontal_ratio * avg_char_width
-
-        x0, y0, x1, y1 = curr_line_bbox
-
-        # Indented by more than one average character width relative to X0.
-        indent_condition = x0 > X0 + indent_ratio * avg_char_width
-        if indent_condition:
-            start_confidence += 0.2
-            decision_path.append("indent_condition_met")
-
-        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
-        if x0_near_X0:
-            start_confidence += 0.1
-            decision_path.append("x0_near_X0")
-
-        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
-        if x1_near_X1:
-            start_confidence += 0.1
-            decision_path.append("x1_near_X1")
-
-        if prev_line is None:
-            prev_line_is_end_of_para = True
-            start_confidence += 0.2
-            decision_path.append("no_prev_line")
-        else:
-            # NOTE(review): the line following prev_line is curr_line, yet next_line
-            # (the line after curr_line) is passed as its successor -- confirm whether
-            # curr_line was intended here.
-            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
-            if prev_line_is_end_of_para:
-                start_confidence += 0.1
-                decision_path.append("prev_line_is_end_of_para")
-
-        sufficient_spacing_above = False
-        if prev_line_bbox:
-            # Measured between the y1 edges of both lines (includes this line's height).
-            vertical_spacing_above = y1 - prev_line_bbox[3]
-            sufficient_spacing_above = vertical_spacing_above > vertical_thres
-            if sufficient_spacing_above:
-                start_confidence += 0.2
-                decision_path.append("sufficient_spacing_above")
-
-        sufficient_spacing_below = False
-        if next_line_bbox:
-            # Measured between the y0 edges of both lines.
-            vertical_spacing_below = next_line_bbox[1] - y0
-            sufficient_spacing_below = vertical_spacing_below > vertical_thres
-            if sufficient_spacing_below:
-                start_confidence += 0.2
-                decision_path.append("sufficient_spacing_below")
-
-        is_regular_line = self._is_regular_line(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
-        )
-        if is_regular_line:
-            start_confidence += 0.1
-            decision_path.append("is_regular_line")
-
-        # The decision uses the boolean cues only; start_confidence is informational.
-        is_start_of_para = (
-            (sufficient_spacing_above or sufficient_spacing_below)
-            or (indent_condition)
-            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
-            or prev_line_is_end_of_para
-        )
-        return (is_start_of_para, start_confidence, decision_path)
-
-    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
-        """
-        Check whether the line is a possible end of a paragraph.
-
-        A line is reported as an end only if `end_with_punctuation` accepts its
-        text and, in addition, either it starts near X0 while ending clearly
-        before X1, or the alignment helpers report it as left aligned but not
-        right aligned with respect to the next line.
-
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        next_line : dict
-            next line (may be None)
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_char_width : float
-            average of char widths
-
-        Returns
-        -------
-        tuple
-            (is_end_of_para, end_confidence, decision_path) where is_end_of_para
-            is truthy if the line may end a paragraph, end_confidence is a float
-            starting at 0.5 and decision_path is a list of str naming the
-            satisfied cues.
-        """
-
-        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
-        decision_path = []  # Record the decision path
-
-        curr_line_bbox = curr_line["bbox"]
-        next_line_bbox = next_line["bbox"] if next_line else None
-
-        left_horizontal_ratio = 0.5
-        right_horizontal_ratio = 0.5
-
-        x0, _, x1, y1 = curr_line_bbox
-        # A zero bbox stands in when there is no next line; next_y0 is not used below.
-        next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-
-        x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
-        if x0_near_X0:
-            end_confidence += 0.1
-            decision_path.append("x0_near_X0")
-
-        x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
-        if x1_smaller_than_X1:
-            end_confidence += 0.1
-            decision_path.append("x1_smaller_than_X1")
-
-        # is_line_left_aligned_from_neighbors / is_line_right_aligned_from_neighbors / end_with_punctuation
-        # are defined elsewhere; their exact semantics (including `direction`) are not visible here.
-        # This cue only raises the confidence; it is not part of the final decision.
-        next_line_is_start_of_para = (
-            next_line_bbox
-            and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
-            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
-        )
-        if next_line_is_start_of_para:
-            end_confidence += 0.2
-            decision_path.append("next_line_is_start_of_para")
-
-        is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
-            curr_line_bbox, None, next_line_bbox, avg_char_width
-        )
-        if is_line_left_aligned_from_neighbors_bool:
-            end_confidence += 0.1
-            decision_path.append("line_is_left_aligned_from_neighbors")
-
-        is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
-            curr_line_bbox, None, next_line_bbox, avg_char_width
-        )
-        if not is_line_right_aligned_from_neighbors_bool:
-            end_confidence += 0.1
-            decision_path.append("line_is_not_right_aligned_from_neighbors")
-
-        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
-            (x0_near_X0 and x1_smaller_than_X1)
-            or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
-        )
-
-        return (is_end_of_para, end_confidence, decision_path)
-
-    def _cut_paras_per_block(
-        self,
-        block,
-    ):
-        """
-        Split the lines of one block into paragraphs and attach them to the block.
-
-        Each line is tested with `_is_possible_start_of_para` and
-        `_is_possible_end_of_para`; the resulting index ranges are turned into
-        paragraph dicts stored under block["paras"] with keys "para_0",
-        "para_1", ... Lines after the last detected range are collected into
-        one additional paragraph.
-
-        Parameters
-        ----------
-        block : dict
-            A block holding at least "bbox", "text" and "lines"; optional layout
-            statistics ("X0", "X1", "avg_char_width", "avg_font_size", ...) are
-            read through safe_get with a default of 0.
-
-        Returns
-        -------
-        block : dict
-            The same block object, with the "paras" dict added.
-        """
-
-        def _construct_para(lines, is_block_title, para_title_level):
-            """
-            Construct a paragraph dict from the given lines.
-
-            The paragraph carries its bbox (from calculate_para_bbox, defined
-            elsewhere), the line texts joined with single spaces, the mean span
-            font size, the most frequent span colour, and the font name whose
-            spans cover the largest total bbox width.
-            """
-
-            font_sizes = [span["size"] for line in lines for span in line["spans"]]
-            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
-
-            font_colors = [span["color"] for line in lines for span in line["spans"]]
-            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
-
-            # Accumulate the horizontal extent covered by each font name.
-            font_type_lengths = {}
-            for line in lines:
-                for span in line["spans"]:
-                    font_type = span["font"]
-                    bbox_width = span["bbox"][2] - span["bbox"][0]
-                    if font_type in font_type_lengths:
-                        font_type_lengths[font_type] += bbox_width
-                    else:
-                        font_type_lengths[font_type] = bbox_width
-
-            # get the font type with the longest bbox width
-            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
-
-            para_bbox = calculate_para_bbox(lines)
-            para_text = " ".join(line["text"] for line in lines)
-
-            return {
-                "para_bbox": para_bbox,
-                "para_text": para_text,
-                "para_font_type": most_common_font_type,
-                "para_font_size": avg_font_size,
-                "para_font_color": most_common_font_color,
-                "is_para_title": is_block_title,
-                "para_title_level": para_title_level,
-            }
-
-        # block_bbox and block_text are read (so the keys must exist) but not used afterwards.
-        block_bbox = block["bbox"]
-        block_text = block["text"]
-        block_lines = block["lines"]
-
-        X0 = safe_get(block, "X0", 0)
-        X1 = safe_get(block, "X1", 0)
-        avg_char_width = safe_get(block, "avg_char_width", 0)
-        # avg_char_height is not used afterwards.
-        avg_char_height = safe_get(block, "avg_char_height", 0)
-        avg_font_size = safe_get(block, "avg_font_size", 0)
-
-        is_block_title = safe_get(block, "is_block_title", False)
-        para_title_level = safe_get(block, "block_title_level", 0)
-
-        # Segment into paragraphs
-        para_ranges = []
-        in_paragraph = False
-        start_idx_of_para = None
-
-        # Create the processed paragraphs
-        processed_paras = {}
-        # para_bboxes is filled below but never read or returned.
-        para_bboxes = []
-        end_idx_of_para = 0
-
-        for line_index, line in enumerate(block_lines):
-            curr_line = line
-            prev_line = block_lines[line_index - 1] if line_index > 0 else None
-            next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
-
-            """
-            Start processing paragraphs.
-            """
-
-            # Check if the line is the start of a paragraph
-            # (the confidence and decision path are returned for debugging and not used here)
-            is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
-                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
-            )
-            if not in_paragraph and is_start_of_para:
-                in_paragraph = True
-                start_idx_of_para = line_index
-
-            # Check if the line is the end of a paragraph
-            is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
-                curr_line, next_line, X0, X1, avg_char_width
-            )
-            # The last line of the block always closes an open paragraph.
-            if in_paragraph and (is_end_of_para or not next_line):
-                para_ranges.append((start_idx_of_para, line_index))
-                start_idx_of_para = None
-                in_paragraph = False
-
-        # Add the last paragraph if it is not added
-        if in_paragraph and start_idx_of_para is not None:
-            para_ranges.append((start_idx_of_para, len(block_lines) - 1))
-
-        # Process the matched paragraphs
-        for para_index, (start_idx, end_idx) in enumerate(para_ranges):
-            matched_lines = block_lines[start_idx : end_idx + 1]
-            para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
-            para_key = f"para_{len(processed_paras)}"
-            processed_paras[para_key] = para_properties
-            para_bboxes.append(para_properties["para_bbox"])
-            end_idx_of_para = end_idx + 1
-
-        # Deal with the remaining lines
-        # NOTE(review): only lines after the last range are recovered here; lines that fall
-        # before the first range or between two ranges are not placed in any paragraph -- confirm.
-        if end_idx_of_para < len(block_lines):
-            unmatched_lines = block_lines[end_idx_of_para:]
-            unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
-            unmatched_key = f"para_{len(processed_paras)}"
-            processed_paras[unmatched_key] = unmatched_properties
-            para_bboxes.append(unmatched_properties["para_bbox"])
-
-        block["paras"] = processed_paras
-
-        return block
-
-    def batch_process_blocks(self, pdf_dict):
-        """
-        Cut the paragraph blocks of all pages into paragraphs.
-
-        Every entry whose key starts with "page_" gets its "para_blocks" list
-        rebuilt from the blocks returned by `_cut_paras_per_block` (an empty
-        list when the page has no "para_blocks" key). The total number of
-        paragraphs is stored in pdf_dict["statistics"]["num_paras"].
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary; it must already hold a "statistics" entry that
-            supports item assignment.
-
-        Returns
-        -------
-        pdf_dict : dict
-            The same dictionary object, updated in place.
-
-        """
-
-        num_paras = 0
-
-        for page_id, page in pdf_dict.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in page.keys():
-                    input_blocks = page["para_blocks"]
-                    for input_block in input_blocks:
-                        new_block = self._cut_paras_per_block(input_block)
-                        para_blocks.append(new_block)
-                        num_paras += len(new_block["paras"])
-
-                page["para_blocks"] = para_blocks
-
-        pdf_dict["statistics"]["num_paras"] = num_paras
-        return pdf_dict
-
-
-class BlockContinuationProcessor:
-    """
-    This class is used to process the blocks to detect block continuations.
-    """
-
-    def __init__(self) -> None:
-        # Nothing to initialise in the constructor.
-        pass
-
-    def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3):
-        """
-        Check whether two font types are similar.
-
-        Two font types are similar when they are equal, or when the length of
-        their common prefix is at least `prefix_length_ratio` times the length
-        of the shorter name (the product is truncated to an int).
-
-        Parameters
-        ----------
-        font_type_1 : str or list
-            font type 1; for a list only the first element is used ("" if empty)
-        font_type_2 : str or list
-            font type 2; for a list only the first element is used ("" if empty)
-        prefix_length_ratio : float
-            minimum ratio of the common prefix length to the length of the shorter font type
-
-        Returns
-        -------
-        bool
-            True if the two font types are similar, False otherwise.
-        """
-
-        if isinstance(font_type_1, list):
-            font_type_1 = font_type_1[0] if font_type_1 else ""
-        if isinstance(font_type_2, list):
-            font_type_2 = font_type_2[0] if font_type_2 else ""
-
-        if font_type_1 == font_type_2:
-            return True
-
-        # Find the length of the common prefix
-        # (os.path.commonprefix compares character by character, so it works on plain strings)
-        common_prefix_length = len(os.path.commonprefix([font_type_1, font_type_2]))
-
-        # Calculate the minimum prefix length based on the ratio
-        # NOTE(review): int() truncates, so with the default ratio a shorter name of up to
-        # 3 characters (including "") yields a minimum of 0 and any pair counts as similar.
-        min_prefix_length = int(min(len(font_type_1), len(font_type_2)) * prefix_length_ratio)
-
-        return common_prefix_length >= min_prefix_length
-
-    def __is_same_block_font(self, block_1, block_2):
-        """
-        Compare the fonts of two blocks.
-
-        The blocks match when their font types are similar (see
-        `__is_similar_font_type`), their font sizes differ by less than 1 and
-        their average char widths are close enough. Blocks with a zero average
-        char width or with empty text never match.
-
-        Parameters
-        ----------
-        block_1 : dict
-            first block
-        block_2 : dict
-            second block
-
-        Returns
-        -------
-        is_same : bool
-            True if block_1 and block_2 have the same font, else False
-        """
-        # safe_get is defined elsewhere; presumably a lookup that falls back to the given default -- TODO confirm.
-        block_1_font_type = safe_get(block_1, "block_font_type", "")
-        block_1_font_size = safe_get(block_1, "block_font_size", 0)
-        block_1_avg_char_width = safe_get(block_1, "avg_char_width", 0)
-
-        block_2_font_type = safe_get(block_2, "block_font_type", "")
-        block_2_font_size = safe_get(block_2, "block_font_size", 0)
-        block_2_avg_char_width = safe_get(block_2, "avg_char_width", 0)
-
-        # A list of sizes is reduced to its first element (0 when the list is empty).
-        if isinstance(block_1_font_size, list):
-            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
-        if isinstance(block_2_font_size, list):
-            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
-
-        block_1_text = safe_get(block_1, "text", "")
-        block_2_text = safe_get(block_2, "text", "")
-
-        # Also guards the divisions by min(...) below.
-        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
-            return False
-
-        if not block_1_text or not block_2_text:
-            return False
-        else:
-            # When block_2 holds less than 20% of block_1's text length, a relative
-            # char-width difference of up to 0.5 is tolerated; otherwise up to 0.2.
-            text_len_ratio = len(block_2_text) / len(block_1_text)
-            if text_len_ratio < 0.2:
-                avg_char_width_condition = (
-                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
-                    < 0.5
-                )
-            else:
-                avg_char_width_condition = (
-                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
-                    < 0.2
-                )
-
-        block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1
-
-        return (
-            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
-            and avg_char_width_condition
-            and block_font_size_condition
-        )
-
-    def _is_alphabet_char(self, char):
-        """
-        Return True if `char` lies in the ranges U+0041-U+005A (A-Z) or U+0061-U+007A (a-z).
-
-        The check is a plain string comparison; it is meant for single characters.
-        """
-        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
-            return True
-        else:
-            return False
-
-    def _is_chinese_char(self, char):
-        """
-        Return True if `char` lies in the range U+4E00-U+9FA5.
-
-        The check is a plain string comparison; it is meant for single characters.
-        """
-        if char >= "\u4e00" and char <= "\u9fa5":
-            return True
-        else:
-            return False
-
-    def _is_other_letter_char(self, char):
-        """
-        Return True for a cased letter that is neither ASCII nor in the Chinese range.
-
-        Only the Unicode categories "Lu" (uppercase letter) and "Ll" (lowercase
-        letter) qualify; characters of any other category return False.
-        If `unicodedata.category` raises TypeError (input is not a single
-        character), a message is printed and False is returned.
-        """
-        try:
-            cat = unicodedata.category(char)
-            if cat == "Lu" or cat == "Ll":
-                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
-        except TypeError:
-            print("The input to the function must be a single character.")
-        return False
-
-    def _is_year(self, s: str):
-        """
-        Return True if `s` parses as an integer between 1900 and 2099 inclusive.
-
-        Strings that `int()` rejects with ValueError return False.
-        """
-        try:
-            number = int(s)
-            return 1900 <= number <= 2099
-        except ValueError:
-            return False
-
-    def _match_brackets(self, text):
-        """
-        Return True if `text` starts with one of the bracket or quote characters in the pattern.
-        """
-        # Earlier, wider pattern kept for reference; the active pattern below omits
-        # the openers [ 【 < ＜ 〔 〘 that this one contained.
-        # pattern = r"^[\(\)\[\]（）【】{}｛｝<>＜＞〔〕〘〙\"\'“”‘’]"
-        pattern = r"^[\(\)\]（）】{}｛｝>＞〕〙\"\'“”‘’]"
-        return bool(re.match(pattern, text))
-
-    def _is_para_font_consistent(self, para_1, para_2):
-        """
-        Compare the fonts of two paragraphs.
-
-        The paragraphs are consistent when their font types are similar (see
-        `__is_similar_font_type`) and their font sizes differ by less than 1.5.
-        The font colours are read but currently not compared.
-
-        Parameters
-        ----------
-        para_1 : dict
-            first paragraph (may be None)
-        para_2 : dict
-            second paragraph (may be None)
-
-        Returns
-        -------
-        is_same : bool
-            True if para_1 and para_2 have the same font, else False.
-            False when either paragraph is None.
-        """
-        if para_1 is None or para_2 is None:
-            return False
-
-        # safe_get is defined elsewhere; presumably a lookup that falls back to the given default -- TODO confirm.
-        para_1_font_type = safe_get(para_1, "para_font_type", "")
-        para_1_font_size = safe_get(para_1, "para_font_size", 0)
-        para_1_font_color = safe_get(para_1, "para_font_color", "")
-
-        para_2_font_type = safe_get(para_2, "para_font_type", "")
-        para_2_font_size = safe_get(para_2, "para_font_size", 0)
-        para_2_font_color = safe_get(para_2, "para_font_color", "")
-
-        # NOTE(review): an empty list raises here (max() of an empty set / division by zero) -- confirm callers never pass one.
-        if isinstance(para_1_font_type, list):  # get the most common font type
-            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
-        if isinstance(para_2_font_type, list):
-            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
-        if isinstance(para_1_font_size, list):  # compute average font size
-            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
-        if isinstance(para_2_font_size, list):  # compute average font size
-            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
-
-        return (
-            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
-            and abs(para_1_font_size - para_2_font_size) < 1.5
-            # and para_font_color1 == para_font_color2
-        )
-
-    def _is_para_puncs_consistent(self, para_1, para_2):
-        """
-        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
-
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 are from the same paragraph by using the puncs, else False
-        """
-        para_1_text = safe_get(para_1, "para_text", "").strip()
-        para_2_text = safe_get(para_2, "para_text", "").strip()
-
-        para_1_bboxes = safe_get(para_1, "para_bbox", [])
-        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
-
-        para_2_bboxes = safe_get(para_2, "para_bbox", [])
-        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
-
-        # print_yellow("    Features of determine puncs_consistent:")
-        # print(f"    para_1_text: {para_1_text}")
-        # print(f"    para_2_text: {para_2_text}")
-        # print(f"    para_1_bboxes: {para_1_bboxes}")
-        # print(f"    para_2_bboxes: {para_2_bboxes}")
-        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
-        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
-
-        if is_nested_list(para_1_bboxes):
-            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
-        else:
-            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
-
-        if is_nested_list(para_2_bboxes):
-            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
-            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
-        else:
-            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
-
-        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
-        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
-
-        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
-        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
-        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
-
-        # Check if either para_text1 or para_text2 is empty
-        if not para_1_text or not para_2_text:
-            return False
-
-        # Define the end puncs for a sentence to end and hyphen
-        end_puncs = [".", "?", "!", "。", "？", "！", "…"]
-        hyphen = ["-", "—"]
-
-        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
-        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
-        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
-        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
-        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
-
-        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
-        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
-        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
-        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
-
-        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
-            # print_red(f"para_1 is end with hyphen.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] in hyphen
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-
-        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
-            # print_red(f"para_1 is end with end_punc.")
-            para_2_is_consistent = (
-                para_2_text
-                and (
-                    para_2_text[0]
-                    == " "
-                    # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
-                    # or (self._is_chinese_char(para_2_text[0]))
-                    # or (self._is_other_letter_char(para_2_text[0]))
-                )
-                and not is_para2_left_indent_than_papa1
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-
-        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
-            # print_red(f"para_1 is NOT end with end_punc.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] == " "
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_alphabet_char(para_2_text[0]))
-                or (self._is_year(para_2_text[0:4]))
-                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-                or (self._match_brackets(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-
-        elif para_1_end_with_space:  # If para_text1 ends with space
-            # print_red(f"para_1 is end with space.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] == " "
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                pass
-                # print(f"para_2 is not consistent.\n")
-
-        return False
-
-    def _is_block_consistent(self, block_1, block_2):
-        """
-        This function determines whether block1 and block2 are originally from the same block
-
-        Parameters
-        ----------
-        block1 : dict
-            block1s
-        block2 : dict
-            block2
-
-        Returns
-        -------
-        is_same : bool
-            True if block1 and block2 are from the same block, else False
-        """
-        return self.__is_same_block_font(block_1, block_2)
-
-    def _is_para_continued(self, para_1, para_2):
-        """
-        This function determines whether para1 and para2 are originally from the same paragraph
-
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 are from the same paragraph, else False
-        """
-        is_para_font_consistent = self._is_para_font_consistent(para_1, para_2)
-        is_para_puncs_consistent = self._is_para_puncs_consistent(para_1, para_2)
-
-        return is_para_font_consistent and is_para_puncs_consistent
-
-    def _are_boundaries_of_block_consistent(self, block_1, block_2):
-        """
-        This function checks if the boundaries of block1 and block2 are consistent
-
-        Parameters
-        ----------
-        block1 : dict
-            block1
-
-        block2 : dict
-            block2
-
-        Returns
-        -------
-        is_consistent : bool
-            True if the boundaries of block1 and block2 are consistent, else False
-        """
-
-        last_line_of_block_1 = block_1["lines"][-1]
-        first_line_of_block_2 = block_2["lines"][0]
-
-        spans_of_last_line_of_block_1 = last_line_of_block_1["spans"]
-        spans_of_first_line_of_block_2 = first_line_of_block_2["spans"]
-
-        font_type_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["font"].lower()
-        font_size_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["size"]
-        font_color_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["color"]
-        font_flags_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["flags"]
-
-        font_type_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["font"].lower()
-        font_size_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["size"]
-        font_color_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["color"]
-        font_flags_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["flags"]
-
-        return (
-            self.__is_similar_font_type(font_type_of_last_line_of_block_1, font_type_of_first_line_of_block_2)
-            and abs(font_size_of_last_line_of_block_1 - font_size_of_first_line_of_block_2) < 1
-            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
-            and font_flags_of_last_line_of_block_1 == font_flags_of_first_line_of_block_2
-        )
-
-    def should_merge_next_para(self, curr_para, next_para):
-        """
-        This function checks if the next_para should be merged into the curr_para.
-
-        Parameters
-        ----------
-        curr_para : dict
-            The current paragraph.
-        next_para : dict
-            The next paragraph.
-
-        Returns
-        -------
-        bool
-            True if the next_para should be merged into the curr_para, False otherwise.
-        """
-        if self._is_para_continued(curr_para, next_para):
-            return True
-        else:
-            return False
-
-    def batch_tag_paras(self, pdf_dict):
-        """
-        This function tags the paragraphs in the pdf_dict.
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary.
-
-        Returns
-        -------
-        pdf_dict : dict
-            PDF dictionary with tagged paragraphs.
-        """
-        the_last_page_id = len(pdf_dict) - 1
-
-        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
-            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
-                para_blocks_of_curr_page = curr_page_content["para_blocks"]
-                next_page_idx = curr_page_idx + 1
-                next_page_id = f"page_{next_page_idx}"
-                next_page_content = pdf_dict.get(next_page_id, {})
-
-                for i, current_block in enumerate(para_blocks_of_curr_page):
-                    for para_id, curr_para in current_block["paras"].items():
-                        curr_para["curr_para_location"] = [
-                            curr_page_idx,
-                            current_block["block_id"],
-                            int(para_id.split("_")[-1]),
-                        ]
-                        curr_para["next_para_location"] = None  # 默认设置为None
-                        curr_para["merge_next_para"] = False  # 默认设置为False
-
-                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
-
-                    if next_block:
-                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
-                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
-
-                        next_block_first_para_key = list(next_block["paras"].keys())[0]
-                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
-
-                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
-                            curr_blk_last_para["next_para_location"] = [
-                                curr_page_idx,
-                                next_block["block_id"],
-                                int(next_block_first_para_key.split("_")[-1]),
-                            ]
-                            curr_blk_last_para["merge_next_para"] = True
-                    else:
-                        # Handle the case where the next block is in a different page
-                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
-                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
-
-                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
-                            next_page_idx += 1
-                            next_page_id = f"page_{next_page_idx}"
-                            next_page_content = pdf_dict.get(next_page_id, {})
-
-                        if next_page_content.get("para_blocks", []):
-                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
-                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
-
-                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
-                                curr_blk_last_para["next_para_location"] = [
-                                    next_page_idx,
-                                    next_page_content["para_blocks"][0]["block_id"],
-                                    int(next_blk_first_para_key.split("_")[-1]),
-                                ]
-                                curr_blk_last_para["merge_next_para"] = True
-
-        return pdf_dict
-
-    def find_block_by_id(self, para_blocks, block_id):
-        """
-        This function finds a block by its id.
-
-        Parameters
-        ----------
-        para_blocks : list
-            List of blocks.
-        block_id : int
-            Id of the block to find.
-
-        Returns
-        -------
-        block : dict
-            The block with the given id.
-        """
-        for blk_idx, block in enumerate(para_blocks):
-            if block.get("block_id") == block_id:
-                return block
-        return None
-
-    def batch_merge_paras(self, pdf_dict):
-        """
-        This function merges the paragraphs in the pdf_dict.
-
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary.
-
-        Returns
-        -------
-        pdf_dict : dict
-            PDF dictionary with merged paragraphs.
-        """
-        for page_id, page_content in pdf_dict.items():
-            if page_id.startswith("page_") and page_content.get("para_blocks", []):
-                para_blocks_of_page = page_content["para_blocks"]
-
-                for i in range(len(para_blocks_of_page)):
-                    current_block = para_blocks_of_page[i]
-                    paras = current_block["paras"]
-
-                    for para_id, curr_para in list(paras.items()):
-                        # print(f"current para_id: {para_id}")
-                        # 跳过标题段落
-                        if curr_para.get("is_para_title"):
-                            continue
-
-                        while curr_para.get("merge_next_para"):
-                            curr_para_location = curr_para.get("curr_para_location")
-                            next_para_location = curr_para.get("next_para_location")
-
-                            # print(f"curr_para_location: {curr_para_location}, next_para_location: {next_para_location}")
-                            
-                            if not next_para_location:
-                                break
-
-                            if curr_para_location == next_para_location:
-                                # print_red("The next para is in the same block as the current para.")
-                                curr_para["merge_next_para"] = False
-                                break
-
-                            next_page_idx, next_block_id, next_para_id = next_para_location
-                            next_page_id = f"page_{next_page_idx}"
-                            next_page_content = pdf_dict.get(next_page_id)
-                            if not next_page_content:
-                                break
-
-                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
-
-                            if not next_block:
-                                break
-
-                            next_para = next_block["paras"].get(f"para_{next_para_id}")
-
-                            if not next_para or next_para.get("is_para_title"):
-                                break
-
-                            # 合并段落文本
-                            curr_para_text = curr_para.get("para_text", "")
-                            next_para_text = next_para.get("para_text", "")
-                            curr_para["para_text"] = curr_para_text + " " + next_para_text
-
-                            # 更新 next_para_location
-                            curr_para["next_para_location"] = next_para.get("next_para_location")
-
-                            # 将下一个段落文本置为空，表示已被合并
-                            next_para["para_text"] = ""
-
-                            # 更新 merge_next_para 标记
-                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
-
-        return pdf_dict
-
-
-class DrawAnnos:
-    """
-    This class draws annotations on the pdf file
-
-    ----------------------------------------
-                Color Code
-    ----------------------------------------
-        Red: (1, 0, 0)
-        Green: (0, 1, 0)
-        Blue: (0, 0, 1)
-        Yellow: (1, 1, 0) - mix of red and green
-        Cyan: (0, 1, 1) - mix of green and blue
-        Magenta: (1, 0, 1) - mix of red and blue
-        White: (1, 1, 1) - red, green and blue full intensity
-        Black: (0, 0, 0) - no color component whatsoever
-        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
-        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def __is_nested_list(self, lst):
-        """
-        This function returns True if the given list is a nested list of any degree.
-        """
-        if isinstance(lst, list):
-            return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
-        return False
-
-    def __valid_rect(self, bbox):
-        # Ensure that the rectangle is not empty or invalid
-        if isinstance(bbox[0], list):
-            return False  # It's a nested list, hence it can't be valid rect
-        else:
-            return bbox[0] < bbox[2] and bbox[1] < bbox[3]
-
-    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
-        """
-        This function draws the nested boxes
-
-        Parameters
-        ----------
-        page : fitz.Page
-            page
-        nested_bbox : list
-            nested bbox
-        color : tuple
-            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
-        """
-        if self.__is_nested_list(nested_bbox):  # If it's a nested list
-            for bbox in nested_bbox:
-                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
-        elif self.__valid_rect(nested_bbox):  # If valid rectangle
-            para_rect = fitz.Rect(nested_bbox)
-            para_anno = page.add_rect_annot(para_rect)
-            para_anno.set_colors(stroke=color)  # draw with cyan color for combined paragraph
-            para_anno.set_border(width=1)
-            para_anno.update()
-
-    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
-        """
-        This function draws annotations on the pdf file.
-
-        Parameters
-        ----------
-        input_pdf_path : str
-            path to the input pdf file
-        pdf_dic : dict
-            pdf dictionary
-        output_pdf_path : str
-            path to the output pdf file
-
-        pdf_dic : dict
-            pdf dictionary
-        """
-        pdf_doc = open_pdf(input_pdf_path)
-
-        if pdf_dic is None:
-            pdf_dic = {}
-
-        if output_pdf_path is None:
-            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
-
-        for page_id, page in enumerate(pdf_doc):  # type: ignore
-            page_key = f"page_{page_id}"
-            for ele_key, ele_data in pdf_dic[page_key].items():
-                if ele_key == "para_blocks":
-                    para_blocks = ele_data
-                    for para_block in para_blocks:
-                        if "paras" in para_block.keys():
-                            paras = para_block["paras"]
-                            for para_key, para_content in paras.items():
-                                para_bbox = para_content["para_bbox"]
-                                # print(f"para_bbox: {para_bbox}")
-                                # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
-                                if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
-                                    color = (0, 1, 1)
-                                    self.__draw_nested_boxes(
-                                        page, para_bbox, color
-                                    )  # draw with cyan color for combined paragraph
-                                else:
-                                    if self.__valid_rect(para_bbox):
-                                        para_rect = fitz.Rect(para_bbox)
-                                        para_anno = page.add_rect_annot(para_rect)
-                                        para_anno.set_colors(stroke=(0, 1, 0))  # draw with green color for normal paragraph
-                                        para_anno.set_border(width=0.5)
-                                        para_anno.update()
-
-                                is_para_title = para_content["is_para_title"]
-                                if is_para_title:
-                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
-                                        color = (0, 0, 1)
-                                        self.__draw_nested_boxes(
-                                            page, para_content["para_bbox"], color
-                                        )  # draw with cyan color for combined title
-                                    else:
-                                        if self.__valid_rect(para_content["para_bbox"]):
-                                            para_rect = fitz.Rect(para_content["para_bbox"])
-                                            if self.__valid_rect(para_content["para_bbox"]):
-                                                para_anno = page.add_rect_annot(para_rect)
-                                                para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
-                                                para_anno.set_border(width=0.5)
-                                                para_anno.update()
-
-        pdf_doc.save(output_pdf_path)
-        pdf_doc.close()
-
-
-class ParaProcessPipeline:
-    def __init__(self) -> None:
-        pass
-
-    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
-        """
-        This function processes the paragraphs, including:
-        1. Read raw input json file into pdf_dic
-        2. Detect and replace equations
-        3. Combine spans into a natural line
-        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
-        5. Compute statistics for each block
-        6. Detect titles in the document
-        7. Detect paragraphs inside each block
-        8. Divide the level of the titles
-        9. Detect and combine paragraphs from different blocks into one paragraph
-        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
-        11. Draw annotations on the pdf file
-
-        Parameters
-        ----------
-        pdf_dic_json_fpath : str
-            path to the pdf dictionary json file.
-            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
-        input_pdf_doc : str
-            path to the input pdf file
-        output_pdf_path : str
-            path to the output pdf file
-
-        Returns
-        -------
-        pdf_dict : dict
-            result dictionary
-        """
-
-        error_info = None
-
-        output_json_file = ""
-        output_dir = ""
-
-        if input_pdf_path is not None:
-            input_pdf_path = os.path.abspath(input_pdf_path)
-
-            # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
-
-        if output_pdf_path is not None:
-            output_dir = os.path.dirname(output_pdf_path)
-            output_json_file = f"{output_dir}/pdf_dic.json"
-
-        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
-            """
-            Save the pdf_dic to a json file
-            """
-            output_pdf_file_name = os.path.basename(output_pdf_path)
-            # output_dir = os.path.dirname(output_pdf_path)
-            output_dir = "\\tmp\\pdf_parse"
-            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
-            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
-
-            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
-
-            if para_debug_mode == "full":
-                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
-                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
-
-            # Validate the output already exists
-            if not os.path.exists(pdf_dic_json_fpath):
-                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
-                return None
-            else:
-                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
-
-            return pdf_dic_json_fpath
-
-        """
-        Preprocess the lines of block
-        """
-        # Combine spans into a natural line
-        rawBlockProcessor = RawBlockProcessor()
-        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
-        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
-
-        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
-        layoutFilter = LayoutFilterProcessor()
-        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
-
-        # Compute statistics for each block
-        blockStatisticsCalculator = BlockStatisticsCalculator()
-        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
-        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
-
-        # Compute statistics for all blocks(namely this pdf document)
-        docStatisticsCalculator = DocStatisticsCalculator()
-        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
-        # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
-
-        # Dump the first three stages of pdf_dic to a json file
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
-
-        """
-        Detect titles in the document
-        """
-        doc_statistics = pdf_dic["statistics"]
-        titleProcessor = TitleProcessor(doc_statistics)
-        pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
-
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
-
-        """
-        Detect and divide the level of the titles
-        """
-        titleProcessor = TitleProcessor()
-
-        pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)
-
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
-
-        """
-        Detect and split paragraphs inside each block
-        """
-        blockInnerParasProcessor = BlockTerminationProcessor()
-
-        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
-
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
-
-        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
-        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
-
-        """
-        Detect and combine paragraphs from different blocks into one paragraph
-        """
-        blockContinuationProcessor = BlockContinuationProcessor()
-
-        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
-        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
-
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
-
-        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
-        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
-
-        """
-        Discard pdf files by checking exceptions and return the error info to the caller
-        """
-        discardByException = DiscardByException()
-
-        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
-            pdf_dic, exception=DenseSingleLineBlockException()
-        )
-        is_discard_by_title_detection = discardByException.discard_by_title_detection(
-            pdf_dic, exception=TitleDetectionException()
-        )
-        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
-        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
-        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
-
-        if is_discard_by_single_line_block is not None:
-            error_info = is_discard_by_single_line_block
-        elif is_discard_by_title_detection is not None:
-            error_info = is_discard_by_title_detection
-        elif is_discard_by_title_level is not None:
-            error_info = is_discard_by_title_level
-        elif is_discard_by_split_para is not None:
-            error_info = is_discard_by_split_para
-        elif is_discard_by_merge_para is not None:
-            error_info = is_discard_by_merge_para
-
-        if error_info is not None:
-            return pdf_dic, error_info
-
-        """
-        Dump the final pdf_dic to a json file
-        """
-        if para_debug_mode is not None:
-            with open(output_json_file, "w", encoding="utf-8") as f:
-                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
-
-        """
-        Draw the annotations
-        """
-        if para_debug_mode is not None:
-            drawAnnos = DrawAnnos()
-            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
-
-        """
-        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
-        """
-        if para_debug_mode is not None:
-            for fpath in os.listdir(output_dir):
-                if fpath.endswith(".json") and "stage" in fpath:
-                    os.remove(os.path.join(output_dir, fpath))
-
-        return pdf_dic, error_info
-
-
-"""
-Run this script to test the function with Command: 
-
-python detect_para.py [pdf_path] [output_pdf_path]
-
-Params:
- pdf_path: the path of the pdf file
- output_pdf_path: the path of the output pdf file
-"""
-
-if __name__ == "__main__":
-    DEFAULT_PDF_PATH = (
-        "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
-    )
-    input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
-    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
-    output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_path.split(".")[0] + "_recogPara.json"
-
-    import stat
-
-    # Remove existing output file if it exists
-    if os.path.exists(output_pdf_path):
-        os.chmod(output_pdf_path, stat.S_IWRITE)
-        os.remove(output_pdf_path)
-
-    input_pdf_doc = open_pdf(input_pdf_path)
-
-    # postprocess the paragraphs
-    paraProcessPipeline = ParaProcessPipeline()
-
-    # parse paragraph and save to json file
-    pdf_dic = {}
-
-    blockInnerParasProcessor = BlockTerminationProcessor()
-
-    """
-    Construct the pdf dictionary.
-    """
-
-    for page_id, page in enumerate(input_pdf_doc):  # type: ignore
-        # print(f"Processing page {page_id}")
-        # print(f"page: {page}")
-        raw_blocks = page.get_text("dict")["blocks"]
-
-        # Save text blocks to "preproc_blocks"
-        preproc_blocks = []
-        for block in raw_blocks:
-            if block["type"] == 0:
-                preproc_blocks.append(block)
-
-        layout_bboxes = []
-
-        # Construct the pdf dictionary as schema above
-        page_dict = {
-            "para_blocks": None,
-            "preproc_blocks": preproc_blocks,
-            "images": None,
-            "tables": None,
-            "interline_equations": None,
-            "inline_equations": None,
-            "layout_bboxes": None,
-            "pymu_raw_blocks": None,
-            "global_statistic": None,
-            "droped_text_block": None,
-            "droped_image_block": None,
-            "droped_table_block": None,
-            "image_backup": None,
-            "table_backup": None,
-        }
-
-        pdf_dic[f"page_{page_id}"] = page_dict
-
-    # print(f"pdf_dic: {pdf_dic}")
-
-    with open(output_json_path, "w", encoding="utf-8") as f:
-        json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
-
-    pdf_dic = paraProcessPipeline.para_process_pipeline(output_json_path, input_pdf_doc, output_pdf_path)