Initial commit

c2e5c36f · 赵小蒙 · c2e5c36f · c2e5c36f · c2e5c36f · c2e5c36f
Commit c2e5c36f authored Feb 29, 2024 by 赵小蒙
20 changed files
--- a/para/title_processor.py
+++ b/para/title_processor.py
+import os
+import sys
+import re
+import numpy as np
+from libs.nlp_utils import NLPModels
+from para.commons import *
+# Switch stdout to UTF-8 at import time. The hasattr guard keeps the import from
+# failing when sys.stdout has been replaced by a stream object that does not
+# provide reconfigure().
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+class TitleProcessor:
+    def __init__(self, *doc_statistics) -> None:
+        """
+        Set up the title processor.
+        Parameters
+        ----------
+        *doc_statistics
+            optional; only the first positional argument is kept, as `self.doc_statistics`.
+            `_is_potential_title` looks up "most_common_font_type", "second_most_common_font_type",
+            "most_common_font_size" and "second_most_common_font_size" in it through `safe_get`.
+        """
+        # Always define the attribute: it is read unconditionally during title detection, so
+        # leaving it unset when no statistics are passed ends in an AttributeError later on.
+        # NOTE(review): the empty-dict fallback assumes `safe_get` (from para.commons) returns
+        # its default for a missing key of a dict - confirm against para.commons.
+        self.doc_statistics = doc_statistics[0] if doc_statistics else {}
+        self.nlp_model = NLPModels()
+        self.MAX_TITLE_LEVEL = 3  # not referenced in the visible part of this file
+        # Verbose-mode regex (compile with re.VERBOSE) for a numbering prefix at the start of
+        # a line followed by at least one more character. The alternatives cover parenthesised
+        # or right-parenthesised digits, upper/lower-case letters and Roman numerals (ASCII and
+        # full-width parentheses), dotted numbers such as "3.2.1 ", Chinese ordinals, "A.1 ",
+        # "A- ", "word: ", "第...章/节/..." headings and "I." / "1. " forms.
+        # The character classes of the parenthesised Chinese-ordinal alternative no longer
+        # contain "|": inside [...] it is a literal pipe, not an alternation.
+        self.numbered_title_pattern = r"""
+            ^                                 # 行首
+            (                                 # 开始捕获组
+                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
+                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
+                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
+                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
+                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
+                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
+                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
+                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
+                |[\(\（][一二三四五六七八九十百千]+[\)\）]\s*  # 中文括号内中文序号后跟空格，例如：（一）
+                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
+                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
+                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
+                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
+                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
+                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
+                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
+                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
+            )                                 # 结束捕获组
+            .+                                # 标题的其余部分
+        """
+    def _is_potential_title(
+        self,
+        curr_line,
+        prev_line,
+        prev_line_is_title,
+        next_line,
+        avg_char_width,
+        avg_char_height,
+        median_font_size,
+    ):
+        """
+        Decide whether a line is a title and whether it is an author/organisation list.
+        The decision combines font features (bold flag, font size and font type compared with
+        the document statistics and with the neighbouring lines), vertical spacing, numbering
+        prefixes, punctuation density and the entity category reported by the NLP model.
+        Parameters
+        ----------
+        curr_line : dict
+            current line; the keys "text", "bbox" and "spans" are read, and from every span
+            the keys "text", "font", "size", "flags" and "bbox"
+        prev_line : dict or None
+            previous line, None when there is none
+        prev_line_is_title : bool
+            whether the previous line was detected as a title (accepted but currently unused)
+        next_line : dict or None
+            next line, None when there is none
+        avg_char_width : float
+            average of char widths, forwarded to the left-alignment check
+        avg_char_height : float
+            average of line heights (accepted but currently unused)
+        median_font_size : float
+            median font size; the vertical spacing threshold is 1.25 times this value
+        Returns
+        -------
+        tuple of (bool, bool)
+            (is_title, is_author_or_org_list); (False, False) for a blank line.
+        """
+        def __is_bold_font_line(line):
+            """
+            Check if every span of the line is bold (bit 2**4 of the span flags).
+            A span whose text is empty or whitespace-only counts as not bold, so a line that
+            contains such a span is never reported as bold.
+            """
+            return all(span["text"].strip() and span["flags"] & 2**4 for span in line["spans"])
+        def __punctuation_ratio(line_text):
+            """
+            Return the share of characters that are neither word characters nor whitespace.
+            The ratio is taken over the unstripped text; an empty text yields 0.0.
+            """
+            if not line_text:
+                return 0.0
+            return len(re.findall(r"[^\w\s]", line_text)) / len(line_text)
+        def __is_punctuation_heavy(line_text):
+            """
+            Check if at least 10% of the line is punctuation, which suggests it is not a title.
+            """
+            # A line starting like "X.Y. Title" is exempt: its dots belong to the numbering.
+            if re.match(r"\b\d+\.\d+\..*\b", line_text.strip()):
+                return False
+            return __punctuation_ratio(line_text) >= 0.1
+        def __has_mixed_font_styles(spans):
+            """
+            Check if the spans use more than one font name (compared case-insensitively).
+            """
+            return len({span["font"].lower() for span in spans}) > 1
+        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
+            """
+            Check if the font type differs from that of every existing neighbour line.
+            A neighbour given as None is ignored; with no neighbours the result is True.
+            """
+            return all(
+                curr_line_font_type != other_font_type.lower()
+                for other_font_type in [prev_line_font_type, next_line_font_type]
+                if other_font_type is not None
+            )
+        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
+            """
+            Check if the font size exceeds 1.2 times that of every existing neighbour line.
+            A neighbour given as None is ignored; with no neighbours the result is True.
+            """
+            return all(
+                curr_line_font_size > other_font_size * 1.2
+                for other_font_size in [prev_line_font_size, next_line_font_size]
+                if other_font_size is not None
+            )
+        def __is_same_font_type_of_docAvg(curr_line_font_type):
+            """
+            Check if the font type equals the most common or the second most common font
+            type of the document statistics (compared case-insensitively).
+            """
+            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
+            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
+            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
+        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
+            """
+            Check if the font size is at least `ratio` times the reference font size.
+            The reference is the smaller of the most common and the second most common font
+            size of the document statistics; a missing statistic is looked up with default 0.
+            """
+            doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
+            doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
+            doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
+            return curr_line_font_size >= doc_avg_font_size * ratio
+        def __is_sufficient_spacing_above_and_below(curr_line_bbox, prev_line_bbox, next_line_bbox, median_font_size):
+            """
+            Check the vertical distance to the neighbouring lines.
+            The distance to a neighbour is the smaller of the offset between the two top edges
+            and the offset between the two bottom edges (bbox items 1 and 3). It is sufficient
+            when it exceeds 1.25 * median_font_size. A missing neighbour counts as sufficient.
+            Returns
+            -------
+            tuple of (bool, bool)
+                (sufficient_spacing_above, sufficient_spacing_below)
+            """
+            vertical_thres = 1.25 * median_font_size
+            _, y0, _, y1 = curr_line_bbox
+            sufficient_spacing_above = True
+            if prev_line_bbox:
+                vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
+                sufficient_spacing_above = vertical_spacing_above > vertical_thres
+            sufficient_spacing_below = True
+            if next_line_bbox:
+                vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
+                sufficient_spacing_below = vertical_spacing_below > vertical_thres
+            return (sufficient_spacing_above, sufficient_spacing_below)
+        def __is_word_list_line_by_rules(curr_line_text):
+            """
+            Check by regex if the line contains a name-like item: two capitalised Latin words,
+            or a run of 2 to 6 CJK characters, followed by a separator or the end of the text.
+            """
+            name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
+            return bool(re.search(name_list_pattern, curr_line_text))
+        def __is_numbered_title(curr_line_text):
+            """
+            Check if the line starts with one of the numbering prefixes of
+            `self.numbered_title_pattern`.
+            """
+            return bool(re.search(self.numbered_title_pattern, curr_line_text, re.VERBOSE))
+        def __is_end_with_ending_puncs(line_text):
+            """
+            Check if the line, ignoring trailing whitespace, ends with a sentence-ending mark.
+            """
+            end_puncs = (".", "?", "!", "。", "？", "！", "…")
+            # endswith() is also safe for an empty string, unlike indexing [-1].
+            return line_text.rstrip().endswith(end_puncs)
+        def __is_equation(line_text):
+            """
+            Check if the line contains a "$...$" span that includes "\\overline".
+            """
+            equation_reg = r"\$.*?\\overline.*?\$"  # to match interline equations
+            return bool(re.search(equation_reg, line_text))
+        def __is_title_by_len(text, max_length=200):
+            """
+            Check if the stripped text is at most `max_length` characters long.
+            """
+            return len(text.strip()) <= max_length
+        def __compute_line_font_type_and_size(line):
+            """
+            Return (font type, font size) of the widest span of the line.
+            Whitespace-only spans are skipped; the first span provides the defaults. The font
+            type is lower-cased. Raises IndexError when the line has no spans.
+            """
+            spans = line["spans"]
+            widest_span_width = 0
+            font_size = spans[0]["size"]  # default value, float type
+            font_type = spans[0]["font"].lower()  # default value, string type
+            for span in spans:
+                if span["text"].isspace():
+                    continue
+                span_width = span["bbox"][2] - span["bbox"][0]
+                if span_width > widest_span_width:
+                    widest_span_width = span_width
+                    font_size = span["size"]
+                    font_type = span["font"].lower()
+            return font_type, font_size
+        # ---- Basic features about the current line and its neighbours ----
+        curr_line_text = curr_line["text"]
+        # Blank lines are never titles. This is checked before any span is touched, and a
+        # pair is returned because the caller unpacks two values.
+        if not curr_line_text.strip():
+            return False, False
+        curr_line_bbox = curr_line["bbox"]
+        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        if prev_line:
+            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
+        else:
+            prev_line_font_type, prev_line_font_size = None, None
+        next_line_bbox = next_line["bbox"] if next_line else None
+        if next_line:
+            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
+        else:
+            next_line_font_type, next_line_font_size = None, None
+        # ---- Aggregated features about the current line ----
+        is_bold_font = __is_bold_font_line(curr_line)
+        # Despite its name, this flag is True when the size is at least 0.8 x the reference.
+        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
+        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
+        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
+        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
+        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
+        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"])
+        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
+        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
+        is_person_or_org_list_line_by_nlp = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text) in [
+            "PERSON",
+            "GPE",
+            "ORG",
+        ]
+        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
+            curr_line_font_size, prev_line_font_size, next_line_font_size
+        )
+        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
+            curr_line_font_type, prev_line_font_type, next_line_font_type
+        )
+        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, median_font_size
+        )
+        # ---- Further aggregated features (names starting with __ are for internal use) ----
+        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
+        )
+        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
+        is_a_left_inline_title = (
+            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
+        )
+        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
+        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
+        is_title_by_check_pre_and_next_line = (
+            (prev_line is not None or next_line is not None)
+            and has_sufficient_spaces_above
+            and has_sufficient_spaces_below
+            and is_potential_title_font
+        )
+        # The three neighbour checks are only ever used as one alternative.
+        is_title_by_neighbor_check = (
+            is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line
+        )
+        is_numbered_title = __is_numbered_title(curr_line_text) and (
+            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
+        )
+        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
+        # A line that is at least 90% punctuation carries no meaning.
+        is_not_only_no_meaning_symbols = not (__punctuation_ratio(curr_line_text) >= 0.9)
+        is_equation = __is_equation(curr_line_text)
+        is_title_by_len = __is_title_by_len(curr_line_text)
+        # Font combinations that occur several times in the decision below.
+        is_distinct_font_at_body_size = is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg
+        is_strong_title_font = is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg
+        # ---- Decide if the line is a title ----
+        is_title = (
+            is_not_end_with_ending_puncs  # not end with ending punctuation marks
+            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
+            and is_title_by_len  # is a title by length, default max length is 200
+            and not is_equation  # an interline equation should never be a title
+            and is_potential_title_font  # bold, or not smaller than the reference size, or an uncommon font type
+            and (  # the font must stand out in at least one of these ways
+                is_distinct_font_at_body_size
+                or is_strong_title_font
+                or (is_much_larger_font_than_doc_avg and is_title_by_neighbor_check)
+                or (is_font_size_little_less_than_doc_avg and is_bold_font and is_title_by_neighbor_check)
+            )
+            and (  # and the content must not look like a name list or an inline title
+                (
+                    not is_person_or_org_list_line_by_nlp
+                    and (is_much_larger_font_than_doc_avg or is_distinct_font_at_body_size)
+                )
+                or (
+                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
+                    and not is_a_left_inline_title
+                    and not is_punctuation_heavy
+                    and is_title_by_neighbor_check
+                )
+                or (is_person_or_org_list_line_by_nlp and is_strong_title_font)
+                or (is_numbered_title and not is_a_left_inline_title)
+            )
+        )
+        # A punctuation-heavy person/organisation line that passes a neighbour check but is
+        # not a title is flagged as an author or organisation list.
+        is_author_or_org_list = (
+            is_person_or_org_list_line_by_nlp
+            and is_punctuation_heavy
+            and is_title_by_neighbor_check
+            and not is_title
+        )
+        return is_title, is_author_or_org_list
+    def _detect_block_title(self, input_block):
+        """
+        Run '_is_potential_title' on every line of a paragraph block, in order.
+        Each line dict receives two keys: 'is_title' and 'is_author_or_org_list'. A falsy
+        detection result is stored as False. The block is modified in place and returned.
+        """
+        lines = input_block["lines"]
+        last_index = len(lines) - 1
+        previous_was_title = False
+        for idx, line in enumerate(lines):
+            # Neighbours are None at the block boundaries.
+            line_before = None if idx == 0 else lines[idx - 1]
+            line_after = None if idx >= last_index else lines[idx + 1]
+            title_flag, author_org_flag = self._is_potential_title(
+                line,
+                line_before,
+                previous_was_title,
+                line_after,
+                input_block["avg_char_width"],
+                input_block["avg_char_height"],
+                input_block["median_font_size"],
+            )
+            line["is_title"] = title_flag if title_flag else False
+            # Remember the outcome for the next iteration.
+            previous_was_title = True if title_flag else False
+            line["is_author_or_org_list"] = author_org_flag if author_org_flag else False
+        return input_block
+    def batch_process_blocks_detect_titles(self, pdf_dic):
+        """
+        This function batch process the blocks to detect titles.
+        Parameters
+        ----------
+        pdf_dict : dict
+            result dictionary
+        Returns
+        -------
+        pdf_dict : dict
+            result dictionary
+        """
+        num_titles = 0
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "para_blocks" in blocks.keys():
+                    para_blocks = blocks["para_blocks"]
+                    all_single_line_blocks = []
+                    for block in para_blocks:
+                        if len(block["lines"]) == 1:
+                            all_single_line_blocks.append(block)
+                    new_para_blocks = []
+                    if not len(all_single_line_blocks) == len(para_blocks):  # Not all blocks are single line blocks.
+                        for para_block in para_blocks:
+                            new_block = self._detect_block_title(para_block)
+                            new_para_blocks.append(new_block)
+                            num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
+                    else:  # All blocks are single line blocks.
+                        for para_block in para_blocks:
+                            new_para_blocks.append(para_block)
+                            num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
+                    para_blocks = new_para_blocks
+                blocks["para_blocks"] = para_blocks
+                for para_block in para_blocks:
+                    all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
+                    para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
+                    if (
+                        all_titles and para_text_len < 200
+                    ):  # total length of the paragraph is less than 200, more than this should not be a title
+                        para_block["is_block_title"] = 1
+                    else:
+                        para_block["is_block_title"] = 0
+                    all_name_or_org_list_to_be_removed = all(
+                        safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
+                    )
+                    if all_name_or_org_list_to_be_removed and page_id == "page_0":
+                        para_block["is_block_an_author_or_org_list"] = 1
+                    else:
+                        para_block["is_block_an_author_or_org_list"] = 0
+        pdf_dic["statistics"]["num_titles"] = num_titles
+        return pdf_dic
+    def __determine_size_based_level(self, title_blocks):
+        """
+        This function determines the title level based on the font size of the title.
+        Parameters
+        ----------
+        title_blocks : list
+        Returns
+        -------
+        title_blocks : list
+        """
+        font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
+        # Use the mean and std of font sizes to remove extreme values
+        mean_font_size = np.mean(font_sizes)
+        std_font_size = np.std(font_sizes)
+        min_extreme_font_size = mean_font_size - std_font_size  # type: ignore
+        max_extreme_font_size = mean_font_size + std_font_size  # type: ignore
+        # Compute the threshold for title level
+        middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
+        if middle_font_sizes.size > 0:
+            middle_mean_font_size = np.mean(middle_font_sizes)
+            level_threshold = middle_mean_font_size
+        else:
+            level_threshold = mean_font_size
+        for tb in title_blocks:
+            title_block = tb["block"]
+            title_font_size = safe_get(title_block, "block_font_size", 0)
+            current_level = 1  # Initialize title level, the biggest level is 1
+            # print(f"Before adjustment by font size, {current_level}")
+            if title_font_size >= max_extreme_font_size:
+                current_level = 1
+            elif title_font_size <= min_extreme_font_size:
+                current_level = 3
+            elif float(title_font_size) >= float(level_threshold):
+                current_level = 2
+            else:
+                current_level = 3
+            # print(f"After adjustment by font size, {current_level}")
+            title_block["block_title_level"] = current_level
+        return title_blocks
+    def batch_process_blocks_recog_title_level(self, pdf_dic):
+        title_blocks = []
+        # Collect all titles
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = blocks.get("para_blocks", [])
+                for block in para_blocks:
+                    if block.get("is_block_title"):
+                        title_obj = {"page_id": page_id, "block": block}
+                        title_blocks.append(title_obj)
+        # Determine title level
+        if title_blocks:
+            # Determine title level based on font size
+            title_blocks = self.__determine_size_based_level(title_blocks)
+        return pdf_dic
--- a/pdf2json_infer.py
+++ b/pdf2json_infer.py
+import sys
+from typing import Tuple
+import os
+import click
+import boto3, json
+from botocore.config import Config
+from libs.commons import fitz
+from loguru import logger
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+# sys.path.insert(0, "/mnt/petrelfs/ouyanglinke/code-clean/")
+# print(sys.path)
+from validation import cal_edit_distance, format_gt_bbox, label_match, detect_val
+# from pdf2text_recogFigure_20231107 import parse_images        # 获取figures的bbox
+# from pdf2text_recogTable_20231107 import parse_tables         # 获取tables的bbox
+# from pdf2text_recogEquation_20231108 import parse_equations    # 获取equations的bbox
+# from pdf2text_recogTitle_20231113 import parse_titles           # 获取Title的bbox
+# from pdf2text_recogPara import parse_blocks_per_page    
+# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
+from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
+from pdf2text_recogFigure import parse_images          # 获取figures的bbox
+from pdf2text_recogTable import parse_tables           # 获取tables的bbox
+from pdf2text_recogEquation import parse_equations     # 获取equations的bbox
+from pdf2text_recogTitle import parse_titles           # 获取titles的bbox
+from pdf2text_recogHeader import parse_headers         # 获取headers的bbox
+from pdf2text_recogPageNo import parse_pageNos         # 获取pageNos的bbox
+# from pdf2text_recogFootnote import parse_footnotes     # 获取footnotes的bbox
+from pdf2text_recogFooter import parse_footers         # 获取footers的bbox
+from pdf2text_evaluatePdfLayout import evaluate_pdf_layout # 评估页面的Layout是否是规整的。
+from pdf2text_recogPara import process_blocks_per_page, postprocess_paras_pipeline
+from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
+def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):
+    """
+    从第page_num页的page中，根据bbox进行裁剪出一张jpg图片，返回图片路径
+    save_path：需要同时支持s3和本地, 图片存放在save_path下，文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
+    """
+    # 拼接路径
+    image_save_path = join_path(save_parent_path, f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg")
+    try:
+        # 将坐标转换为fitz.Rect对象
+        rect = fitz.Rect(*bbox)
+        # 配置缩放倍数为3倍
+        zoom = fitz.Matrix(3, 3)
+        # 截取图片
+        pix = page.get_pixmap(clip=rect, matrix=zoom)
+        # 打印图片文件名
+        # print(f"Saved {image_save_path}")
+        if image_save_path.startswith("s3://"):
+            ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                            config=Config(s3={'addressing_style': addressing_style}))
+            bucket_name, bucket_key = parse_bucket_key(image_save_path)
+            # 将字节流上传到s3
+            cli.upload_fileobj(pix.tobytes(output='jpeg', jpg_quality=95), bucket_name, bucket_key)
+        else:
+            # 保存图片到本地
+            # 先检查一下image_save_path的父目录是否存在，如果不存在，就创建
+            parent_dir = os.path.dirname(image_save_path)
+            if not os.path.exists(parent_dir):
+                os.makedirs(parent_dir)
+            pix.save(image_save_path, jpg_quality=95)
+            # 为了直接能在markdown里看，这里把地址改为相对于mardown的地址
+            pth = Path(image_save_path)
+            image_save_path =  f"{pth.parent.name}/{pth.name}"
+            return image_save_path
+    except Exception as e:
+        logger.exception(e)
+        return image_save_path
+def get_images_by_bboxes(book_name:str, page_num:int, page: fitz.Page, save_path:str, s3_profile:str, image_bboxes:list, table_bboxes:list, equation_inline_bboxes:list, equation_interline_bboxes:list) -> dict:
+    """
+    返回一个dict, key为bbox, 值是图片地址
+    """
+    ret = {}
+    # 图片的保存路径组成是这样的： {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
+    image_save_path = join_path(save_path, book_name, "images") 
+    table_save_path = join_path(save_path, book_name, "tables") 
+    equation_inline_save_path = join_path(save_path, book_name, "equations_inline")
+    equation_interline_save_path = join_path(save_path, book_name, "equation_interline")
+    for bbox in image_bboxes:
+        image_path = cut_image(bbox, page_num, page, image_save_path, s3_profile)
+        ret[bbox] = (image_path, "image") # 第二个元素是"image"，表示是图片
+    for bbox in table_bboxes:
+        image_path = cut_image(bbox, page_num, page, table_save_path, s3_profile)
+        ret[bbox] = (image_path, "table")
+    # 对公式目前只截图，不返回
+    for bbox in equation_inline_bboxes:
+        cut_image(bbox, page_num, page, equation_inline_save_path, s3_profile)
+    for bbox in equation_interline_bboxes:
+        cut_image(bbox, page_num, page, equation_interline_save_path, s3_profile)
+    return ret
+def reformat_bboxes(images_box_path_dict:list, paras_dict:dict):
+    """
+    把bbox重新组装成一个list，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是段落的内容
+    """
+    all_bboxes = []
+    for bbox, image_info in images_box_path_dict.items():
+        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], image_info, None, None, 'image'])
+    paras_dict = paras_dict[f"page_{paras_dict['page_id']}"]
+    for block_id, kvpair in paras_dict.items():
+        bbox = kvpair['bbox']
+        content = kvpair
+        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], content, None, None, 'text'])
+    return all_bboxes
+def concat2markdown(all_bboxes:list):
+    """
+    对排序后的bboxes拼接内容
+    """
+    content_md = ""
+    for box in all_bboxes:
+        content_type = box[CONTENT_TYPE_IDX]
+        if content_type == 'image':
+            image_type = box[CONTENT_IDX][1]
+            image_path = box[CONTENT_IDX][0]
+            content_md += f"![{image_type}]({image_path})"
+            content_md += "\n\n"
+        elif content_type == 'text': # 组装文本
+            paras = box[CONTENT_IDX]['paras']
+            text_content = ""
+            for para_id, para in paras.items():# 拼装内部的段落文本
+                text_content += para['text']
+                text_content += "\n\n"
+            content_md += text_content
+        else:
+            raise Exception(f"ERROR: {content_type} is not supported!")
+    return content_md
+def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path:str, pdf_model_profile:str, save_path: str, page_num: int):
+    """
+    """
+    pth = Path(s3_pdf_path)
+    book_name = pth.name
+    #book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
+    res_dir_path = None
+    exclude_bboxes = []
+    # text_content_save_path = f"{save_path}/{book_name}/book.md"
+    # metadata_save_path = f"{save_path}/{book_name}/metadata.json"  
+    try:
+        pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
+        pdf_docs = fitz.open("pdf", pdf_bytes)
+        page_id = page_num - 1
+        page = pdf_docs[page_id] # 验证集只需要读取特定页面即可
+        model_output_json = join_path(pdf_model_path, f"page_{page_num}.json") # 模型输出的页面编号从1开始的
+        json_from_docx = read_file(model_output_json, pdf_model_profile) # TODO 这个读取方法名字应该改一下，避免语义歧义
+        json_from_docx_obj = json.loads(json_from_docx)
+        # 解析图片
+        image_bboxes = parse_images(page_id, page, json_from_docx_obj)
+        # 解析表格
+        table_bboxes = parse_tables(page_id, page, json_from_docx_obj)
+        # 解析公式
+        equations_interline_bboxes, equations_inline_bboxes = parse_equations(page_id, page, json_from_docx_obj)
+        # # 解析标题
+        # title_bboxs = parse_titles(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
+        # # 解析页眉
+        # header_bboxs = parse_headers(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
+        # # 解析页码
+        # pageNo_bboxs = parse_pageNos(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
+        # # 解析脚注
+        # footnote_bboxs = parse_footnotes(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
+        # # 解析页脚
+        # footer_bboxs = parse_footers(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
+        # # 评估Layout是否规整、简单
+        # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
+        # 把图、表、公式都进行截图，保存到本地，返回图片路径作为内容
+        images_box_path_dict = get_images_by_bboxes(book_name, page_id, page, save_path, s3_pdf_profile, image_bboxes, table_bboxes, equations_inline_bboxes,
+                                                    equations_interline_bboxes)  # 只要表格和图片的截图
+        # 解析文字段落
+        footer_bboxes = []
+        header_bboxes = []
+        exclude_bboxes = image_bboxes + table_bboxes
+        paras_dict = process_blocks_per_page(page, page_id, image_bboxes, table_bboxes, equations_inline_bboxes, equations_interline_bboxes, footer_bboxes, header_bboxes)
+        # paras_dict = postprocess_paras_pipeline(paras_dict)
+        # 最后一步，根据bbox进行从左到右，从上到下的排序，之后拼接起来, 排序
+        all_bboxes = reformat_bboxes(images_box_path_dict, paras_dict)  # 由于公式目前还没有，所以equation_bboxes是None，多数存在段落里，暂时不解析
+        # 返回的是一个数组，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, type], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是段落的内容
+        # sorted_bboxes = bbox_sort(all_bboxes)
+        # markdown_text = concat2markdown(sorted_bboxes)
+        # parent_dir = os.path.dirname(text_content_save_path)
+        # if not os.path.exists(parent_dir):
+        #     os.makedirs(parent_dir)
+        # with open(text_content_save_path, "a") as f:
+        #     f.write(markdown_text)
+        #     f.write(chr(12)) #换页符   
+        # end for
+        # 写一个小的json,记录元数据
+        # metadata = {"book_name": book_name, "pdf_path": s3_pdf_path, "pdf_model_path": pdf_model_path, "save_path": save_path}
+        # with open(metadata_save_path, "w") as f:
+        #     json.dump(metadata, f, ensure_ascii=False, indent=4)
+        return all_bboxes
+    except Exception as e:
+        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
+        logger.exception(e)
+# @click.command()
+# @click.option('--pdf-file-sub-path', help='s3上pdf文件的路径')
+# @click.option('--save-path', help='解析出来的图片，文本的保存父目录')
+def validation(validation_dataset: str, pdf_bin_file_profile: str, pdf_model_dir: str, pdf_model_profile: str, save_path: str):
+    #pdf_bin_file_path = "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
+    # pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
+    # pdf_model_parent_dir = "s3://llm-pdf-text/layout_det/scihub/"
+    # p = Path(pdf_file_sub_path)
+    # pdf_parent_path = p.parent
+    # pdf_file_name = p.name   # pdf文件名字，含后缀
+    # pdf_bin_file_path  = join_path(pdf_bin_file_parent_path, pdf_parent_path)
+    with open(validation_dataset, 'r') as f:
+        samples = json.load(f)
+    labels = []
+    det_res = []
+    edit_distance_list = []
+    for sample in tqdm(samples):
+        pdf_name = sample['pdf_name']
+        s3_pdf_path = sample['s3_path']
+        page_num = sample['page']
+        gt_order = sample['order']
+        pre = main(s3_pdf_path, pdf_bin_file_profile, join_path(pdf_model_dir, pdf_name), pdf_model_profile, save_path, page_num)
+        pre_dict_list = []
+        for item in pre:
+            pre_sample = {
+                'box': [item[0],item[1],item[2],item[3]],
+                'type': item[7],
+                'score': 1
+            }
+            pre_dict_list.append(pre_sample)
+        det_res.append(pre_dict_list)
+        match_change_dict = {   # 待确认
+            "figure": "image",
+            "svg_figure": "image",
+            "inline_fomula": "equations_inline",
+            "fomula": "equation_interline",
+            "figure_caption": "text",
+            "table_caption": "text",
+            "fomula_caption": "text"
+        }
+        gt_annos = sample['annotations']
+        matched_label = label_match(gt_annos, match_change_dict)
+        labels.append(matched_label)
+        # 判断排序函数的精度
+        # 目前不考虑caption与图表相同序号的问题
+        ignore_category = ['abandon', 'figure_caption', 'table_caption', 'formula_caption'] 
+        gt_bboxes = format_gt_bbox(gt_annos, ignore_category)
+        sorted_bboxes = bbox_sort(gt_bboxes)
+        edit_distance = cal_edit_distance(sorted_bboxes)
+        edit_distance_list.append(edit_distance)
+    label_classes = ["image", "text", "table", "equation_interline"]
+    detect_matrix = detect_val(labels, det_res, label_classes)
+    print('detect_matrix', detect_matrix)
+    edit_distance_mean = np.mean(edit_distance_list)
+    print('edit_distance_mean', edit_distance_mean)
+if __name__ == '__main__':
+    # 输入可以用以下命令生成批量pdf
+    # aws s3 ls s3://llm-pdf-text/layout_det/scihub/ --profile langchao | tail -n 10 | awk '{print "s3://llm-pdf-text/layout_det/scihub/"$4}' | xargs -I{}  aws s3 ls {} --recursive --profile langchao  | awk '{print substr($4,19)}' | parallel -j 1 echo {//} | sort -u
+    pdf_bin_file_profile = "outsider"
+    pdf_model_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
+    pdf_model_profile = "langchao"
+    # validation_dataset = "/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json"
+    validation_dataset = "/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_subset.json" # 测试
+    save_path = "/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_val_result"
+    validation(validation_dataset, pdf_bin_file_profile, pdf_model_dir, pdf_model_profile, save_path)
--- a/pdf2md.py
+++ b/pdf2md.py
+import os
+import sys
+from pathlib import Path
+import click
+import json
+from loguru import logger
+from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
+from mkcontent import mk_mm_markdown, mk_nlp_markdown
+from pdf_parse_by_model import parse_pdf_by_model
+def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
+    """ """
+    pth = Path(s3_pdf_path)
+    book_name = pth.name
+    # book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "..","tmp", "unittest") 
+    save_path = join_path(save_tmp_path, "md")
+    text_content_save_path = f"{save_path}/{book_name}/book.md"
+    # metadata_save_path = f"{save_path}/{book_name}/metadata.json"
+    try:
+        paras_dict = parse_pdf_by_model(
+            s3_pdf_path, s3_pdf_profile, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
+        )
+        parent_dir = os.path.dirname(text_content_save_path)
+        if not os.path.exists(parent_dir):
+            os.makedirs(parent_dir)
+        if not paras_dict.get('need_drop'):
+            markdown_content = mk_mm_markdown(paras_dict)
+        else:
+            markdown_content = paras_dict['drop_reason']
+        with open(text_content_save_path, "w", encoding="utf-8") as f:
+            f.write(markdown_content)
+    except Exception as e:
+        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
+        logger.exception(e)
+@click.command()
+@click.option("--pdf-file-path", help="s3上pdf文件的路径")
+@click.option("--save-path", help="解析出来的图片，文本的保存父目录")
+def main_shell(pdf_file_path: str, save_path: str):
+    # pdf_bin_file_path = "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
+    pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
+    pdf_bin_file_profile = "s2"
+    pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
+    pdf_model_profile = "langchao"
+    p = Path(pdf_file_path)
+    pdf_parent_path = p.parent
+    pdf_file_name = p.name  # pdf文件名字，含后缀
+    pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
+    pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)
+    main(
+        join_path(pdf_bin_file_path, pdf_file_name),
+        pdf_bin_file_profile,
+        join_path(pdf_model_dir, pdf_file_name),
+        pdf_model_profile,
+        save_path,
+    )
+@click.command()
+@click.option("--pdf-dir", help="s3上pdf文件的路径")
+@click.option("--model-dir", help="s3上pdf文件的路径")
+@click.option("--start-page-num", default=0, help="从第几页开始解析")
+def main_shell2(pdf_dir: str, model_dir: str,start_page_num: int):
+    # 先扫描所有的pdf目录里的文件名字
+    pdf_dir = Path(pdf_dir)
+    model_dir = Path(model_dir)
+    if pdf_dir.is_file():
+        pdf_file_names = [pdf_dir.name]
+        pdf_dir = pdf_dir.parent
+    else:
+        pdf_file_names = [f.name for f in pdf_dir.glob("*.pdf")]
+    for pdf_file in pdf_file_names:
+        pdf_file_path = os.path.join(pdf_dir, pdf_file)
+        model_file_path = os.path.join(model_dir, pdf_file)
+        main(pdf_file_path, None, model_file_path, None, start_page_num)
+if __name__ == "__main__":  # script entry point
+    main_shell2()  # click command: reads --pdf-dir, --model-dir and --start-page-num from the command line
--- a/pdf2text_evaluatePdfLayout.py
+++ b/pdf2text_evaluatePdfLayout.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
+    # 计算两个rect，重叠面积各占2个rect面积的比例
+    if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
+        return 0, 0
+    square_1 = (R1 - L1) * (D1 - U1)
+    square_2 = (R2 - L2) * (D2 - U2)
+    if square_1 == 0 or square_2 == 0:
+        return 0, 0
+    square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
+    return square_overlap / square_1, square_overlap / square_2
+def evaluate_pdf_layout(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #--------- 通过json_from_DocXchain来获取 title ---------#
+    title_bbox_from_DocXChain = []
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    # {0: 'title',  # 标题
+    # 1: 'figure', # 图片
+    #  2: 'plain text',  # 文本
+    #  3: 'header',      # 页眉
+    #  4: 'page number', # 页码
+    #  5: 'footnote',    # 脚注
+    #  6: 'footer',      # 页脚
+    #  7: 'table',       # 表格
+    #  8: 'table caption',  # 表格描述
+    #  9: 'figure caption', # 图片描述
+    #  10: 'equation',      # 公式
+    #  11: 'full column',   # 单栏
+    #  12: 'sub column',    # 多栏
+    #  13: 'embedding',     # 嵌入公式
+    #  14: 'isolated'}      # 单行公式
+    LOSS_THRESHOLD = 2000               # 经验值
+    fullColumn_bboxs = []
+    subColumn_bboxs = []
+    plainText_bboxs = []
+    #### read information of plain text
+    for xf in xf_json['layout_dets']:
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 2:
+            plainText_bboxs.append((L, U, R, D))
+    #### read information of column
+    for xf in xf_json['subfield_dets']:
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 11:
+            fullColumn_bboxs.append((L, U, R, D))
+        elif xf['category_id'] == 12:
+            subColumn_bboxs.append((L, U, R, D))
+    curPage_loss = 0        # 当前页的loss
+    fail_cnt = 0            # Text文本块没被圈到的情形。
+    for L, U, R, D in plainText_bboxs:
+        find = False
+        for L2, U2, R2, D2 in (fullColumn_bboxs + subColumn_bboxs):
+            ratio_1, _ = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, L2, U2, R2, D2)
+            if ratio_1 >= 0.9:
+                loss_1 = (L + R) / 2 - (L2 + R2) / 2
+                loss_2 = L - L2
+                cur_loss = min(abs(loss_1), abs(loss_2))
+                curPage_loss += cur_loss
+                find = True
+                break
+        if find == False:
+            fail_cnt += 1
+    isSimpleLayout_flag = False
+    if fail_cnt == 0 and len(fullColumn_bboxs) <= 1 and len(subColumn_bboxs) <= 2:
+        if curPage_loss <= LOSS_THRESHOLD:
+            isSimpleLayout_flag  = True
+    return isSimpleLayout_flag, len(fullColumn_bboxs), len(subColumn_bboxs), curPage_loss
--- a/pdf2text_getNumberOfColumn.py
+++ b/pdf2text_getNumberOfColumn.py
+from typing import List, Tuple
+from libs.commons import fitz
+def show_image(item, title=""):
+    """Display a pixmap.
+    Just to display Pixmap image of "item" - ignore the man behind the curtain.
+    Args:
+        item: any PyMuPDF object having a "get_pixmap" method.
+        title: a string to be used as image title
+    Generates an RGB Pixmap from item using a constant DPI and using matplotlib
+    to show it inline of the notebook.
+    """
+    DPI = 150  # use this resolution
+    import numpy as np
+    import matplotlib.pyplot as plt
+    # %matplotlib inline
+    pix = item.get_pixmap(dpi=DPI)
+    img = np.ndarray([pix.h, pix.w, 3], dtype=np.uint8, buffer=pix.samples_mv)
+    plt.figure(dpi=DPI)  # set the figure's DPI
+    plt.title(title)  # set title of image
+    _ = plt.imshow(img, extent=(0, pix.w * 72 / DPI, pix.h * 72 / DPI, 0))
+def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
+    # 计算两个line，重叠line各占2个line长度的比例
+    if max(L1, L2) > min(R1, R2):
+        return 0, 0
+    if L1 == R1 or L2 == R2:
+        return 0, 0
+    overlap_line = min(R1, R2) - max(L1, L2)
+    return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
+def get_targetAxis_and_splitAxis(page_ID: int, page: fitz.Page, columnNumber: int, textBboxs: List[(float, float, float, float)]) -> (List[float], List[float]):
+    """
+    param: page: fitz解析出来的格式
+    param: columnNumber: Text的列数
+    param: textBboxs: 文本块list。 [(L, U, R, D), ... ]
+    return: 
+    """
+    INF = 10 ** 9
+    pageL, pageU, pageR, pageD = INF, INF, 0, 0
+    for L, U, R, D in textBboxs:
+        assert L <= R and U <= D
+        pageL = min(pageL, L)
+        pageR = max(pageR, R)
+        pageU = min(pageU, U)
+        pageD = max(pageD, D)
+    pageWidth = pageR - pageL
+    pageHeight = pageD - pageU
+    pageL -= pageWidth / 10  # 10是经验值
+    pageR += pageWidth / 10
+    pageU -= pageHeight / 10
+    pageD += pageHeight / 10
+    pageWidth = pageR - pageL
+    pageHeight = pageD - pageU
+    x_targetAxis = []
+    x_splitAxis = []
+    for i in range(0, columnNumber * 2 + 1):
+        if i & 1:
+            x_targetAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
+        else:
+            x_splitAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
+    # # 可视化：分列的外框
+    # path_bbox = []
+    # N = len(x_targetAxis)
+    # for i in range(N):
+    #     L, R = x_splitAxis[i], x_splitAxis[i + 1]
+    #     path_bbox.append((L, pageU, R, pageD))
+    # shape = page.new_shape()
+    # # iterate over the bboxes
+    # color_map = [fitz.pdfcolor["red"], fitz.pdfcolor["blue"], fitz.pdfcolor["yellow"], fitz.pdfcolor["black"], fitz.pdfcolor["green"], fitz.pdfcolor["brown"]]
+    # for i, rect in enumerate(path_bbox):
+    #     # if i < 20:
+    #     #     continue
+    #     shape.draw_rect(rect)  # draw a border
+    #     shape.insert_text(Point(rect[0], rect[1])+(5, 15), str(i), color=fitz.pdfcolor["blue"])
+    #     shape.finish(color=color_map[i%len(color_map)])
+    #     # shape.finish(color=fitz.pdfcolor["blue"])
+    #     shape.commit()  # store to the page
+    #     # if i == 3:
+    #     #     print(rect)
+    #     #     break
+    #     # print(rect)
+    # show_image(page, f"Table & Header BBoxes")            
+    return x_targetAxis, x_splitAxis
+def calculate_loss(page_ID: int, x_targetAxis: List[float], x_splitAxis: List[float], textBboxs: List[(float, float, float, float)]) -> (float, bool):
+    INF = 10 ** 9
+    # page_artbox = page.artbox
+    # pageL, pageU, pageR, pageD = page_artbox[0], page_artbox[1], page_artbox[2], page_artbox[3]
+    pageL, pageU, pageR, pageD = INF, INF, 0, 0
+    for L, U, R, D in textBboxs:
+        assert L <= R and U <= D
+        pageL = min(pageL, L)
+        pageR = max(pageR, R)
+        pageU = min(pageU, U)
+        pageD = max(pageD, D)
+    pageWidth = pageR - pageL
+    pageHeight = pageD - pageU
+    pageL -= pageWidth / 10
+    pageR += pageWidth / 10
+    pageU -= pageHeight / 10
+    pageD += pageHeight / 10
+    pageWidth = pageR - pageL
+    pageHeight = pageD - pageU
+    col_N = len(x_targetAxis)  # 列数
+    col_texts_mid = [[] for _ in range(col_N)]
+    col_texts_LR = [[] for _ in range(col_N)]
+    oneLocateLoss_mid = 0
+    oneLocateLoss_LR = 0
+    oneLocateCnt_mid = 0  # 完美在一列中的个数
+    oneLocateCnt_LR = 0
+    oneLocateSquare_mid = 0.0  # 完美在一列的面积
+    oneLocateSquare_LR = 0.0
+    multiLocateLoss_mid = 0
+    multiLocateLoss_LR = 0
+    multiLocateCnt_mid = 0  # 在多列中的个数
+    multiLocateCnt_LR = 0
+    multiLocateSquare_mid = 0.0  # 在多列中的面积
+    multiLocateSquare_LR = 0.0
+    allLocateLoss_mid = 0
+    allLocateLoss_LR = 0
+    allLocateCnt_mid = 0  # 横跨页面的大框的个数
+    allLocateCnt_LR = 0
+    allLocateSquare_mid = 0.0  # 横跨整个页面的个数
+    allLocateSquare_LR = 0.0
+    isSimpleCondition = True  # 就1个。2种方式，只要有一种情况不规整，就是不规整。
+    colID_Textcnt_mid = [0 for _ in range(col_N)]  # 每一列中有多少个Text块，根据mid判断的
+    colID_Textcnt_LR = [0 for _ in range(col_N)]  # 每一列中有多少个Text块，根据区间边界判断
+    allLocateBboxs_mid = []  # 跨整页的，bbox
+    allLocateBboxs_LR = []
+    non_allLocateBboxs_mid = []
+    non_allLocateBboxs_LR = []  # 不在单独某一列，但又不是全列
+    for L, U, R, D in textBboxs:
+        if D - U < 40:  # 现在还没拼接好。先简单这样过滤页眉。也会牺牲一些很窄的长条
+            continue
+        if R - L < 40:
+            continue
+        located_cols_mid = []
+        located_cols_LR = []
+        for col_ID in range(col_N):
+            if col_N == 1:
+                located_cols_mid.append(col_ID)
+                located_cols_LR.append(col_ID)
+            else:
+                if L <= x_targetAxis[col_ID] <= R:
+                    located_cols_mid.append(col_ID)
+                if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
+                    located_cols_LR.append(col_ID)
+        if len(located_cols_mid) == col_N:
+            allLocateBboxs_mid.append((L, U, R, D))
+        else:
+            non_allLocateBboxs_mid.append((L, U, R, D))
+        if len(located_cols_LR) == col_N:
+            allLocateBboxs_LR.append((L, U, R, D))
+        else:
+            non_allLocateBboxs_LR.append((L, U, R, D))
+    allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    non_allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    non_allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    # --------------------判断，是不是有标题类的小块，掺杂在一列的pdf页面里。-------------#
+    isOneClumn = False
+    under_cnt = 0
+    under_square = 0.0
+    before_cnt = 0
+    before_square = 0.0
+    for nL, nU, nR, nD in non_allLocateBboxs_mid:
+        cnt = 0
+        for L, U, R, D in allLocateBboxs_mid:
+            if nD <= U:
+                cnt += 1
+        if cnt >= 1:
+            before_cnt += cnt
+            before_square += (R - L) * (D - U) * cnt
+        else:
+            under_cnt += 1
+            under_square += (R - L) * (D - U) * cnt
+    if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
+        isOneClumn = True
+    if isOneClumn == True and col_N != 1:
+        return INF, False
+    if isOneClumn == True and col_N == 1:
+        return 0, True
+    #### 根据边界的统计情况，再判断一次
+    isOneClumn = False
+    under_cnt = 0
+    under_square = 0.0
+    before_cnt = 0
+    before_square = 0.0
+    for nL, nU, nR, nD in non_allLocateBboxs_LR:
+        cnt = 0
+        for L, U, R, D in allLocateBboxs_LR:
+            if nD <= U:
+                cnt += 1
+        if cnt >= 1:
+            before_cnt += cnt
+            before_square += (R - L) * (D - U) * cnt
+        else:
+            under_cnt += 1
+            under_square += (R - L) * (D - U) * cnt
+    if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
+        isOneClumn = True
+    if isOneClumn == True and col_N != 1:
+        return INF, False
+    if isOneClumn == True and col_N == 1:
+        return 0, True
+    for L, U, R, D in textBboxs:
+        assert L < R and U < D, 'There is an error on bbox of text when calculate loss!'
+        # 简单排除页眉、迷你小块
+        # if (D - U) < pageHeight / 15 < 40 or (R - L) < pageWidth / 8:
+        if (D - U) < 40:
+            continue
+        if (R - L) < 40:
+            continue
+        mid = (L + R) / 2
+        located_cols_mid = []  # 在哪一列里，根据中点来判断
+        located_cols_LR = []  # 在哪一列里，根据边界判断
+        for col_ID in range(col_N):
+            if col_N == 1:
+                located_cols_mid.append(col_ID)
+            else:
+                # 根据中点判断
+                if L <= x_targetAxis[col_ID] <= R:
+                    located_cols_mid.append(col_ID)
+                # 根据边界判断
+                if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
+                    located_cols_LR.append(col_ID)
+        ## 1列的情形
+        if col_N == 1:
+            oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
+            # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
+            oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_mid[0]]) * (D - U) * (R - L)
+            oneLocateCnt_mid += 1
+            oneLocateSquare_mid += (D - U) * (R - L)
+        ## 多列的情形
+        else:
+            ######## 根据mid判断
+            if len(located_cols_mid) == 1:
+                oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
+                # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
+                oneLocateCnt_mid += 1
+                oneLocateSquare_mid += (D - U) * (R - L)
+            elif 1 <= len(located_cols_mid) < col_N:
+                ll, rr = located_cols_mid[0], located_cols_mid[-1]
+                # multiLocateLoss_mid += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
+                multiLocateLoss_mid += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
+                # multiLocateLoss_mid += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
+                multiLocateCnt_mid += 1
+                multiLocateSquare_mid += (D - U) * (R - L)
+                isSimpleCondition = False
+            else:
+                allLocateLoss_mid += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
+                allLocateCnt_mid += 1
+                allLocateSquare_mid += (D - U) * (R - L)
+                isSimpleCondition = False
+            ######## 根据区间的边界判断
+            if len(located_cols_LR) == 1:
+                oneLocateLoss_LR += abs(mid - x_targetAxis[located_cols_LR[0]]) * (D - U) * (R - L)
+                # oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_LR[0]]) * (D - U) * (R - L)
+                oneLocateCnt_LR += 1
+                oneLocateSquare_LR += (D - U) * (R - L)
+            elif 1 <= len(located_cols_LR) < col_N:
+                ll, rr = located_cols_LR[0], located_cols_LR[-1]
+                # multiLocateLoss_LR += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
+                multiLocateLoss_LR += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
+                # multiLocateLoss_LR += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
+                multiLocateCnt_LR += 1
+                multiLocateSquare_LR += (D - U) * (R - L)
+                isSimpleCondition = False
+            else:
+                allLocateLoss_LR += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
+                allLocateCnt_LR += 1
+                allLocateSquare_LR += (D - U) * (R - L)
+                isSimpleCondition = False
+    tot_TextCnt = oneLocateCnt_mid + multiLocateCnt_mid + allLocateCnt_mid
+    tot_TextSquare = oneLocateSquare_mid + multiLocateSquare_mid + allLocateSquare_mid
+    # 1列的情形
+    if tot_TextSquare != 0 and allLocateSquare_mid / tot_TextSquare >= 0.85 and col_N == 1:
+        return 0, True
+    # 多列的情形
+    # if col_N >= 2:
+    #     if allLocateCnt >= 1:
+    #         oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
+    #         multiLocateLoss_mid += ((pageR - pageL) ) * multiLocateCnt_mid
+    #     else:
+    #         if multiLocateCnt_mid >= 1:
+    #             oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
+    totLoss_mid = oneLocateLoss_mid + multiLocateLoss_mid + allLocateLoss_mid
+    totLoss_LR = oneLocateCnt_LR + multiLocateCnt_LR + allLocateLoss_LR
+    return totLoss_mid + totLoss_LR, isSimpleCondition
+def get_columnNumber(page_ID: int, page: fitz.Page, textBboxs) -> (int, float):
+    columnNumber_loss = dict()
+    columnNumber_isSimpleCondition = dict()
+    #### 枚举列数
+    for columnNumber in range(1, 5):
+        # print('---------{}--------'.format(columnNumber))
+        x_targetAxis, x_splitAxis = get_targetAxis_and_splitAxis(page_ID, page, columnNumber, textBboxs)
+        loss, isSimpleCondition = calculate_loss(page_ID, x_targetAxis, x_splitAxis, textBboxs)
+        columnNumber_loss[columnNumber] = loss
+        columnNumber_isSimpleCondition[columnNumber] = isSimpleCondition
+    col_idxs = [i for i in range(1, len(columnNumber_loss) + 1)]
+    col_idxs.sort(key=lambda i: (columnNumber_loss[i], i))
+    return col_idxs, columnNumber_loss, columnNumber_isSimpleCondition
--- a/pdf2text_recogEquation.py
+++ b/pdf2text_recogEquation.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+from pathlib import Path
+def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #--------- 通过json_from_DocXchain来获取 table ---------#
+    equationEmbedding_from_DocXChain_bboxs = []
+    equationIsolated_from_DocXChain_bboxs = []
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    for xf in xf_json['layout_dets']:
+    # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        # equation
+        img_suffix = f"{page_ID}_{int(L)}_{int(U)}_{int(R)}_{int(D)}"
+        if xf['category_id'] == 13 and xf['score'] >= 0.3:      
+            latex_text = xf.get("latex", "EmptyInlineEquationResult")
+            debugable_latex_text = f"{latex_text}|{img_suffix}"
+            equationEmbedding_from_DocXChain_bboxs.append((L, U, R, D, latex_text))
+        if xf['category_id'] == 14 and xf['score'] >= 0.3:
+            latex_text = xf.get("latex", "EmptyInterlineEquationResult")
+            debugable_latex_text = f"{latex_text}|{img_suffix}"
+            equationIsolated_from_DocXChain_bboxs.append((L, U, R, D, latex_text))
+    #---------------------------------------- 排序，编号，保存 -----------------------------------------#
+    equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    equationEmbedding_from_DocXChain_names = []
+    equationEmbedding_ID = 0
+    equationIsolated_from_DocXChain_names = []
+    equationIsolated_ID = 0
+    for L, U, R, D, _ in equationEmbedding_from_DocXChain_bboxs:
+        if not(L < R and U < D):
+            continue
+        try:
+            # cur_equation = page.get_pixmap(clip=(L,U,R,D))
+            new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationEmbedding_ID)        # 公式name
+            # cur_equation.save(res_dir_path + '/' + new_equation_name)                       # 把公式存出在新建的文件夹，并命名
+            equationEmbedding_from_DocXChain_names.append(new_equation_name)                         # 把公式的名字存在list中，方便在md中插入引用
+            equationEmbedding_ID += 1
+        except:
+            pass
+    for L, U, R, D, _ in equationIsolated_from_DocXChain_bboxs:
+        if not(L < R and U < D):
+            continue
+        try:
+            # cur_equation = page.get_pixmap(clip=(L,U,R,D))
+            new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationIsolated_ID)        # 公式name
+            # cur_equation.save(res_dir_path + '/' + new_equation_name)                       # 把公式存出在新建的文件夹，并命名
+            equationIsolated_from_DocXChain_names.append(new_equation_name)                         # 把公式的名字存在list中，方便在md中插入引用
+            equationIsolated_ID += 1
+        except:
+            pass
+    equationEmbedding_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    """根据pdf可视区域，调整bbox的坐标"""
+    cropbox = page.cropbox
+    if cropbox[0]!=page.rect[0] or cropbox[1]!=page.rect[1]:
+        for eq_box in equationEmbedding_from_DocXChain_bboxs:
+            eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
+        for eq_box in equationIsolated_from_DocXChain_bboxs:
+            eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
+    return equationEmbedding_from_DocXChain_bboxs, equationIsolated_from_DocXChain_bboxs
--- a/pdf2text_recogFigure.py
+++ b/pdf2text_recogFigure.py
+import os                   
+import collections      # 统计库
+import re
+from libs.boxbase import _is_in_or_part_overlap               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+#--------------------------------------- Tool Functions --------------------------------------#
+# 正则化，输入文本，输出只保留a-z,A-Z,0-9
+def remove_special_chars(s: str) -> str:
+    pattern = r"[^a-zA-Z0-9]"
+    res = re.sub(pattern, "", s)
+    return res
+def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
+    # 判断rect1和rect2是否一模一样
+    return L1 == L2 and U1 == U2 and R1 == R2 and D1 == D2
+def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
+    # 判断rect1包含了rect2
+    return (L1 <= L2 <= R2 <= R1) and (U1 <= U2 <= D2 <= D1)
+def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
+    # 判断rect1与rect2是否存在重叠（只有一条边重叠，也算重叠）
+    return max(L1, L2) <= min(R1, R2) and max(U1, U2) <= min(D1, D2)
+def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
+    # 计算两个rect，重叠面积各占2个rect面积的比例
+    if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
+        return 0, 0
+    square_1 = (R1 - L1) * (D1 - U1)
+    square_2 = (R2 - L2) * (D2 - U2)
+    if square_1 == 0 or square_2 == 0:
+        return 0, 0
+    square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
+    return square_overlap / square_1, square_overlap / square_2
+def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
+    # 计算两个line，重叠区间各占2个line长度的比例
+    if max(L1, L2) > min(R1, R2):
+        return 0, 0
+    if L1 == R1 or L2 == R2:
+        return 0, 0
+    overlap_line = min(R1, R2) - max(L1, L2)
+    return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
+# 判断rect其实是一条line
+def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
+    width = R - L
+    height = D - U
+    if width <= 3 or height <= 3:
+        return True
+    if width / height >= 30 or height / width >= 30:
+        return True
+def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    #### 通过fitz获取page信息
+    ## 超越边界
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #----------------- 保存每一个文本块的LURD ------------------#
+    textLine_blocks = []
+    blocks = page.get_text(
+            "dict",
+            flags=fitz.TEXTFLAGS_TEXT,
+            #clip=clip,
+        )["blocks"]
+    for i in range(len(blocks)):
+        bbox = blocks[i]['bbox']
+        # print(bbox)
+        for tt in blocks[i]['lines']:
+            # 当前line
+            cur_line_bbox = None                            # 当前line，最右侧的section的bbox
+            for xf in tt['spans']:
+                L, U, R, D = xf['bbox']
+                L, R = min(L, R), max(L, R)
+                U, D = min(U, D), max(U, D)
+                textLine_blocks.append((L, U, R, D))
+    textLine_blocks.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    #---------------------------------------------- 保存img --------------------------------------------------#
+    raw_imgs = page.get_images()                    # 获取所有的图片
+    imgs = []
+    img_names = []                              # 保存图片的名字，方便在md中插入引用
+    img_bboxs = []                              # 保存图片的location信息。
+    img_visited = [] # 记忆化，记录该图片是否在md中已经插入过了
+    img_ID = 0
+    ## 获取、保存每张img的location信息(x1, y1, x2, y2， UL, DR坐标)
+    for i in range(len(raw_imgs)):
+        # 如果图片在junklist中则跳过
+        if raw_imgs[i][0] in junk_img_bojids:
+            continue
+        else:
+            try:
+                tt = page.get_image_rects(raw_imgs[i][0], transform = True)
+                rec = tt[0][0]
+                L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
+                L, R = min(L, R), max(L, R)
+                U, D = min(U, D), max(U, D)
+                if not(pageL <= L < R <= pageR and pageU <= U < D <= pageD):
+                    continue
+                if pageL == L and R == pageR:
+                    continue
+                if pageU == U and D == pageD:
+                    continue
+                # pix1 = page.get_Pixmap(clip=(L,U,R,D))
+                new_img_name = "{}_{}.png".format(page_ID, i)      # 图片name
+                # pix1.save(res_dir_path + '/' + new_img_name)        # 把图片存出在新建的文件夹，并命名
+                img_names.append(new_img_name)
+                img_bboxs.append((L, U, R, D))
+                img_visited.append(False)
+                imgs.append(raw_imgs[i])
+            except:
+                continue
+    #-------- 如果img之间有重叠。说明获取的img大小有问题，位置也不一定对。就扔掉--------#
+    imgs_ok = [True for _ in range(len(imgs))]
+    for i in range(len(imgs)):
+        L1, U1, R1, D1 = img_bboxs[i]
+        for j in range(i + 1, len(imgs)):
+            L2, U2, R2, D2 = img_bboxs[j]
+            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            if ratio_1 > 0 and ratio_2 > 0:
+                if ratio_1 == 1 and ratio_2 > 0.8:
+                    imgs_ok[i] = False
+                elif ratio_1 > 0.8 and ratio_2 == 1:
+                    imgs_ok[j] = False 
+                elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
+                    imgs_ok[i] = False
+                    imgs_ok[j] = False
+                elif s1 / s2 > 5 and ratio_2 > 0.5:
+                    imgs_ok[j] = False
+                elif s2 / s1 > 5 and ratio_1 > 0.5:
+                    imgs_ok[i] = False
+    imgs = [imgs[i] for i in range(len(imgs)) if imgs_ok[i] == True]
+    img_names = [img_names[i] for i in range(len(imgs)) if imgs_ok[i] == True]
+    img_bboxs = [img_bboxs[i] for i in range(len(imgs)) if imgs_ok[i] == True]
+    img_visited = [img_visited[i] for i in range(len(imgs)) if imgs_ok[i] == True]
+    #*******************************************************************************#
+    #---------------------------------------- 通过fitz提取svg的信息 -----------------------------------------#
+    #
+    svgs = page.get_drawings()
+    #------------ preprocess, check一些大框，看是否是合理的 ----------#
+    ## 去重。有时候会遇到rect1和rect2是完全一样的情形。
+    svg_rect_visited = set()
+    available_svgIdx = []
+    for i in range(len(svgs)):
+        L, U, R, D = svgs[i]['rect'].irect
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        tt = (L, U, R, D)
+        if tt not in svg_rect_visited:
+            svg_rect_visited.add(tt)
+            available_svgIdx.append(i)
+    svgs = [svgs[i] for i in available_svgIdx]                  # 去重后，有效的svgs
+    svg_childs = [[] for _ in range(len(svgs))]
+    svg_parents = [[] for _ in range(len(svgs))]
+    svg_overlaps = [[] for _ in range(len(svgs))]            #svg_overlaps[i]是一个list，存的是与svg_i有重叠的svg的index。e.g., svg_overlaps[0] = [1, 2, 7, 9]
+    svg_visited = [False for _ in range(len(svgs))]
+    svg_exceedPage = [0 for _ in range(len(svgs))]       # 是否超越边界（artbox），很大，但一般是一个svg的底。  
+    for i in range(len(svgs)):
+        L, U, R, D = svgs[i]['rect'].irect
+        ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
+        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
+            if ratio_2 >= 0.7:
+                svg_exceedPage[i] += 4
+        else:
+            if L <= pageL:
+                svg_exceedPage[i] += 1
+            if pageR <= R:
+                svg_exceedPage[i] += 1
+            if U <= pageU:
+                svg_exceedPage[i] += 1
+            if pageD <= D:
+                svg_exceedPage[i] += 1
+    #### 如果有≥2个的超边界的框，就不要手写规则判断svg了。很难写对。
+    if len([x for x in svg_exceedPage if x >= 1]) >= 2:
+        svgs = []
+        svg_childs = []
+        svg_parents = []
+        svg_overlaps = []
+        svg_visited = []
+        svg_exceedPage = []  
+    #---------------------------- build graph ----------------------------#
+    for i, p in enumerate(svgs):
+        L1, U1, R1, D1 = svgs[i]["rect"].irect
+        for j in range(len(svgs)):
+            if i == j:
+                continue
+            L2, U2, R2, D2 = svgs[j]["rect"].irect
+            ## 包含
+            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                svg_childs[i].append(j)
+                svg_parents[j].append(i)
+            else:
+                ## 交叉
+                if check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                    svg_overlaps[i].append(j)
+    #---------------- 确定最终的svg。连通块儿的外围 -------------------#
+    eps_ERROR = 5                      # 给识别出的svg，四周留白（为了防止pyMuPDF的rect不准）
+    svg_ID = 0        
+    svg_final_names = []
+    svg_final_bboxs = []
+    svg_final_visited = []              # 为下面，text识别左准备。作用同img_visited
+    svg_idxs = [i for i in range(len(svgs))]
+    svg_idxs.sort(key = lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1]))   # 按照面积，从大到小排序
+    for i in svg_idxs:
+        if svg_visited[i] == True:
+            continue
+        svg_visited[i] = True
+        L, U, R, D = svgs[i]['rect'].irect
+        width = R - L
+        height = D - U
+        if check_rect_isLine(L, U, R, D) == True:
+            svg_visited[i] = False
+            continue
+        # if i == 4:
+        #     print(i, L, U, R, D)
+        #     print(svg_parents[i])
+        cur_block_element_cnt = 0               # 当前要判定为svg的区域中，有多少elements，最外围的最大svg框除外。
+        if len(svg_parents[i]) == 0:
+            ## 是个普通框的情形
+            cur_block_element_cnt += len(svg_childs[i])
+            if svg_exceedPage[i] == 0:
+                ## 误差。可能已经包含在某个框里面了
+                neglect_flag = False
+                for pL, pU, pR, pD in svg_final_bboxs:
+                    if pL <= L <= R <= pR and pU <= U <= D <= pD:
+                        neglect_flag = True
+                        break
+                if neglect_flag == True:
+                    continue
+                ## 搜索连通域, bfs+记忆化
+                q = collections.deque()
+                for j in svg_overlaps[i]:
+                    q.append(j)
+                while q:
+                    j = q.popleft()
+                    svg_visited[j] = True
+                    L2, U2, R2, D2 = svgs[j]['rect'].irect
+                    # width2 = R2 - L2
+                    # height2 = D2 - U2
+                    # if width2 <= 2 or height2 <= 2 or (height2 / width2) >= 30 or (width2 / height2) >= 30:
+                    #     continue
+                    L = min(L, L2)
+                    R = max(R, R2)
+                    U = min(U, U2)
+                    D = max(D, D2)
+                    cur_block_element_cnt += 1
+                    cur_block_element_cnt += len(svg_childs[j])
+                    for k in svg_overlaps[j]:
+                        if svg_visited[k] == False and svg_exceedPage[k] == 0:
+                            svg_visited[k] = True
+                            q.append(k)
+            elif svg_exceedPage[i] <= 2:
+                ## 误差。可能已经包含在某个svg_final_bbox框里面了
+                neglect_flag = False
+                for sL, sU, sR, sD in svg_final_bboxs:
+                    if sL <= L <= R <= sR and sU <= U <= D <= sD:
+                        neglect_flag = True
+                        break
+                if neglect_flag == True:
+                    continue
+                L, U, R, D = pageR, pageD, pageL, pageU
+                ## 所有孩子元素的最大边界
+                for j in svg_childs[i]:
+                    if svg_visited[j] == True:
+                        continue
+                    if svg_exceedPage[j] >= 1:
+                        continue
+                    svg_visited[j] = True                       #### 这个位置考虑一下
+                    L2, U2, R2, D2 = svgs[j]['rect'].irect
+                    L = min(L, L2)
+                    R = max(R, R2)
+                    U = min(U, U2)
+                    D = max(D, D2)
+                    cur_block_element_cnt += 1
+            # 如果是条line，就不用保存了
+            if check_rect_isLine(L, U, R, D) == True:
+                continue
+            # 如果当前的svg，连2个elements都没有，就不用保存了
+            if cur_block_element_cnt < 3:
+                continue
+            ## 当前svg，框住了多少文本框。如果框多了，可能就是错了
+            contain_textLineBlock_cnt = 0
+            for L2, U2, R2, D2 in textLine_blocks:
+                if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2) == True:
+                    contain_textLineBlock_cnt += 1
+            if contain_textLineBlock_cnt >= 10:
+                continue
+            # L -= eps_ERROR * 2
+            # U -= eps_ERROR
+            # R += eps_ERROR * 2
+            # D += eps_ERROR
+            # # cur_svg = page.get_pixmap(matrix=fitz.Identity, dpi=None, colorspace=fitz.csRGB, clip=(U,L,R,D), alpha=False, annots=True)
+            # cur_svg = page.get_pixmap(clip=(L,U,R,D))
+            new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID)      # 图片name
+            # cur_svg.save(res_dir_path + '/' + new_svg_name)        # 把图片存出在新建的文件夹，并命名
+            svg_final_names.append(new_svg_name)                      # 把图片的名字存在list中，方便在md中插入引用
+            svg_final_bboxs.append((L, U, R, D))
+            svg_final_visited.append(False)
+            svg_ID += 1
+    ## 识别出的svg，可能有 包含，相邻的情形。需要进一步合并
+    svg_idxs = [i for i in range(len(svg_final_bboxs))]
+    svg_idxs.sort(key = lambda i: (svg_final_bboxs[i][1], svg_final_bboxs[i][0]))   # (U, L)
+    svg_final_names_2 = []
+    svg_final_bboxs_2 = []
+    svg_final_visited_2 = []              # 为下面，text识别左准备。作用同img_visited
+    svg_ID_2 = 0
+    for i in range(len(svg_final_bboxs)):
+        L1, U1, R1, D1 = svg_final_bboxs[i]
+        for j in range(i + 1, len(svg_final_bboxs)):
+            L2, U2, R2, D2 = svg_final_bboxs[j]
+            # 如果 rect1包含了rect2
+            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                svg_final_visited[j] = True
+                continue
+            # 水平并列
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
+            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
+                if abs(L2 - R1) >= 20:
+                    continue
+                LL = min(L1, L2)
+                UU = min(U1, U2)
+                RR = max(R1, R2)
+                DD = max(D1, D2)
+                svg_final_bboxs[i] = (LL, UU, RR, DD)
+                svg_final_visited[j] = True
+                continue
+            # 竖直并列
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R2, L2, R2)
+            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
+                if abs(U2 - D1) >= 20:
+                    continue
+                LL = min(L1, L2)
+                UU = min(U1, U2)
+                RR = max(R1, R2)
+                DD = max(D1, D2)
+                svg_final_bboxs[i] = (LL, UU, RR, DD)
+                svg_final_visited[j] = True
+    for i in range(len(svg_final_bboxs)):
+        if svg_final_visited[i] == False:
+            L, U, R, D = svg_final_bboxs[i]
+            svg_final_bboxs_2.append((L, U, R, D))
+            L -= eps_ERROR * 2
+            U -= eps_ERROR
+            R += eps_ERROR * 2
+            D += eps_ERROR
+            # cur_svg = page.get_pixmap(clip=(L,U,R,D))
+            new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID_2)      # 图片name
+            # cur_svg.save(res_dir_path + '/' + new_svg_name)        # 把图片存出在新建的文件夹，并命名
+            svg_final_names_2.append(new_svg_name)                      # 把图片的名字存在list中，方便在md中插入引用
+            svg_final_bboxs_2.append((L, U, R, D))
+            svg_final_visited_2.append(False)
+            svg_ID_2 += 1
+    ## svg收尾。识别为drawing，但是在上面没有拼成一张图的。
+    # 有收尾才comprehensive
+    # xxxx
+    # xxxx
+    # xxxx
+    # xxxx
+    #--------- 通过json_from_DocXchain来获取，figure, table, equation的bbox ---------#
+    figure_bbox_from_DocXChain = []
+    figure_from_DocXChain_visited = []          # 记忆化
+    figure_bbox_from_DocXChain_overlappedRatio = []
+    figure_only_from_DocXChain_bboxs = []     # 存储
+    figure_only_from_DocXChain_names = []
+    figure_only_from_DocXChain_visited = []
+    figure_only_ID = 0
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    for xf in xf_json['layout_dets']:
+    # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        # figure
+        if xf["category_id"] == 1 and xf['score'] >= 0.3:
+            figure_bbox_from_DocXChain.append((L, U, R, D))
+            figure_from_DocXChain_visited.append(False)
+            figure_bbox_from_DocXChain_overlappedRatio.append(0.0)
+    #---------------------- 比对上面识别出来的img,svg 与DocXChain给的figure -----------------------#
+    ## 比对imgs
+    for i, b1 in enumerate(figure_bbox_from_DocXChain):
+        # print('--------- DocXChain的图片', b1)
+        L1, U1, R1, D1 = b1
+        for b2 in img_bboxs:
+            # print('-------- igms得到的图', b2)
+            L2, U2, R2, D2 = b2
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            # 相同
+            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                figure_from_DocXChain_visited[i] = True
+            # 包含
+            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                if s2 / s1 > 0.8:
+                    figure_from_DocXChain_visited[i] = True
+            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
+                if s1 / s2 > 0.8:
+                    figure_from_DocXChain_visited[i] = True 
+            else:
+                # 重叠了相当一部分
+                # print('进入第3部分')
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1/s2>0.8) or (ratio_2 >= 0.8 and s2/s1>0.8):
+                    figure_from_DocXChain_visited[i] = True
+                else:
+                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
+                    # print('图片的重叠率是{}'.format(ratio_1))
+    ## 比对svgs
+    svg_final_bboxs_2_badIdxs = []
+    for i, b1 in enumerate(figure_bbox_from_DocXChain):
+        L1, U1, R1, D1 = b1
+        for j, b2 in enumerate(svg_final_bboxs_2):
+            L2, U2, R2, D2 = b2
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            # 相同
+            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                figure_from_DocXChain_visited[i] = True
+            # 包含
+            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
+                figure_from_DocXChain_visited[i] = True
+            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
+                if s1 / s2 > 0.7:
+                    figure_from_DocXChain_visited[i] = True
+                else:
+                    svg_final_bboxs_2_badIdxs.append(j)     # svg丢弃。用DocXChain的结果。
+            else:
+                # 重叠了相当一部分
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
+                    figure_from_DocXChain_visited[i] = True
+                else:
+                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
+    # 丢掉错误的svg
+    svg_final_bboxs_2 = [svg_final_bboxs_2[i] for i in range(len(svg_final_bboxs_2)) if i not in set(svg_final_bboxs_2_badIdxs)]
+    for i in range(len(figure_from_DocXChain_visited)):
+        if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7:
+            figure_from_DocXChain_visited[i] = True
+    # DocXChain识别出来的figure，但是没被保存的。
+    for i in range(len(figure_from_DocXChain_visited)):
+        if figure_from_DocXChain_visited[i] == False:
+            figure_from_DocXChain_visited[i] = True
+            cur_bbox = figure_bbox_from_DocXChain[i]
+            # cur_figure = page.get_pixmap(clip=cur_bbox)
+            new_figure_name = "figure_only_{}_{}.png".format(page_ID, figure_only_ID)      # 图片name
+            # cur_figure.save(res_dir_path + '/' + new_figure_name)        # 把图片存出在新建的文件夹，并命名
+            figure_only_from_DocXChain_names.append(new_figure_name)                      # 把图片的名字存在list中，方便在md中插入引用
+            figure_only_from_DocXChain_bboxs.append(cur_bbox)
+            figure_only_from_DocXChain_visited.append(False)
+            figure_only_ID += 1
+    img_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    svg_final_bboxs_2.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    figure_only_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs
+    #--------------------------- 最后统一去重 -----------------------------------#
+    curPage_all_fig_bboxs.sort(key = lambda LURD: ( (LURD[2]-LURD[0])*(LURD[3]-LURD[1]) , LURD[0], LURD[1]) )
+    #### 先考虑包含关系的小块
+    final_duplicate = set()
+    for i in range(len(curPage_all_fig_bboxs)):
+        L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
+        for j in range(len(curPage_all_fig_bboxs)):
+            if i == j:
+                continue
+            L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
+                final_duplicate.add((L1, U1, R1, D1))
+            else:
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
+                    final_duplicate.add((L1, U1, R1, D1))
+    curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]
+    #### 再考虑重叠关系的块
+    final_duplicate = set()
+    final_synthetic_bboxs = []
+    for i in range(len(curPage_all_fig_bboxs)):
+        L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
+        for j in range(len(curPage_all_fig_bboxs)):
+            if i == j:
+                continue
+            L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+            union_ok = False
+            if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): 
+                union_ok = True
+            if (ratio_1 > 0.2 and s2 / s1 > 5):
+                union_ok = True
+            if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
+                union_ok = True
+            if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
+                union_ok = True
+            if union_ok == True:
+                final_duplicate.add((L1, U1, R1, D1))
+                final_duplicate.add((L2, U2, R2, D2))
+                L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
+                final_synthetic_bboxs.append((L3, U3, R3, D3))
+    # print('---------- curPage_all_fig_bboxs ---------')
+    # print(curPage_all_fig_bboxs)
+    curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]    
+    final_synthetic_bboxs = list(set(final_synthetic_bboxs))
+    ## 再再考虑重叠关系。极端情况下会迭代式地2进1
+    new_images = []
+    droped_img_idx = []
+    image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]        
+    for i in range(0, len(image_bboxes)):
+        for j in range(i+1, len(image_bboxes)):
+            if j not in droped_img_idx:
+                L2, U2, R2, D2 = image_bboxes[j]
+                s1 = abs(R1 - L1) * abs(D1 - U1)
+                s2 = abs(R2 - L2) * abs(D2 - U2)
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                union_ok = False
+                if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): 
+                    union_ok = True
+                if (ratio_1 > 0.2 and s2 / s1 > 5):
+                    union_ok = True
+                if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
+                    union_ok = True
+                if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
+                    union_ok = True
+                if union_ok == True:
+                    # 合并
+                    image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
+                    droped_img_idx.append(j)
+    for i in range(0, len(image_bboxes)):
+        if i not in droped_img_idx:
+            new_images.append(image_bboxes[i])
+    # find_union_FLAG = True
+    # while find_union_FLAG == True:
+    #     find_union_FLAG = False
+    #     final_duplicate = set()
+    #     tmp = []
+    #     for i in range(len(final_synthetic_bboxs)):
+    #         L1, U1, R1, D1 = final_synthetic_bboxs[i]
+    #         for j in range(len(final_synthetic_bboxs)):
+    #             if i == j:
+    #                 continue
+    #             L2, U2, R2, D2 = final_synthetic_bboxs[j]
+    #             s1 = abs(R1 - L1) * abs(D1 - U1)
+    #             s2 = abs(R2 - L2) * abs(D2 - U2)
+    #             ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+    #             union_ok = False
+    #             if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): 
+    #                 union_ok = True
+    #             if (ratio_1 > 0.2 and s2 / s1 > 5):
+    #                 union_ok = True
+    #             if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
+    #                 union_ok = True
+    #             if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
+    #                 union_ok = True
+    #             if union_ok == True:
+    #                 find_union_FLAG = True
+    #                 final_duplicate.add((L1, U1, R1, D1))
+    #                 final_duplicate.add((L2, U2, R2, D2))
+    #                 L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
+    #                 tmp.append((L3, U3, R3, D3)) 
+    #     if find_union_FLAG == True:
+    #         tmp = list(set(tmp))
+    #         final_synthetic_bboxs = tmp[:]
+    # curPage_all_fig_bboxs += final_synthetic_bboxs
+    # print('--------- final synthetic')
+    # print(final_synthetic_bboxs)
+    #**************************************************************************#
+    images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
+    images = images1 + new_images
+    return images
--- a/pdf2text_recogFooter.py
+++ b/pdf2text_recogFooter.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    Collect the footer bounding boxes reported by DocXChain for one page.
+    :param page_ID: index of the page inside the PDF; it does not influence the result.
+    :param page: page as read by fitz; it is rendered at 72 DPI only to obtain its pixel width and height.
+    :param json_from_DocXchain_obj: DocXChain result for this page. 'page_info' supplies the 'width'/'height'
+        of the coordinate space used by the detections (presumably the image the model ran on) and
+        'layout_dets' supplies the detections, each with 'poly', 'category_id' and 'score'.
+    :return: list of (L, U, R, D) tuples in page pixel coordinates, ordered by top edge, then left edge.
+    """
+    rendered = page.get_pixmap(dpi=72)
+    page_width, page_height = int(rendered.w), int(rendered.h)
+    # Ratio between the detector's coordinate space and the rendered page.
+    page_info = json_from_DocXchain_obj['page_info']
+    model_width = page_info['width']
+    model_height = page_info['height']
+    x_scale = model_width / page_width
+    y_scale = model_height / page_height
+    # Category ids: 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote, 6 footer,
+    # 7 table, 8 table caption, 9 figure caption, 10 equation, 11 full column, 12 sub column,
+    # 13 embedding (inline formula), 14 isolated (display formula).
+    footers = []
+    for det in json_from_DocXchain_obj['layout_dets']:
+        poly = det['poly']
+        x_a = poly[0] / x_scale
+        y_a = poly[1] / y_scale
+        x_b = poly[2] / x_scale
+        y_b = poly[5] / y_scale
+        # Keep footers (category 6) the model is reasonably sure about.
+        if det['category_id'] == 6 and det['score'] >= 0.3:
+            # Normalise so that L <= R and U <= D.
+            footers.append((min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b)))
+    footers.sort(key=lambda box: (box[1], box[0]))
+    return footers
--- a/pdf2text_recogFootnote.py
+++ b/pdf2text_recogFootnote.py
+import os
+from collections import Counter
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
+    """
+    Collect the footnote bounding boxes reported by DocXChain for one page.
+    :param page_ID: index of the page inside the PDF; it does not influence the result.
+    :param page: page as read by fitz; it is rendered at 72 DPI only to obtain its pixel width and height.
+    :param json_from_DocXchain_obj: DocXChain result for this page. 'page_info' supplies the 'width'/'height'
+        of the coordinate space used by the detections (presumably the image the model ran on) and
+        'layout_dets' supplies the detections, each with 'poly', 'category_id' and 'score'.
+    :param md_bookname_save_path: output directory once used for debug crops; it does not influence the result.
+    :param debug_mode: flag once used to export debug crops; it does not influence the result.
+    :return: list of (L, U, R, D) tuples in page pixel coordinates, ordered by top edge, then left edge.
+    """
+    rendered = page.get_pixmap(dpi=72)
+    page_width, page_height = int(rendered.w), int(rendered.h)
+    # Ratio between the detector's coordinate space and the rendered page.
+    page_info = json_from_DocXchain_obj['page_info']
+    model_width = page_info['width']
+    model_height = page_info['height']
+    x_scale = model_width / page_width
+    y_scale = model_height / page_height
+    # Category ids: 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote, 6 footer,
+    # 7 table, 8 table caption, 9 figure caption, 10 equation, 11 full column, 12 sub column,
+    # 13 embedding (inline formula), 14 isolated (display formula).
+    footnotes = []
+    for det in json_from_DocXchain_obj['layout_dets']:
+        poly = det['poly']
+        x_a = poly[0] / x_scale
+        y_a = poly[1] / y_scale
+        x_b = poly[2] / x_scale
+        y_b = poly[5] / y_scale
+        # Keep footnotes (category 5); 0.43 is the newer score threshold (it used to be 0.3).
+        if det['category_id'] == 5 and det['score'] >= 0.43:
+            # Normalise so that L <= R and U <= D.
+            footnotes.append((min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b)))
+    footnotes.sort(key=lambda box: (box[1], box[0]))
+    return footnotes
+def need_remove(block):
+    """
+    Tell whether a text block must be dropped from the footnote candidates.
+    A block is dropped when it consists of a single line holding a single span whose text is
+    all upper case or whose font name contains 'SB', 'bold' or 'Bold', or when any of its
+    spans contains the word "keyword" (case-insensitive).
+    Args:
+        block (dict): text block with an optional 'lines' list; each line may hold a 'spans'
+            list whose items provide 'text' and 'font'.
+    Returns:
+        bool: True when the block should be removed from the candidates.
+    """
+    if 'lines' not in block or len(block['lines']) == 0:
+        return False
+    text_lines = block['lines']
+    # One line made of one span: upper-case or bold text is treated as a heading-like block.
+    if len(text_lines) == 1:
+        only_line = text_lines[0]
+        if 'spans' in only_line and len(only_line['spans']) == 1:
+            only_span = only_line['spans'][0]
+            if only_span['text'].isupper():
+                return True
+            for marker in ('SB', 'bold', 'Bold'):
+                if marker in only_span['font']:
+                    return True
+    # Any span mentioning "keyword", whatever its letter case.
+    return any(
+        "keyword" in span['text'].lower()
+        for line in text_lines
+        if 'spans' in line and len(line['spans']) > 0
+        for span in line['spans']
+    )
+def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
+    """
+    Pick out the text blocks that look like footnotes, using size, length and font heuristics.
+    Only pages with ``page_id`` <= 20 are examined. A block is reported when its top edge lies
+    below 60% of the page height and at least two of these hold: its mean line size is smaller
+    than the most common line size on the page, it has fewer than 5 lines, its dominant font
+    differs from ``main_text_font``. Blocks rejected by ``need_remove`` are never reported.
+    Args:
+        remain_text_blocks (list): text blocks still to be classified; each is a dict with
+            'bbox' and 'lines', every line holding 'spans' with 'size', 'font' and 'text'.
+        page_height (float): height of the page.
+        page_id (int): id of the page.
+        main_text_font (str): font name of the main body text, supplied by the caller.
+    Returns:
+        list: the 'bbox' of every block judged to be a footnote; empty when the page is
+            skipped or carries no usable size information.
+    """
+    if page_id > 20 or not remain_text_blocks:
+        return []
+    line_sizes = []         # mean span size of every line on the page
+    block_stats = []        # (block, mean line size, dominant font or None)
+    for block in remain_text_blocks:
+        block_line_sizes = []
+        block_fonts = Counter()         # number of characters per font inside this block
+        for line in block.get('lines', []):
+            spans = line.get('spans', [])
+            span_sizes = [span['size'] for span in spans if 'size' in span]
+            if span_sizes:
+                line_size = sum(span_sizes) / len(span_sizes)
+                line_sizes.append(line_size)
+                block_line_sizes.append(line_size)
+            # Weight each font by the number of characters set in it.
+            for span in spans:
+                text = span.get('text', '')
+                if 'font' in span and text:
+                    block_fonts[span['font']] += len(text)
+        if not block_line_sizes:
+            continue
+        block_size = sum(block_line_sizes) / len(block_line_sizes)
+        # most_common() is empty when every span had empty text or no font; indexing it
+        # directly used to raise IndexError for such blocks.
+        dominant = block_fonts.most_common(1)
+        block_font = dominant[0][0] if dominant else None
+        block_stats.append((block, block_size, block_font))
+    # Without a single measurable line there is no main text size to compare against.
+    if not line_sizes:
+        return []
+    main_text_size = Counter(line_sizes).most_common(1)[0][0]
+    footnote_bboxes = []
+    for block, block_size, block_font in block_stats:
+        # Drop blocks that are likely to be mistaken for footnotes.
+        if need_remove(block):
+            continue
+        votes = sum([block_size < main_text_size,
+                     len(block['lines']) < 5,
+                     # an unknown font gives no evidence either way
+                     block_font is not None and block_font != main_text_font])
+        if block['bbox'][1] > page_height * 0.6 and votes >= 2:
+            footnote_bboxes.append(block['bbox'])
+    return footnote_bboxes
--- a/pdf2text_recogFootnoteLine.py
+++ b/pdf2text_recogFootnoteLine.py
+import io
+import re
+import os
+import json
+from libs.boxbase import _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
+from libs.commons import fitz
+from fitz import Point
+from pprint import pprint
+import pickle
+import collections
+from typing import List
+def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
+    """
+    Return the share of each rectangle's area that is covered by their intersection.
+    The rectangles are given as left, upper, right, lower coordinates. The result is
+    (overlap / area of rect1, overlap / area of rect2); it is (0, 0) when the rectangles
+    do not intersect or when either of them has zero area.
+    """
+    left, right = max(L1, L2), min(R1, R2)
+    top, bottom = max(U1, U2), min(D1, D2)
+    if right < left or bottom < top:
+        return 0, 0
+    area_1 = (R1 - L1) * (D1 - U1)
+    area_2 = (R2 - L2) * (D2 - U2)
+    if area_1 == 0 or area_2 == 0:
+        return 0, 0
+    shared = (right - left) * (bottom - top)
+    return shared / area_1, shared / area_2
+def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
+    """
+    Return the share of each interval's length that is covered by their intersection.
+    The intervals are [L1, R1] and [L2, R2]. The result is
+    (overlap / length of line1, overlap / length of line2); it is (0, 0) when the intervals
+    do not intersect or when either of them has zero length.
+    """
+    start, end = max(L1, L2), min(R1, R2)
+    if start > end or L1 == R1 or L2 == R2:
+        return 0, 0
+    shared = end - start
+    return shared / (R1 - L1), shared / (R2 - L2)
+def parse_footnoteLine(page_ID: int, page: fitz.Page, json_from_DocXchain_obj, exclude_bboxes):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #---------------------- PyMuPDF解析text --------------------#
+    textSize_freq = collections.defaultdict(float)        # text块中，textSize的频率
+    textBlock_bboxs = []
+    textLine_bboxs = []
+    text_blocks = page.get_text(
+            "dict",
+            flags=fitz.TEXTFLAGS_TEXT,
+            #clip=clip,
+        )["blocks"]
+    totText_list = []
+    for i in range(len(text_blocks)):
+        # print(blocks[i])                #### print
+        bbox = text_blocks[i]['bbox']
+        textBlock_bboxs.append(bbox)
+        # print(bbox) 
+        cur_block_text_list = []
+        for tt in text_blocks[i]['lines']:
+            # 当前line
+            cur_line_text_list = []
+            cur_line_bbox = None                            # 当前line，最右侧的section的bbox
+            for xf in tt['spans']:
+                L, U, R, D = xf['bbox']
+                L, R = min(L, R), max(L, R)
+                U, D = min(U, D), max(U, D)
+                textLine_bboxs.append((L, U, R, D))
+                cur_line_text_list.append(xf['text'])
+                textSize_freq[xf['size']] += len(xf['text'])
+            cur_lines_text = ' '.join(cur_line_text_list)
+            cur_block_text_list.append(cur_lines_text)
+        totText_list.append('\n'.join(cur_block_text_list))
+    totText = '\n'.join(totText_list)
+    # print(totText)                              # 打印Text
+    textLine_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))
+    textBlock_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))
+    # print('------------ textSize_freq -----------')
+    max_sizeFreq = 0                        # 出现频率最高的textSize
+    textSize_withMaxFreq = 0
+    for x, f in textSize_freq.items():
+        # print(x, f)
+        if f > max_sizeFreq:
+            max_sizeFreq = f
+            textSize_withMaxFreq = x
+    #**********************************************************#
+    #------------------ PyMuPDF读取drawings -----------------#
+    horizon_lines = []
+    drawings = page.get_cdrawings()
+    for drawing in drawings:
+        try:
+            rect = drawing['rect']
+            L, U, R, D = rect
+            # if (L, U, R, D) in exclude_bboxes:
+            #     continue        # 如果是Fiugre, Table, Equation。注释掉是因为，可以暂时先不消，先自我对消。最后再判读需不需要排除。
+            # 如果是水平线
+            if U <= D and D - U <= 3:
+                # 如果长度够
+                if (pageR - pageL) / 15 <= R - L:
+                    if not(80/800 * pageD <= U <= 750/800 * pageD):
+                        continue    # 很可能是页眉和页脚的线
+                    horizon_lines.append((L, U, R, D))
+                    # print((L, U, R, D))
+        except:
+            pass
+    horizon_lines.sort(key = lambda LURD: (LURD[1]))
+    #********************************************************#
+    #----------------- 两条线可能是在表格中 ------------------#
+    def has_text_below_line(L: float, U: float, R: float, D: float, inLowerArea: bool) -> bool:
+        """
+        Decide whether the horizontal rule (L, U, R, D) has text sitting right below it,
+        i.e. whether it can be a footnote rule.
+        The function closes over text_blocks, textSize_withMaxFreq, pageR, pageD and
+        textBlock_bboxs of the enclosing function. Only L, U and R are read; D is unused.
+        :param inLowerArea: when False, text lying left of the rule is also weighed and at
+            least 10 counted characters (letters, digits, other symbols) are required below
+            the rule; when True, only the character-class ratios of the text below are used.
+        :return: True when the rule is accepted, False otherwise.
+        NOTE(review): every call appends block bboxes to the enclosing textBlock_bboxs list.
+        """
+        Uu, Du = U - textSize_withMaxFreq, U        # a band above the rule, one dominant text size high
+        Lu, Ru = L, R
+        Ud, Dd = U, U + textSize_withMaxFreq        # a band below the rule, one dominant text size high
+        Ld, Rd = L, R
+        find = 0                        # text below the rule, accumulated as area
+        leftTextCnt = 0                 # text not under the rule (lying left of it), accumulated as area; it argues against a footnote rule
+        English_alpha_cnt = 0           # characters for which str.isalpha() is True. NOTE(review): isalpha() is also True for non-English letters such as CJK
+        nonEnglish_alpha_cnt = 0        # every other character (not a space, digit, listed punctuation mark or isalpha)
+        punctuation_mark_cnt = 0        # common punctuation marks; counted but not used in the decision below
+        digit_cnt = 0                   # digits
+        distance_nearest_up_line = None
+        distance_nearest_down_line = None
+        for i in range(len(text_blocks)):
+            # print(blocks[i])                #### print
+            bbox = text_blocks[i]['bbox']
+            L0, U0, R0, D0 = bbox
+            if 0< (R0 - L0) < pageR / 6 and (D0 - U0) / (R0 - L0) > 10 :
+                continue                # a very narrow, tall strip, e.g. the arXiv stamp on the left margin of a preprint
+            textBlock_bboxs.append(bbox)    # side effect on the enclosing list
+            # print(bbox)
+            cur_block_text_list = []        # assigned but never read in this function
+            for tt in text_blocks[i]['lines']:
+                # current line
+                cur_line_text_list = []     # assigned but never read in this function
+                cur_line_bbox = None                            # bbox of the right-most section of the current line (never read here)
+                for xf in tt['spans']:
+                    L2, U2, R2, D2 = xf['bbox']
+                    L2, R2 = min(L2, R2), max(L2, R2)
+                    U2, D2 = min(U2, D2), max(U2, D2)
+                    textLine = xf['text']
+                    if L>0 and L2 < L and (L - L2) / L > 0.2:  # span starts clearly left of the rule
+                        leftTextCnt += abs(R2 - L2) * abs(D2 - U2)
+                    else:
+                        ## part below the rule
+                        # ratio_1/2: vertical overlap with the lower band; ratio_3/4: horizontal overlap with the rule
+                        ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(Ud, Dd, U2, D2)
+                        ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(Ld, Rd, L2, R2)
+                        if U < (U2 + D2) / 2 and ratio_1 > 0 and ratio_2 > 0:
+                            if max(ratio_3, ratio_4) > 0.8:
+                                # if 444 <= U1 < 445 and 55 <= L2 < 56:
+                                #     print('matched box', L2, U2, R2, D2)
+                                # if xf['size'] > 1.2 * textSize_withMaxFreq:
+                                #     return False        # might be a heading; filtering like this is too strict
+                                find += abs(R2 - L2) * abs(D2 - U2)
+                                # overwritten by every matching span, so this holds the last match, not necessarily the nearest one
+                                distance_nearest_down_line = (U2 + D2) / 2 - U
+                                for c in textLine:
+                                    if c == ' ':
+                                        continue
+                                    elif c.isdigit() == True:
+                                        digit_cnt += 1
+                                    elif c in ',.:!?[]()%，。、！？：【】（）《》-':
+                                        punctuation_mark_cnt += 1
+                                    elif c.isalpha() == True:
+                                        English_alpha_cnt += 1
+                                    else:
+                                        nonEnglish_alpha_cnt += 1
+                        ## part above the rule
+                        # ratio_5/6: vertical overlap with the upper band; ratio_7/8: horizontal overlap with the rule
+                        ratio_5, ratio_6 = calculate_overlapRatio_between_line1_and_line2(Uu, Du, U2, D2)
+                        ratio_7, ratio_8 = calculate_overlapRatio_between_line1_and_line2(Lu, Ru, L2, R2)
+                        if (U2 + D2) / 2 < U and ratio_5 > 0 and ratio_6 > 0:
+                            if max(ratio_7, ratio_8) > 0.8:
+                                # overwritten by every matching span, like the distance below the rule
+                                distance_nearest_up_line = U - (U2 + D2) / 2
+                                # if distance_nearest_up_line < 0:
+                                #     print(Lu, Uu, Ru, Du, L2, U2, R2, D2)
+        # print(distance_nearest_up_line, distance_nearest_down_line)
+        if distance_nearest_up_line != None and distance_nearest_down_line != None:
+            if distance_nearest_up_line * 1.5 < distance_nearest_down_line:
+                return False                        # the rule is closer to the text line above it: an underline, not a footnote rule
+        ## For rules higher up, text blocks on the left are taken into account; for rules very low on the page they are ignored for now.
+        if inLowerArea == False:
+            if leftTextCnt >= 2000/500000 * pageR * pageD:
+                return False
+            # NOTE(review): find starts at 0 and only grows, so `find >= 0` is always True here
+            return find >= 0 and (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) >= 10
+        ## Decision for rules in the lowest area of the page.
+        # print(English_alpha_cnt, nonEnglish_alpha_cnt, digit_cnt)
+        if (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) == 0:
+            return False                # no counted character below the rule
+        if (English_alpha_cnt + digit_cnt) / (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) > 0.5:
+            # mostly letters/digits: reject only when other symbols still exceed 40% of the counted characters
+            if nonEnglish_alpha_cnt / (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) > 0.4:
+                return False
+            else:
+                return True
+        return True
+    visited = [False for _ in range(len(horizon_lines))]
+    for i, b1 in enumerate(horizon_lines):
+        for j in range(i + 1, len(horizon_lines)):
+            L1, U1, R1, D1 = horizon_lines[i]
+            L2, U2, R2, D2 = horizon_lines[j]
+            ## 在一条水平线，且挨着
+            if L1 > L2:
+                L1, U1, R1, D1, L2, U2, R2, D2 = L2, U2, R2, D2, L1, U1, R1, D1
+            in_horizontal_line_flag = (max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5) and (L2 - R1 <= pageR/10)
+            if in_horizontal_line_flag == True:
+                visited[i] = True
+                visited[j] = True
+            ## 在竖直方向上是一致的。(表格，或者有的文章就是喜欢划线）
+            L1, U1, R1, D1 = horizon_lines[i]
+            L2, U2, R2, D2 = horizon_lines[j]            
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+            # print(L1, U1, R1, D1, L2, U2, R2, D2, ratio_1, ratio_2)
+            in_vertical_line_flag = (ratio_1 > 0.9 and ratio_2 > 0.9) or (max(ratio_1, ratio_2) > 0.95)
+            if in_vertical_line_flag == True:
+                visited[i] = True         
+                # if (U2 < pageD * 0.8 or (U2 - U1) < pageD * 0.3) and has_text_below_line(L2, U2, R2, D2, False) == False:
+                #     visited[j] = True             # 最最底下的线先不要动
+            else:
+                if ratio_1 > 0 and (R2 - L2) / (R1 - L1) > 1:
+                    visited[i] = True
+    # print(horizon_lines)
+    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
+    # print(horizon_lines)
+    #*****************************************************************#    
+    #------- 靠上的，就不是脚注。用一个THRESHOLD直接卡掉位于上半页的 -------#
+    visited = [False for _ in range(len(horizon_lines))]
+    THRESHOLD = (pageD - pageU) * 0.5
+    for i, (L, U, R, D) in enumerate(horizon_lines):
+        if U < THRESHOLD:
+            visited[i] = True
+    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
+    #******************************************************#
+    #--------------- 此时，还有遮挡的，上面的丢弃 ---------------#
+    visited = [False for _ in range(len(horizon_lines))]
+    for i, (L1, U1, R1, D1) in enumerate(horizon_lines):
+        for j in range(i + 1, len(horizon_lines)):
+            L2, U2, R2, D2 = horizon_lines[j]
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+            if (ratio_1 > 0.2 and ratio_2 > 0.2) or max(ratio_1, ratio_2) > 0.7:
+                visited[i] = True
+    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
+    #********************************************************#
+    # print(horizon_lines)
+    ## 检查，线下面有没有紧挨着的text
+    horizon_lines = [LURD for LURD in horizon_lines if has_text_below_line(*(LURD), True) == True]
+    # print(horizon_lines)
+    ## 卡一下长度
+    # horizon_lines = [LURD for LURD in horizon_lines if (LURD[2] - LURD[0] >= pageR / 10)]
+    ## 上面最多保留2条
+    horizon_lines = horizon_lines[max(-2, -len(horizon_lines)) :]
+    #----------------------------------------------------- 第2段 -----------------------------------------------------------#
+    #----------------------------------- 最下面的情形，用距离硬卡。还有在右侧的情形就被包含了 -----------------------------------#
+    #------------------ PyMuPDF读取drawings -----------------#
+    down_horizon_lines = []
+    drawings = page.get_cdrawings()
+    for drawing in drawings:
+        try:
+            rect = drawing['rect']
+            L, U, R, D = rect
+            # if (L, U, R, D) in exclude_bboxes:
+            #     continue        # 如果是Fiugre, Table, Equation。目前是Figure识别的比较好。但是Table和Equation识别的不好
+            # 如果是水平线
+            if U <= D and D - U <= 3 and U > pageD * 0.85:
+                # 如果长度够
+                if (pageR - pageL) / 15 <= R - L:
+                    down_horizon_lines.append((L, U, R, D))
+                    # print((L, U, R, D))
+        except:
+            pass
+    down_horizon_lines.sort(key = lambda LURD: (LURD[0], LURD[2], LURD[1]))
+    visited = [False for _ in range(len(down_horizon_lines))]
+    for i in range(len(down_horizon_lines) - 1):
+        L1, U1, R1, D1 = down_horizon_lines[i]
+        L2, U2, R2, D2 = down_horizon_lines[i + 1]
+        ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+        if ratio_1 <= 0.1 and ratio_2 <= 0.1:
+            if L2 - R1 <= pageR / 3:
+                visited[i] = True
+                visited[i + 1] = True
+    down_horizon_lines = [down_horizon_lines[i] for i in range(len(down_horizon_lines)) if visited[i] == False]
+    down_horizon_lines = [LURD for LURD in down_horizon_lines if has_text_below_line(*(LURD), True) == True]
+    # for LURD in down_horizon_lines:
+    #     print('第2阶段，LURD是： ', LURD)
+    #     print(has_text_below_line(*(LURD), True))
+    footnoteLines = horizon_lines + down_horizon_lines
+    footnoteLines = list(set(footnoteLines))
+    footnoteLines = footnoteLines[max(-2, -len(footnoteLines)) : ]
+    #-------------------------- 最后再检查一遍。是否在图片、表格、公式中。 ------------------------------#
+    def line_in_specialBboxes(L: float, U: float, R: float, D: float, specialBboxes) -> bool:
+        L2, U2, R2, D2 = L, U, R, D     # 当前这根线
+        for L1, U1, R1, D1 in specialBboxes:
+            if U1 <= U2 <= D2 < D1:
+                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+                if ratio_1 > 0 and ratio_2 > 0.6:
+                    return True
+            # else:
+                # U1 -= min(textSize_withMaxFreq * 2, 20)
+                # D1 += min(textSize_withMaxFreq * 2, 20)
+                # if U1 <= U2 <= D2 < D1:
+                #     ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+                #     if ratio_1 > 0 and ratio_2 > 0.8:
+                #         return True
+        return False                
+    footnoteLines = [LURD for LURD in footnoteLines if line_in_specialBboxes(*(LURD), exclude_bboxes) == False]
+    #-------------------------- 检查，线，是否在当前column的左侧，而不是在一段文字的中间 （通过DocXChain识别的column或者徐超老师写的Layout识别）------------------------------#
+    # #--------- 通过json_from_DocXchain来获取 column ---------#
+    # column_bbox_from_DocXChain = []
+    # xf_json = json_from_DocXchain_obj
+    # width_from_json = xf_json['page_info']['width']
+    # height_from_json = xf_json['page_info']['height']
+    # LR_scaleRatio = width_from_json / (pageR - pageL)
+    # UD_scaleRatio = height_from_json / (pageD - pageU)
+    # # {0: 'title',  # 标题
+    # # 1: 'figure', # 图片
+    # #  2: 'plain text',  # 文本
+    # #  3: 'header',      # 页眉
+    # #  4: 'page number', # 页码
+    # #  5: 'footnote',    # 脚注
+    # #  6: 'footer',      # 页脚
+    # #  7: 'table',       # 表格
+    # #  8: 'table caption',  # 表格描述
+    # #  9: 'figure caption', # 图片描述
+    # #  10: 'equation',      # 公式
+    # #  11: 'full column',   # 单栏
+    # #  12: 'sub column',    # 多栏
+    # #  13: 'embedding',     # 嵌入公式
+    # #  14: 'isolated'}      # 单行公式
+    # for xf in xf_json['layout_dets']:
+    #     L = xf['poly'][0] / LR_scaleRatio
+    #     U = xf['poly'][1] / UD_scaleRatio
+    #     R = xf['poly'][2] / LR_scaleRatio
+    #     D = xf['poly'][5] / UD_scaleRatio
+    #     # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+    #     # R += pageL
+    #     # U += pageU
+    #     # D += pageU
+    #     L, R = min(L, R), max(L, R)
+    #     U, D = min(U, D), max(U, D)
+    #     if (xf['category_id'] == 11 or xf['category_id'] == 12) and xf['score'] >= 0.3:
+    #         column_bbox_from_DocXChain.append((L, U, R, D))
+    #---------------手写，检查，线是否是与某个column的左端对齐 ------------------#
+    def check_isOnTheLeftOfColumn(L: float, U: float, R: float, D: float) -> bool:
+        LL = L - textSize_withMaxFreq
+        RR = LL
+        UU = max(pageD * 0.02, U - 100/800 * pageD)
+        DD = min(U + 50/800 * pageD, pageD * 0.98)
+        # print(LL, UU, RR, DD)
+        cnt = 0
+        for bbox in textLine_bboxs:
+            L2, U2, R2, D2 = bbox
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(UU, DD, U2, D2)
+            ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
+            if ratio_1 > 0 and ratio_2 > 0:
+                if max(ratio_3, ratio_4) > 0.8:
+                    if abs(LL - L2) <= 20/700 * pageR:
+                        cnt += 1
+                    # else:
+                    #     if (R2 - L2) >= 30/700 * pageR:
+                    #         print(LL, UU, RR, DD, L2, U2, R2, D2)
+                    #         return False                  # 不能这样卡。有些注释里面，单独的特殊符号就是一个textLineBbox
+        # print('cnt: ', cnt)
+        return cnt >= 4
+    # def check_isOnTheLeftOfColumn_considerLayout(L0: float, U0: float, R0: float, D0: float) -> bool:
+    #     LL = L0 - textSize_withMaxFreq * 1.5
+    #     RR = LL
+    #     UU = 100/800 * pageD
+    #     DD = 700/800 * pageD
+    #     STEP = textSize_withMaxFreq / 2
+    #     def check_ok(L: float, U: float, R: float, D: float) -> bool:
+    #         for bbox in textBlock_bboxs:
+    #             L2, U2, R2, D2 = bbox
+    #             ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
+    #             if max(ratio_3, ratio_4) > 0.8:
+    #                 if (R2 - L2) > 1/4 * pageR and L2 < LL <= RR < R2:
+    #                     if abs(LL - L2) < 50/700 * pageR or abs(RR - R2) < 50/700 * pageR:
+    #                         continue
+    #                     else:
+    #                         return False
+    #         return True
+    #     ## 先探上面
+    #     u = UU
+    #     d = U0
+    #     while u + STEP/2 < d:
+    #         mid = (u + d) / 2
+    #         if check_ok(L0, mid, R0, U0) == True:
+    #             d = mid
+    #         else:
+    #             u = mid + STEP
+    #             print(mid)
+    #     dist_up = U0 - u
+    #     print(u)
+    #     ## 再探下面
+    #     u = D0
+    #     d = DD
+    #     while u + STEP/2 < d:
+    #         mid = (u + d) / 2
+    #         if check_ok(L0, mid, R0, D0) == True:
+    #             u = mid
+    #         else:
+    #             d = mid - STEP
+    #     print(u)
+    #     print('^^^^^^^^^^^^^^')
+    #     dist_down = u - D0
+    #     if dist_up + dist_down < textSize_withMaxFreq * 10:
+    #         return False
+    #     return True
+    footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn(*(LURD)) == True]
+    # footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn_considerLayout(*(LURD)) == True]     # 不具有泛化性。不用了。
+    #--------------------------------- 通过footnoteLine获取bbox -------------------------------#
+    def get_footnoteBbox(L: float, U: float, R: float, D: float) -> list:
+        """
+        Collect the bboxes of the text blocks that form the footnote under the line (L, U, R, D).
+        Returns a list of (L, U, R, D) tuples: the accepted text-block bboxes sorted by (U, L), followed by
+        one extra tuple that is the union bbox of all of them. Returns an empty list when no block
+        qualifies or when one of the plausibility checks below rejects the candidate set.
+        NOTE(review): a text block with R2 == L2, or a first candidate with zero area, makes the
+        divisions below raise ZeroDivisionError - confirm whether such blocks can occur.
+        """
+        L1, U1, R1, D1 = L, U, R, D
+        raw_bboxes = []
+        for i in range(len(text_blocks)):
+            bbox = text_blocks[i]['bbox']
+            L2, U2, R2, D2 = bbox
+            if (D2 - U2) / (R2 - L2) > 10 and (R2 - L2) < pageR / 6:
+                continue                # A very narrow, tall vertical strip, e.g. the arXiv stamp on the left margin of arXiv preprints.
+            if U2 < D2 < U1:
+                continue                # The block lies above the line.
+            # Only blocks whose top is within 20 text sizes below the line (and above 98% of the page height) are considered.
+            under_THRESHOLD = min(D1 + textSize_withMaxFreq * 20, pageD * 0.98)
+            if U2 < under_THRESHOLD:
+                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+                if max(ratio_1, ratio_2) > 0.8:
+                    raw_bboxes.append((L2, U2, R2, D2))
+        # print(L1, U1, R1, D1)
+        # print(raw_bboxes)
+        if len(raw_bboxes) == 0:
+            return []
+        raw_bboxes.sort(key = lambda LURD: (LURD[1], LURD[0]))
+        raw_bboxes = [LURD for LURD in raw_bboxes if (abs(LURD[0] - L1) < textSize_withMaxFreq * 6 or L1 < LURD[0])]  # Footnote bboxes are expected to be left-aligned with the line.
+        if len(raw_bboxes) == 0:
+            return []
+        #------------------ A mix of full-column and sub-column boxes is rejected ------------------#
+        # (LL, UU, RR, DD) becomes the union bbox of all candidates.
+        LL, UU, RR, DD = raw_bboxes[0]
+        for L, U, R, D in raw_bboxes:
+            LL, UU, RR, DD = min(LL, L), min(UU, U), max(RR, R), max(DD, D)
+        for L, U, R, D in raw_bboxes:
+            if (RR - LL) > pageR*0.8 and (R - L) > pageR * 0.15 and (RR - LL) / (R - L) > 2:
+                return []
+            # Every candidate must start close to the union's left edge.
+            if abs(LL - L) > textSize_withMaxFreq * 3:
+                return []       
+        #-------------------- A full-column box that starts too high on the page is rejected ----------------------#
+        if UU < 650/800 * pageD and (RR - LL) > 0.5 * pageR:
+            return []
+        #-------------- A small first block followed by much larger blocks is rejected ----------------#
+        if len(raw_bboxes) > 1:
+            # Block areas are compared: mean of the later blocks, and the maximum, against the first block.
+            bbox_square = []
+            for L, U, R, D in raw_bboxes:
+                cur_s = abs(R - L) * abs(D - U)
+                bbox_square.append(cur_s)
+            s0 = bbox_square[0]
+            s1n = sum(bbox_square[1: ]) / len(bbox_square[1: ])
+            if s1n / s0 > 10 or max(bbox_square) / s0 > 15:
+                return []
+        # The union bbox is appended as the last element of the result.
+        raw_bboxes += [(LL, UU, RR, DD)]
+        return raw_bboxes            
+    # print(footnoteLines)
+    footnoteBboxes = []
+    for L, U, R, D in footnoteLines:
+        cur = get_footnoteBbox(L, U, R, D)
+        if len(cur) > 0:
+            footnoteBboxes.append((L, U, R, D))
+            footnoteBboxes += cur
+    footnoteBboxes = list(set(footnoteBboxes))
+    return footnoteBboxes
+def __bbox_in(box1, box2):
+    """
+    box1是否在box2中
+    """
+    L1, U1, R1, D1 = box1
+    L2, U2, R2, D2 = box2
+    if int(L2) <= int(L1) and int(U2) <= int(U1) and int(R1) <= int(R2) and int(D1) <= int(D2):
+        return True
+    return False
+def remove_footnote_text(raw_text_block, footnote_bboxes):
+    """
+    :param raw_text_block: str类型，是当前页的文本内容
+    :param footnoteBboxes: list类型，是当前页的脚注bbox
+    """
+    footnote_text_blocks = []
+    for block in raw_text_block:
+        text_bbox = block['bbox']
+        # TODO 更严谨点在line级别做
+        if any([_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes]):
+        #if any([text_bbox[3]>=footnote_bbox[1] for footnote_bbox in footnote_bboxes]):
+            block['tag'] = 'footnote'
+            footnote_text_blocks.append(block)
+            #raw_text_block.remove(block)
+    # 移除，不能再内部移除，否则会出错
+    for block in footnote_text_blocks:
+        raw_text_block.remove(block)
+    return raw_text_block, footnote_text_blocks
+def remove_footnote_image(image_blocks, footnote_bboxes):
+    """
+    :param image_bboxes: list类型，是当前页的图片bbox(结构体)
+    :param footnoteBboxes: list类型，是当前页的脚注bbox
+    """
+    footnote_imgs_blocks = []
+    for image_block in image_blocks:
+        if any([__bbox_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes]):
+            footnote_imgs_blocks.append(image_block)
+    for footnote_imgs_block in footnote_imgs_blocks:
+        image_blocks.remove(footnote_imgs_block)
+    return image_blocks, footnote_imgs_blocks
+def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, page_no_bboxs, page_w, page_h):
+    """
+    删除页眉页脚，页码
+    从line级别进行删除，删除之后观察这个text-block是否是空的，如果是空的，则移动到remove_list中
+    """
+    header = []
+    footer = []
+    if len(header)==0:
+        model_header = header_bboxs
+        if model_header:
+            x0 = min([x for x,_,_,_ in model_header])
+            y0 = min([y for _,y,_,_ in model_header])
+            x1 = max([x1 for _,_,x1,_ in model_header])
+            y1 = max([y1 for _,_,_,y1 in model_header])
+            header = [x0, y0, x1, y1]
+    if len(footer)==0:
+        model_footer = footer_bboxs
+        if model_footer:
+            x0 = min([x for x,_,_,_ in model_footer])
+            y0 = min([y for _,y,_,_ in model_footer])
+            x1 = max([x1 for _,_,x1,_ in model_footer])
+            y1 = max([y1 for _,_,_,y1 in model_footer])
+            footer = [x0, y0, x1, y1]
+    header_y0 = 0 if len(header) == 0 else header[3]
+    footer_y0 = page_h if len(footer) == 0 else footer[1]
+    if page_no_bboxs:
+        top_part = [b for b in page_no_bboxs if b[3] < page_h/2]
+        btn_part = [b for b in page_no_bboxs if b[1] > page_h/2]
+        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
+        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
+        header_y0 = max(header_y0, top_max_y0)
+        footer_y0 = min(footer_y0, btn_min_y1)
+    content_boundry = [0, header_y0, page_w, footer_y0]
+    header = [0,0, page_w, header_y0]
+    footer = [0, footer_y0, page_w, page_h]
+    """以上计算出来了页眉页脚的边界，下面开始进行删除"""
+    text_block_to_remove = []
+    # 首先检查每个textblock
+    for blk in text_raw_blocks:
+        if len(blk['lines']) > 0:
+            for line in blk['lines']:
+                line_del = []
+                for span in line['spans']:
+                    span_del = []
+                    if span['bbox'][3] < header_y0:
+                        span_del.append(span)
+                    elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer):
+                        span_del.append(span)
+                for span in span_del:
+                    line['spans'].remove(span)
+                if not line['spans']:
+                    line_del.append(line)
+            for line in line_del:
+                blk['lines'].remove(line)
+        else:
+        # if not blk['lines']:
+            blk['tag'] = 'in-foot-header-area'
+            text_block_to_remove.append(blk)
+    """有的时候由于pageNo太小了，总是会有一点和content_boundry重叠一点，被放入正文，因此对于pageNo，进行span粒度的删除"""
+    page_no_block_2_remove = []
+    if page_no_bboxs:
+        for pagenobox in page_no_bboxs:
+            for block in text_raw_blocks:
+                if _is_in_or_part_overlap(pagenobox, block['bbox']): # 在span级别删除页码
+                    for line in block['lines']:
+                        for span in line['spans']:
+                            if _is_in_or_part_overlap(pagenobox, span['bbox']):
+                                #span['text'] = ''
+                                span['tag'] = "page-no"
+                                # 检查这个block是否只有这一个span，如果是，那么就把这个block也删除
+                                if len(line['spans']) == 1 and len(block['lines'])==1:
+                                    page_no_block_2_remove.append(block)
+    else:
+        # 测试最后一个是不是页码：规则是，最后一个block仅有1个line,一个span,且text是数字，空格，符号组成，不含字母,并且包含数字
+        if len(text_raw_blocks) > 0:
+            text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
+            last_block = text_raw_blocks[0]
+            if len(last_block['lines']) == 1:
+                last_line = last_block['lines'][0]
+                if len(last_line['spans']) == 1:
+                    last_span = last_line['spans'][0]
+                    if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', last_span['text']):
+                        last_span['tag'] = "page-no"
+                        page_no_block_2_remove.append(last_block)
+    for b in page_no_block_2_remove:
+        text_block_to_remove.append(b)
+    for blk in text_block_to_remove:
+        if blk in text_raw_blocks:
+            text_raw_blocks.remove(blk)
+    text_block_remain = text_raw_blocks
+    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
+    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
+    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
+    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
+    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
--- a/pdf2text_recogHeader.py
+++ b/pdf2text_recogHeader.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #--------- 通过json_from_DocXchain来获取 header ---------#
+    header_bbox_from_DocXChain = []
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    # {0: 'title',  # 标题
+    # 1: 'figure', # 图片
+    #  2: 'plain text',  # 文本
+    #  3: 'header',      # 页眉
+    #  4: 'page number', # 页码
+    #  5: 'footnote',    # 脚注
+    #  6: 'footer',      # 页脚
+    #  7: 'table',       # 表格
+    #  8: 'table caption',  # 表格描述
+    #  9: 'figure caption', # 图片描述
+    #  10: 'equation',      # 公式
+    #  11: 'full column',   # 单栏
+    #  12: 'sub column',    # 多栏
+    #  13: 'embedding',     # 嵌入公式
+    #  14: 'isolated'}      # 单行公式
+    for xf in xf_json['layout_dets']:
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 3 and xf['score'] >= 0.3:
+            header_bbox_from_DocXChain.append((L, U, R, D))
+    header_final_names = []
+    header_final_bboxs = []
+    header_ID = 0
+    for L, U, R, D in header_bbox_from_DocXChain:
+        # cur_header = page.get_pixmap(clip=(L,U,R,D))
+        new_header_name = "header_{}_{}.png".format(page_ID, header_ID)    # 页眉name
+        # cur_header.save(res_dir_path + '/' + new_header_name)           # 把页眉存储在新建的文件夹，并命名
+        header_final_names.append(new_header_name)                        # 把页面的名字存在list中
+        header_final_bboxs.append((L, U, R, D))
+        header_ID += 1
+    header_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    curPage_all_header_bboxs = header_final_bboxs
+    return curPage_all_header_bboxs
--- a/pdf2text_recogPageNo.py
+++ b/pdf2text_recogPageNo.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #--------- 通过json_from_DocXchain来获取 pageNo ---------#
+    pageNo_bbox_from_DocXChain = []
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    # {0: 'title',  # 标题
+    # 1: 'figure', # 图片
+    #  2: 'plain text',  # 文本
+    #  3: 'header',      # 页眉
+    #  4: 'page number', # 页码
+    #  5: 'footnote',    # 脚注
+    #  6: 'footer',      # 页脚
+    #  7: 'table',       # 表格
+    #  8: 'table caption',  # 表格描述
+    #  9: 'figure caption', # 图片描述
+    #  10: 'equation',      # 公式
+    #  11: 'full column',   # 单栏
+    #  12: 'sub column',    # 多栏
+    #  13: 'embedding',     # 嵌入公式
+    #  14: 'isolated'}      # 单行公式
+    for xf in xf_json['layout_dets']:
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 4 and xf['score'] >= 0.3:
+            pageNo_bbox_from_DocXChain.append((L, U, R, D))
+    pageNo_final_names = []
+    pageNo_final_bboxs = []
+    pageNo_ID = 0
+    for L, U, R, D in pageNo_bbox_from_DocXChain:
+        # cur_pageNo = page.get_pixmap(clip=(L,U,R,D))
+        new_pageNo_name = "pageNo_{}_{}.png".format(page_ID, pageNo_ID)    # 页码name
+        # cur_pageNo.save(res_dir_path + '/' + new_pageNo_name)           # 把页码存储在新建的文件夹，并命名
+        pageNo_final_names.append(new_pageNo_name)                        # 把页码的名字存在list中
+        pageNo_final_bboxs.append((L, U, R, D))
+        pageNo_ID += 1
+    pageNo_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    curPage_all_pageNo_bboxs = pageNo_final_bboxs
+    return curPage_all_pageNo_bboxs
--- a/pdf2text_recogPara.py
+++ b/pdf2text_recogPara.py
+import os
+import sys
+import json
+import re
+import math
+import unicodedata
+from collections import Counter
+import numpy as np
+from termcolor import cprint
+from libs.commons import fitz
+from libs.nlp_utils import NLPModels
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+def open_pdf(pdf_path):
+    try:
+        pdf_document = fitz.open(pdf_path)  # type: ignore
+        return pdf_document
+    except Exception as e:
+        print(f"无法打开PDF文件：{pdf_path}。原因是：{e}")
+        raise e
+def print_green_on_red(text):
+    """Print `text` via cprint in bold green on a red background, ending with two newlines."""
+    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
+def print_green(text):
+    """Print an empty line, then `text` via cprint in bold green, ending with two newlines."""
+    print()
+    cprint(text, "green", attrs=["bold"], end="\n\n")
+def print_red(text):
+    """Print an empty line, then `text` via cprint in bold red, ending with two newlines."""
+    print()
+    cprint(text, "red", attrs=["bold"], end="\n\n")
+def print_yellow(text):
+    """Print an empty line, then `text` via cprint in bold yellow, ending with two newlines."""
+    print()
+    cprint(text, "yellow", attrs=["bold"], end="\n\n")
+def safe_get(dict_obj, key, default):
+    val = dict_obj.get(key)
+    if val is None:
+        return default
+    else:
+        return val
+def is_bbox_overlap(bbox1, bbox2):
+    """
+    This function checks if bbox1 and bbox2 overlap or not
+    Parameters
+    ----------
+    bbox1 : list
+        bbox1
+    bbox2 : list
+        bbox2
+    Returns
+    -------
+    bool
+        True if bbox1 and bbox2 overlap, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+    if x0_1 > x1_2 or x0_2 > x1_1:
+        return False
+    if y0_1 > y1_2 or y0_2 > y1_1:
+        return False
+    return True
+def is_in_bbox(bbox1, bbox2):
+    """
+    This function checks if bbox1 is in bbox2
+    Parameters
+    ----------
+    bbox1 : list
+        bbox1
+    bbox2 : list
+        bbox2
+    Returns
+    -------
+    bool
+        True if bbox1 is in bbox2, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+    if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
+        return True
+    else:
+        return False
+def calculate_para_bbox(lines):
+    """
+    This function calculates the minimum bbox of the paragraph
+    Parameters
+    ----------
+    lines : list
+        lines
+    Returns
+    -------
+    para_bbox : list
+        bbox of the paragraph
+    """
+    x0 = min(line["bbox"][0] for line in lines)
+    y0 = min(line["bbox"][1] for line in lines)
+    x1 = max(line["bbox"][2] for line in lines)
+    y1 = max(line["bbox"][3] for line in lines)
+    return [x0, y0, x1, y1]
+def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    This function checks if the line is right aligned from its neighbors
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line
+    next_line_bbox : list
+        bbox of the next line
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 for prev, 1 for next, 2 for both
+    Returns
+    -------
+    bool
+        True if the line is right aligned from its neighbors, False otherwise.
+    """
+    horizontal_ratio = 0.5
+    horizontal_thres = horizontal_ratio * avg_char_width
+    _, _, x1, _ = curr_line_bbox
+    _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+    if direction == 0:
+        return abs(x1 - prev_x1) < horizontal_thres
+    elif direction == 1:
+        return abs(x1 - next_x1) < horizontal_thres
+    elif direction == 2:
+        return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
+    else:
+        return False
+def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    This function checks if the line is left aligned from its neighbors
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line
+    next_line_bbox : list
+        bbox of the next line
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 for prev, 1 for next, 2 for both
+    Returns
+    -------
+    bool
+        True if the line is left aligned from its neighbors, False otherwise.
+    """
+    horizontal_ratio = 0.5
+    horizontal_thres = horizontal_ratio * avg_char_width
+    x0, _, _, _ = curr_line_bbox
+    prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+    if direction == 0:
+        return abs(x0 - prev_x0) < horizontal_thres
+    elif direction == 1:
+        return abs(x0 - next_x0) < horizontal_thres
+    elif direction == 2:
+        return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
+    else:
+        return False
+def end_with_punctuation(line_text):
+    """
+    This function checks if the line ends with punctuation marks
+    """
+    english_end_puncs = [".", "?", "!"]
+    chinese_end_puncs = ["。", "？", "！"]
+    end_puncs = english_end_puncs + chinese_end_puncs
+    last_non_space_char = None
+    for ch in line_text[::-1]:
+        if not ch.isspace():
+            last_non_space_char = ch
+            break
+    if last_non_space_char is None:
+        return False
+    return last_non_space_char in end_puncs
+def is_nested_list(lst):
+    if isinstance(lst, list):
+        return any(isinstance(sub, list) for sub in lst)
+    return False
+class DenseSingleLineBlockException(Exception):
+    """
+    This class defines the exception type for dense single line-block.
+    """
+    def __init__(self, message="DenseSingleLineBlockException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class TitleDetectionException(Exception):
+    """
+    This class defines the exception type for title detection.
+    """
+    def __init__(self, message="TitleDetectionException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class TitleLevelException(Exception):
+    """
+    This class defines the exception type for title level.
+    """
+    def __init__(self, message="TitleLevelException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class ParaSplitException(Exception):
+    """
+    This class defines the exception type for paragraph splitting.
+    """
+    def __init__(self, message="ParaSplitException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class ParaMergeException(Exception):
+    """
+    This class defines the exception type for paragraph merging.
+    """
+    def __init__(self, message="ParaMergeException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class DiscardByException:
+    """
+    This class discards pdf files by exception
+    """
+    def __init__(self) -> None:
+        pass
+    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
+        """
+        This function discards pdf files by single line block exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        exception_page_nums = 0
+        page_num = 0
+        for page_id, page in pdf_dic.items():
+            if page_id.startswith("page_"):
+                page_num += 1
+                if "preproc_blocks" in page.keys():
+                    preproc_blocks = page["preproc_blocks"]
+                    all_single_line_blocks = []
+                    for block in preproc_blocks:
+                        if len(block["lines"]) == 1:
+                            all_single_line_blocks.append(block)
+                    if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
+                        exception_page_nums += 1
+        if page_num == 0:
+            return None
+        if exception_page_nums / page_num > 0.1:  # Low ratio means basically, whenever this is the case, it is discarded
+            return exception.message
+        return None
+    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
+        """
+        This function discards pdf files by title detection exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
+        """
+        This function discards pdf files by title level exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
+        """
+        This function discards pdf files by split para exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
+        """
+        This function discards pdf files by merge para exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+class LayoutFilterProcessor:
+    def __init__(self) -> None:
+        pass
+    def batch_process_blocks(self, pdf_dict):
+        """
+        This function processes the blocks in batch.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        pdf_dict : dict
+            pdf dictionary
+        Returns
+        -------
+        pdf_dict : dict
+            pdf dictionary
+        """
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
+                    layout_bbox_objs = blocks["layout_bboxes"]
+                    if layout_bbox_objs is None:
+                        continue
+                    layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
+                    # Enlarge each value of x0, y0, x1, y1 for each layout_bbox to prevent loss of text.
+                    layout_bboxes = [
+                        [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
+                    ]
+                    para_blocks = blocks["para_blocks"]
+                    if para_blocks is None:
+                        continue
+                    for lb_bbox in layout_bboxes:
+                        for i, para_block in enumerate(para_blocks):
+                            para_bbox = para_block["bbox"]
+                            para_blocks[i]["in_layout"] = 0
+                            if is_in_bbox(para_bbox, lb_bbox):
+                                para_blocks[i]["in_layout"] = 1
+                    blocks["para_blocks"] = para_blocks
+        return pdf_dict
+class RawBlockProcessor:
+    def __init__(self) -> None:
+        self.y_tolerance = 2
+        self.pdf_dic = {}
+    def __span_flags_decomposer(self, span_flags):
+        """
+        Make font flags human readable.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        span_flags : int
+            span flags
+        Returns
+        -------
+        l : dict
+            decomposed flags
+        """
+        l = {
+            "is_superscript": False,
+            "is_italic": False,
+            "is_serifed": False,
+            "is_sans_serifed": False,
+            "is_monospaced": False,
+            "is_proportional": False,
+            "is_bold": False,
+        }
+        if span_flags & 2**0:
+            l["is_superscript"] = True  # 表示上标
+        if span_flags & 2**1:
+            l["is_italic"] = True  # 表示斜体
+        if span_flags & 2**2:
+            l["is_serifed"] = True  # 表示衬线字体
+        else:
+            l["is_sans_serifed"] = True  # 表示非衬线字体
+        if span_flags & 2**3:
+            l["is_monospaced"] = True  # 表示等宽字体
+        else:
+            l["is_proportional"] = True  # 表示比例字体
+        if span_flags & 2**4:
+            l["is_bold"] = True  # 表示粗体
+        return l
+    def __make_new_lines(self, raw_lines):
+        """
+        This function makes new lines.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        raw_lines : list
+            raw lines
+        Returns
+        -------
+        new_lines : list
+            new lines
+        """
+        new_lines = []
+        new_line = None
+        for raw_line in raw_lines:
+            raw_line_bbox = raw_line["bbox"]
+            raw_line_spans = raw_line["spans"]
+            raw_line_text = "".join([span["text"] for span in raw_line_spans])
+            raw_line_dir = raw_line.get("dir", None)
+            decomposed_line_spans = []
+            for span in raw_line_spans:
+                raw_flags = span["flags"]
+                decomposed_flags = self.__span_flags_decomposer(raw_flags)
+                span["decomposed_flags"] = decomposed_flags
+                decomposed_line_spans.append(span)
+            if new_line is None:  # Handle the first line
+                new_line = {
+                    "bbox": raw_line_bbox,
+                    "text": raw_line_text,
+                    "dir": raw_line_dir if raw_line_dir else (0, 0),
+                    "spans": decomposed_line_spans,
+                }
+            else:  # Handle the rest lines
+                if (
+                    abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
+                    and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
+                ):
+                    new_line["bbox"] = (
+                        min(new_line["bbox"][0], raw_line_bbox[0]),  # left
+                        new_line["bbox"][1],  # top
+                        max(new_line["bbox"][2], raw_line_bbox[2]),  # right
+                        raw_line_bbox[3],  # bottom
+                    )
+                    new_line["text"] += raw_line_text
+                    new_line["spans"].extend(raw_line_spans)
+                    new_line["dir"] = (
+                        new_line["dir"][0] + raw_line_dir[0],
+                        new_line["dir"][1] + raw_line_dir[1],
+                    )
+                else:
+                    new_lines.append(new_line)
+                    new_line = {
+                        "bbox": raw_line_bbox,
+                        "text": raw_line_text,
+                        "dir": raw_line_dir if raw_line_dir else (0, 0),
+                        "spans": raw_line_spans,
+                    }
+        if new_line:
+            new_lines.append(new_line)
+        return new_lines
+    def __make_new_block(self, raw_block):
+        """
+        This function makes a new block.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        ----------
+        raw_block : dict
+            a raw block
+        Returns
+        -------
+        new_block : dict
+        """
+        new_block = {}
+        block_id = raw_block["number"]
+        block_bbox = raw_block["bbox"]
+        block_text = "".join(span["text"] for line in raw_block["lines"] for span in line["spans"])
+        raw_lines = raw_block["lines"]
+        block_lines = self.__make_new_lines(raw_lines)
+        new_block["block_id"] = block_id
+        new_block["bbox"] = block_bbox
+        new_block["text"] = block_text
+        new_block["lines"] = block_lines
+        return new_block
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        ----------
+        blocks : list
+            Input block is a list of raw blocks.
+        Returns
+        -------
+        result_dict : dict
+            result dictionary
+        """
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "preproc_blocks" in blocks.keys():
+                    input_blocks = blocks["preproc_blocks"]
+                    for raw_block in input_blocks:
+                        new_block = self.__make_new_block(raw_block)
+                        para_blocks.append(new_block)
+                blocks["para_blocks"] = para_blocks
+        return pdf_dic
+class BlockStatisticsCalculator:
+    """
+    This class calculates the statistics of the block.
+    """
+    def __init__(self) -> None:
+        pass
+    def __calc_stats_of_new_lines(self, new_lines):
+        """
+        This function calculates the paragraph metrics
+        Parameters
+        ----------
+        combined_lines : list
+            combined lines
+        Returns
+        -------
+        X0 : float
+            Median of x0 values, which represents the left average boundary of the block
+        X1 : float
+            Median of x1 values, which represents the right average boundary of the block
+        avg_char_width : float
+            Average of char widths, which represents the average char width of the block
+        avg_char_height : float
+            Average of line heights, which represents the average line height of the block
+        """
+        x0_values = []
+        x1_values = []
+        char_widths = []
+        char_heights = []
+        block_font_types = []
+        block_font_sizes = []
+        block_directions = []
+        if len(new_lines) > 0:
+            for i, line in enumerate(new_lines):
+                line_bbox = line["bbox"]
+                line_text = line["text"]
+                line_spans = line["spans"]
+                num_chars = len([ch for ch in line_text if not ch.isspace()])
+                x0_values.append(line_bbox[0])
+                x1_values.append(line_bbox[2])
+                if num_chars > 0:
+                    char_width = (line_bbox[2] - line_bbox[0]) / num_chars
+                    char_widths.append(char_width)
+                for span in line_spans:
+                    block_font_types.append(span["font"])
+                    block_font_sizes.append(span["size"])
+                if "dir" in line:
+                    block_directions.append(line["dir"])
+                # line_font_types = [span["font"] for span in line_spans]
+                char_heights = [span["size"] for span in line_spans]
+        X0 = np.median(x0_values) if x0_values else 0
+        X1 = np.median(x1_values) if x1_values else 0
+        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
+        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
+        # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
+        max_span_length = 0
+        max_span_font_type = None
+        for line in new_lines:
+            line_spans = line["spans"]
+            for span in line_spans:
+                span_length = span["bbox"][2] - span["bbox"][0]
+                if span_length > max_span_length:
+                    max_span_length = span_length
+                    max_span_font_type = span["font"]
+        max_freq_font_type = max_span_font_type
+        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
+        avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0
+        avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0
+        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
+        return (
+            X0,
+            X1,
+            avg_char_width,
+            avg_char_height,
+            max_freq_font_type,
+            avg_font_size,
+            (avg_dir_horizontal, avg_dir_vertical),
+            median_font_size,
+        )
+    def __make_new_block(self, input_block):
+        new_block = {}
+        raw_lines = input_block["lines"]
+        stats = self.__calc_stats_of_new_lines(raw_lines)
+        block_id = input_block["block_id"]
+        block_bbox = input_block["bbox"]
+        block_text = input_block["text"]
+        block_lines = raw_lines
+        block_avg_left_boundary = stats[0]
+        block_avg_right_boundary = stats[1]
+        block_avg_char_width = stats[2]
+        block_avg_char_height = stats[3]
+        block_font_type = stats[4]
+        block_font_size = stats[5]
+        block_direction = stats[6]
+        block_median_font_size = stats[7]
+        new_block["block_id"] = block_id
+        new_block["bbox"] = block_bbox
+        new_block["text"] = block_text
+        new_block["dir"] = block_direction
+        new_block["X0"] = block_avg_left_boundary
+        new_block["X1"] = block_avg_right_boundary
+        new_block["avg_char_width"] = block_avg_char_width
+        new_block["avg_char_height"] = block_avg_char_height
+        new_block["block_font_type"] = block_font_type
+        new_block["block_font_size"] = block_font_size
+        new_block["lines"] = block_lines
+        new_block["median_font_size"] = block_median_font_size
+        return new_block
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        ----------
+        blocks : list
+            Input block is a list of raw blocks.
+            Schema can refer to the value of key ""preproc_blocks".
+        Returns
+        -------
+        result_dict : dict
+            result dictionary
+        """
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "para_blocks" in blocks.keys():
+                    input_blocks = blocks["para_blocks"]
+                    for input_block in input_blocks:
+                        new_block = self.__make_new_block(input_block)
+                        para_blocks.append(new_block)
+                blocks["para_blocks"] = para_blocks
+        return pdf_dic
+class DocStatisticsCalculator:
+    """
+    This class calculates the statistics of the document.
+    """
+    def __init__(self) -> None:
+        pass
+    def calc_stats_of_doc(self, pdf_dict):
+        """
+        This function computes the statistics of the document
+        Parameters
+        ----------
+        result_dict : dict
+            result dictionary
+        Returns
+        -------
+        statistics : dict
+            statistics of the document
+        """
+        total_text_length = 0
+        total_num_blocks = 0
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "para_blocks" in blocks.keys():
+                    para_blocks = blocks["para_blocks"]
+                    for para_block in para_blocks:
+                        total_text_length += len(para_block["text"])
+                        total_num_blocks += 1
+        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
+        font_list = []
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "para_blocks" in blocks.keys():
+                    input_blocks = blocks["para_blocks"]
+                    for input_block in input_blocks:
+                        block_text_length = len(input_block.get("text", ""))
+                        if block_text_length < avg_text_length * 0.5:
+                            continue
+                        block_font_type = safe_get(input_block, "block_font_type", "")
+                        block_font_size = safe_get(input_block, "block_font_size", 0)
+                        font_list.append((block_font_type, block_font_size))
+        font_counter = Counter(font_list)
+        most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
+        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
+        statistics = {
+            "num_pages": 0,
+            "num_blocks": 0,
+            "num_paras": 0,
+            "num_titles": 0,
+            "num_header_blocks": 0,
+            "num_footer_blocks": 0,
+            "num_watermark_blocks": 0,
+            "num_vertical_margin_note_blocks": 0,
+            "most_common_font_type": most_common_font[0][0],
+            "most_common_font_size": most_common_font[0][1],
+            "number_of_most_common_font": most_common_font[1],
+            "second_most_common_font_type": second_most_common_font[0][0],
+            "second_most_common_font_size": second_most_common_font[0][1],
+            "number_of_second_most_common_font": second_most_common_font[1],
+            "avg_text_length": avg_text_length,
+        }
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                blocks = pdf_dict[page_id]["para_blocks"]
+                statistics["num_pages"] += 1
+                for block_id, block_data in enumerate(blocks):
+                    statistics["num_blocks"] += 1
+                    if "paras" in block_data.keys():
+                        statistics["num_paras"] += len(block_data["paras"])
+                    for line in block_data["lines"]:
+                        if line.get("is_title", 0):
+                            statistics["num_titles"] += 1
+                    if block_data.get("is_header", 0):
+                        statistics["num_header_blocks"] += 1
+                    if block_data.get("is_footer", 0):
+                        statistics["num_footer_blocks"] += 1
+                    if block_data.get("is_watermark", 0):
+                        statistics["num_watermark_blocks"] += 1
+                    if block_data.get("is_vertical_margin_note", 0):
+                        statistics["num_vertical_margin_note_blocks"] += 1
+        pdf_dict["statistics"] = statistics
+        return pdf_dict
+class TitleProcessor:
+    """
+    This class processes the title.
+    """
+    def __init__(self, *doc_statistics) -> None:
+        if len(doc_statistics) > 0:
+            self.doc_statistics = doc_statistics[0]
+        self.nlp_model = NLPModels()
+        self.MAX_TITLE_LEVEL = 3
+        self.numbered_title_pattern = r"""
+            ^                                 # 行首
+            (                                 # 开始捕获组
+                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
+                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
+                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
+                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
+                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
+                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
+                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
+                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
+                |[\（|\(][一二三四五六七八九十百千]+[\）|\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
+                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
+                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
+                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
+                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
+                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
+                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
+                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
+                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
+            )                                 # 结束捕获组
+            .+                                # 标题的其余部分
+        """
+    def _is_potential_title(
+        self,
+        curr_line,
+        prev_line,
+        prev_line_is_title,
+        next_line,
+        avg_char_width,
+        avg_char_height,
+        median_font_size,
+    ):
+        """
+        This function checks if the line is a potential title.
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        avg_char_width : float
+            average of char widths
+        avg_char_height : float
+            average of line heights
+        Returns
+        -------
+        bool
+            True if the line is a potential title, False otherwise.
+        """
+        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
+            """
+            This function checks if the line is centered on the page
+            Parameters
+            ----------
+            line_bbox : list
+                bbox of the line
+            page_bbox : list
+                bbox of the page
+            avg_char_width : float
+                average of char widths
+            Returns
+            -------
+            bool
+                True if the line is centered on the page, False otherwise.
+            """
+            horizontal_ratio = 0.5
+            horizontal_thres = horizontal_ratio * avg_char_width
+            x0, _, x1, _ = line_bbox
+            _, _, page_x1, _ = page_bbox
+            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
+        def __is_bold_font_line(line):
+            """
+            Check if a line contains any bold font style.
+            """
+            def _is_bold_span(span):
+                # if span text is empty or only contains space, return False
+                if not span["text"].strip():
+                    return False
+                return bool(span["flags"] & 2**4)  # Check if the font is bold
+            for span in line["spans"]:
+                if not _is_bold_span(span):
+                    return False
+            return True
+        def __is_italic_font_line(line):
+            """
+            Check if a line contains any italic font style.
+            """
+            def __is_italic_span(span):
+                return bool(span["flags"] & 2**1)  # Check if the font is italic
+            for span in line["spans"]:
+                if not __is_italic_span(span):
+                    return False
+            return True
+        def __is_punctuation_heavy(line_text):
+            """
+            Check if the line contains a high ratio of punctuation marks, which may indicate
+            that the line is not a title.
+            Parameters:
+            line_text (str): Text of the line.
+            Returns:
+            bool: True if the line is heavy with punctuation, False otherwise.
+            """
+            # Pattern for common title format like "X.Y. Title"
+            pattern = r"\b\d+\.\d+\..*\b"
+            # If the line matches the title format, return False
+            if re.match(pattern, line_text.strip()):
+                return False
+            # Find all punctuation marks in the line
+            punctuation_marks = re.findall(r"[^\w\s]", line_text)
+            number_of_punctuation_marks = len(punctuation_marks)
+            text_length = len(line_text)
+            if text_length == 0:
+                return False
+            punctuation_ratio = number_of_punctuation_marks / text_length
+            if punctuation_ratio >= 0.1:
+                return True
+            return False
+        def __has_mixed_font_styles(spans, strict_mode=False):
+            """
+            This function checks if the line has mixed font styles, the strict mode will compare the font types
+            Parameters
+            ----------
+            spans : list
+                spans of the line
+            strict_mode : bool
+                True for strict mode, the font types will be fully compared
+                False for non-strict mode, the font types will be compared by the most longest common prefix
+            Returns
+            -------
+            bool
+                True if the line has mixed font styles, False otherwise.
+            """
+            if strict_mode:
+                font_styles = set()
+                for span in spans:
+                    font_style = span["font"].lower()
+                    font_styles.add(font_style)
+                return len(font_styles) > 1
+            else:  # non-strict mode
+                font_styles = []
+                for span in spans:
+                    font_style = span["font"].lower()
+                    font_styles.append(font_style)
+                if len(font_styles) > 1:
+                    longest_common_prefix = os.path.commonprefix(font_styles)
+                    if len(longest_common_prefix) > 0:
+                        return False
+                    else:
+                        return True
+                else:
+                    return False
+        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
+            """
+            This function checks if the current line has a different font type from the previous and next lines
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line
+            prev_line_font_type : str
+                font type of the previous line
+            next_line_font_type : str
+                font type of the next line
+            Returns
+            -------
+            bool
+                True if the current line has a different font type from the previous and next lines, False otherwise.
+            """
+            return all(
+                curr_line_font_type != other_font_type.lower()
+                for other_font_type in [prev_line_font_type, next_line_font_type]
+                if other_font_type is not None
+            )
+        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
+            """
+            This function checks if the current line has a larger font size than the previous and next lines
+            Parameters
+            ----------
+            curr_line_font_size : float
+                font size of the current line
+            prev_line_font_size : float
+                font size of the previous line
+            next_line_font_size : float
+                font size of the next line
+            Returns
+            -------
+            bool
+                True if the current line has a larger font size than the previous and next lines, False otherwise.
+            """
+            return all(
+                curr_line_font_size > other_font_size * 1.2
+                for other_font_size in [prev_line_font_size, next_line_font_size]
+                if other_font_size is not None
+            )
+        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
+            """
+            This function checks if the current line is similar to the previous line
+            Parameters
+            ----------
+            curr_line : dict
+                current line
+            prev_line : dict
+                previous line
+            Returns
+            -------
+            bool
+                True if the current line is similar to the previous line, False otherwise.
+            """
+            if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
+                return True
+            else:
+                return False
+        def __is_same_font_type_of_docAvg(curr_line_font_type):
+            """
+            This function checks if the current line has the same font type as the document average font type
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line
+            Returns
+            -------
+            bool
+                True if the current line has the same font type as the document average font type, False otherwise.
+            """
+            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
+            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
+            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
+        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
+            """
+            Compare a line's font size against the document's reference font size scaled by a ratio.
+            The reference size is the smaller of "most_common_font_size" and "second_most_common_font_size"
+            from self.doc_statistics; a missing statistic counts as 0.
+            Parameters
+            ----------
+            curr_line_font_size : float
+                font size of the current line
+            ratio : float
+                factor applied to the reference font size before comparing
+            Returns
+            -------
+            bool
+                True if the line's font size is at least reference size * ratio, False otherwise.
+            """
+            reference_font_size = min(
+                safe_get(self.doc_statistics, "most_common_font_size", 0),
+                safe_get(self.doc_statistics, "second_most_common_font_size", 0),
+            )
+            return curr_line_font_size >= reference_font_size * ratio
+        def __is_sufficient_spacing_above_and_below(
+            curr_line_bbox,
+            prev_line_bbox,
+            next_line_bbox,
+            avg_char_height,
+            median_font_size,
+        ):
+            """
+            Measure whether the current line is vertically separated from its neighbours.
+            The required gap is 1.25 times median_font_size. A side with no neighbouring line counts as
+            sufficiently spaced.
+            Parameters
+            ----------
+            curr_line_bbox : list
+                bbox of the current line, four values of which the 2nd and 4th (y0, y1) are used
+            prev_line_bbox : list
+                bbox of the previous line, or a falsy value when there is none
+            next_line_bbox : list
+                bbox of the next line, or a falsy value when there is none
+            avg_char_height : float
+                accepted for interface compatibility; not used by the computation
+            median_font_size : float
+                font size the spacing threshold is derived from
+            Returns
+            -------
+            tuple of bool
+                (sufficient spacing above, sufficient spacing below)
+            """
+            min_gap = 1.25 * median_font_size
+            _x_left, top, _x_right, bottom = curr_line_bbox
+            # Each gap is the smaller of the y0-to-y0 and y1-to-y1 distances between the two lines.
+            above_ok = True
+            if prev_line_bbox:
+                gap_above = min(top - prev_line_bbox[1], bottom - prev_line_bbox[3])
+                above_ok = gap_above > min_gap
+            below_ok = True
+            if next_line_bbox:
+                gap_below = min(next_line_bbox[1] - top, next_line_bbox[3] - bottom)
+                below_ok = gap_below > min_gap
+            return (above_ok, below_ok)
+        def __is_word_list_line_by_rules(curr_line_text):
+            """
+            Rule-based check for name-like items in a line.
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the pattern is found anywhere in the text, False otherwise.
+            """
+            # A hit is either two capitalised Latin words joined by one whitespace character or a run of
+            # 2-6 CJK characters. It must not directly follow a CJK character and must be followed by a
+            # comma/semicolon (ASCII or full-width), whitespace, or the end of the text.
+            word_list_regex = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
+            return re.search(word_list_regex, curr_line_text) is not None
+        def __get_text_catgr_by_nlp(curr_line_text):
+            """
+            Ask the NLP model for the entity category of a line's text.
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+            Returns
+            -------
+            object
+                Whatever self.nlp_model.detect_entity_catgr_using_nlp returns for the text, unchanged.
+                NOTE(review): presumably an entity label, since the result is compared against
+                "PERSON", "GPE" and "ORG" -- confirm against NLPModels.
+            """
+            return self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
+        def __is_numbered_title(curr_line_text):
+            """
+            Check whether the line's text matches the numbered-title pattern.
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if self.numbered_title_pattern (applied in verbose mode) is found in the text,
+                False otherwise.
+            """
+            numbering_match = re.search(self.numbered_title_pattern, curr_line_text, flags=re.VERBOSE)
+            return numbering_match is not None
+        def __is_end_with_ending_puncs(line_text):
+            """
+            Check whether a line ends with a sentence-ending punctuation mark.
+            Trailing whitespace is ignored. An empty or whitespace-only text has no last character and is
+            reported as not ending with punctuation instead of raising IndexError.
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the last non-whitespace character is one of . ? ! 。 ？ ！ …, False otherwise.
+            """
+            end_puncs = (".", "?", "!", "。", "？", "！", "…")
+            # str.endswith accepts a tuple of suffixes and returns False for an empty string.
+            return line_text.rstrip().endswith(end_puncs)
+        def __contains_only_no_meaning_symbols(line_text):
+            """
+            Detect lines that consist almost entirely of symbols, which should not become titles.
+            A symbol is any character that is neither a word character nor whitespace. Whitespace still
+            counts toward the text length used as the denominator.
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if at least 90% of the characters are symbols, False otherwise (including for an
+                empty text).
+            """
+            symbol_count = len(re.findall(r"[^\w\s]", line_text))
+            total_chars = len(line_text)
+            if total_chars == 0:
+                return False
+            return symbol_count / total_chars >= 0.9
+        def __is_equation(line_text):
+            """
+            Detect an interline equation in the line's text.
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the text contains a "$...$" span with "\\overline" inside it, False otherwise.
+            """
+            interline_equation_regex = r"\$.*?\\overline.*?\$"
+            return bool(re.search(interline_equation_regex, line_text))
+        def __is_title_by_len(text, max_length=200):
+            """
+            Length gate for title candidates.
+            Parameters
+            ----------
+            text : str
+                text of the current line
+            max_length : int
+                largest length, measured after stripping surrounding whitespace, a title may have
+            Returns
+            -------
+            bool
+                True if the stripped text is no longer than max_length, False otherwise.
+            """
+            stripped_length = len(text.strip())
+            return stripped_length <= max_length
+        def __compute_line_font_type_and_size(curr_line):
+            """
+            Pick a representative font type and size for a line: those of its widest non-blank span.
+            Parameters
+            ----------
+            curr_line : dict
+                line whose "spans" each provide "text", "bbox", "size" and "font"
+            Returns
+            -------
+            font_type : str
+                lower-cased font type of the chosen span
+            font_size : float
+                font size of the chosen span
+            """
+            spans = curr_line["spans"]
+            # The first span supplies the fallback, used when no non-blank span is wider than zero.
+            first_span = spans[0]
+            font_size = first_span["size"]
+            font_type = first_span["font"].lower()
+            widest_so_far = 0
+            for span in spans:
+                if not span["text"].isspace():
+                    # Width is the horizontal extent of the span's bbox (x1 - x0).
+                    width = span["bbox"][2] - span["bbox"][0]
+                    if width > widest_so_far:
+                        widest_so_far, font_size, font_type = width, span["size"], span["font"].lower()
+            return font_type, font_size
+        def __is_a_consistent_sub_title(pre_line, curr_line):
+            """
+            Check whether the current line looks like a sub title consistent with the previous line.
+            The line qualifies when both lines start with the same digit, or when the current line starts
+            with an "x.x" style number (digits, a dot, digits).
+            Parameters
+            ----------
+            pre_line : dict
+                previous line, or None when there is none
+            curr_line : dict
+                current line
+            Returns
+            -------
+            bool
+                True if the current line is a consistent sub title, False otherwise. Always a real bool,
+                and False when there is no previous line.
+            """
+            if pre_line is None:
+                return False
+            # Slicing instead of indexing keeps an empty text from raising IndexError; "".isdigit() is False.
+            pre_first_char = pre_line["text"][:1]
+            curr_first_char = curr_line["text"][:1]
+            has_same_prefix_digit = pre_first_char.isdigit() and pre_first_char == curr_first_char
+            # prefix text of curr_line satisfies the following title format: x.x
+            has_subtitle_format = re.match(r"^\d+\.\d+", curr_line["text"]) is not None
+            return has_same_prefix_digit or has_subtitle_format
+        """
+        Title detecting main Process.
+        """
+        """
+        Basic features about the current line.
+        """
+        curr_line_bbox = curr_line["bbox"]
+        curr_line_text = curr_line["text"]
+        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
+        if len(curr_line_text.strip()) == 0:  # skip empty lines
+            return False, False
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        if prev_line:
+            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
+        else:
+            prev_line_font_type, prev_line_font_size = None, None
+        next_line_bbox = next_line["bbox"] if next_line else None
+        if next_line:
+            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
+        else:
+            next_line_font_type, next_line_font_size = None, None
+        """
+        Aggregated features about the current line.
+        """
+        is_italc_font = __is_italic_font_line(curr_line)
+        is_bold_font = __is_bold_font_line(curr_line)
+        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
+        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
+        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
+        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
+        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
+        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
+        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
+        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
+        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
+        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
+        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
+            curr_line_font_size, prev_line_font_size, next_line_font_size
+        )
+        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
+            curr_line_font_type, prev_line_font_type, next_line_font_type
+        )
+        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
+        )
+        is_similar_to_pre_line = __is_similar_to_pre_line(
+            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
+        )
+        is_consis_sub_title = __is_a_consistent_sub_title(prev_line, curr_line)
+        """
+        Further aggregated features about the current line.
+        Attention:
+            Features that start with __ are for internal use.
+        """
+        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
+        )
+        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
+        is_a_left_inline_title = (
+            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
+        )
+        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
+        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
+        is_title_by_check_pre_and_next_line = (
+            (prev_line is not None or next_line is not None)
+            and has_sufficient_spaces_above
+            and has_sufficient_spaces_below
+            and is_potential_title_font
+        )
+        is_numbered_title = __is_numbered_title(curr_line_text) and (
+            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
+        )
+        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
+        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
+        is_equation = __is_equation(curr_line_text)
+        is_title_by_len = __is_title_by_len(curr_line_text)
+        """
+        Decide if the line is a title.
+        """
+        is_title = (
+            is_not_end_with_ending_puncs  # not end with ending punctuation marks
+            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
+            and is_title_by_len  # is a title by length, default max length is 200
+            and not is_equation  # an interline equation should never be a title
+            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
+            and (
+                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
+                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                or (
+                    is_much_larger_font_than_doc_avg
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+                or (
+                    is_font_size_little_less_than_doc_avg
+                    and is_bold_font
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+            )  # Consider the following situations: bold font, much larger font than doc avg, not same font type as doc avg, sufficient spacing above and below
+            and (
+                (
+                    not is_person_or_org_list_line_by_nlp
+                    and (
+                        is_much_larger_font_than_doc_avg
+                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
+                    )
+                )
+                or (
+                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
+                    and not is_a_left_inline_title
+                    and not is_punctuation_heavy
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+                or (
+                    is_person_or_org_list_line_by_nlp
+                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                )
+                or (is_numbered_title and not is_a_left_inline_title)
+            )  # Exclude the following situations: person/org list
+        )
+        # ) or (prev_line_is_title and is_consis_sub_title)
+        is_name_or_org_list_to_be_removed = (
+            (is_person_or_org_list_line_by_nlp)
+            and is_punctuation_heavy
+            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+        ) and not is_title
+        if is_name_or_org_list_to_be_removed:
+            is_author_or_org_list = True
+        else:
+            is_author_or_org_list = False
+        # return is_title, is_author_or_org_list
+        """
+        # print reason why the line is a title
+        if is_title:
+            print_green("This line is a title.")
+            print_green("↓" * 10)
+            print()
+            print("curr_line_text: ", curr_line_text)
+            print()
+        # print reason why the line is not a title
+        line_text = curr_line_text.strip()
+        test_text = "Career/Personal Life"
+        text_content_condition = line_text == test_text
+        if not is_title and text_content_condition: # Print specific line
+        # if not is_title: # Print each line
+            print_red("This line is not a title.")
+            print_red("↓" * 10)
+            print()
+            print("curr_line_text: ", curr_line_text)
+            print()
+            if is_not_end_with_ending_puncs:
+                print_green(f"is_not_end_with_ending_puncs")
+            else:
+                print_red(f"is_end_with_ending_puncs")
+            if is_not_only_no_meaning_symbols:
+                print_green(f"is_not_only_no_meaning_symbols")
+            else:
+                print_red(f"is_only_no_meaning_symbols")
+            if is_title_by_len:
+                print_green(f"is_title_by_len: {is_title_by_len}")
+            else:
+                print_red(f"is_not_title_by_len: {is_title_by_len}")
+            if is_equation:
+                print_red(f"is_equation")
+            else:
+                print_green(f"is_not_equation")
+            if is_potential_title_font:
+                print_green(f"is_potential_title_font")
+            else:
+                print_red(f"is_not_potential_title_font")
+            if is_punctuation_heavy:
+                print_red("is_punctuation_heavy")
+            else:
+                print_green("is_not_punctuation_heavy")
+            if is_bold_font:
+                print_green(f"is_bold_font")
+            else:
+                print_red(f"is_not_bold_font")
+            if is_font_size_not_less_than_doc_avg:
+                print_green(f"is_larger_font_than_doc_avg")
+            else:
+                print_red(f"is_not_larger_font_than_doc_avg")
+            if is_much_larger_font_than_doc_avg:
+                print_green(f"is_much_larger_font_than_doc_avg")
+            else:
+                print_red(f"is_not_much_larger_font_than_doc_avg")
+            if is_not_same_font_type_of_docAvg:
+                print_green(f"is_not_same_font_type_of_docAvg")
+            else:
+                print_red(f"is_same_font_type_of_docAvg")
+            if is_word_list_line_by_rules:
+                print_red("is_word_list_line_by_rules")
+            else:
+                print_green("is_not_name_list_by_rules")
+            if is_person_or_org_list_line_by_nlp:
+                print_red("is_person_or_org_list_line_by_nlp")
+            else:
+                print_green("is_not_person_or_org_list_line_by_nlp")
+            if not is_numbered_title:
+                print_red("is_not_numbered_title")
+            else:
+                print_green("is_numbered_title")
+            if is_a_left_inline_title:
+                print_red("is_a_left_inline_title")
+            else:
+                print_green("is_not_a_left_inline_title")
+            if not is_title_by_check_prev_line:
+                print_red("is_not_title_by_check_prev_line")
+            else:
+                print_green("is_title_by_check_prev_line")
+            if not is_title_by_check_next_line:
+                print_red("is_not_title_by_check_next_line")
+            else:
+                print_green("is_title_by_check_next_line")
+            if not is_title_by_check_pre_and_next_line:
+                print_red("is_not_title_by_check_pre_and_next_line")
+            else:
+                print_green("is_title_by_check_pre_and_next_line")
+        # print_green("Common features:")
+        # print_green("↓" * 10)
+        # print(f"    curr_line_font_type: {curr_line_font_type}")
+        # print(f"    curr_line_font_size: {curr_line_font_size}")
+        # print()
+        """
+        return is_title, is_author_or_org_list
+    def _detect_title(self, input_block):
+        """
+        Run self._is_potential_title on every line of a paragraph block and record the verdicts.
+        Each line receives two keys: "is_title" and "is_author_or_org_list". A falsy verdict is stored as
+        False; a truthy verdict is stored as returned. Whether the preceding line was judged a title is
+        passed along to the next call.
+        Parameters
+        ----------
+        input_block : dict
+            block providing "lines", "avg_char_width", "avg_char_height" and "median_font_size"
+        Returns
+        -------
+        input_block : dict
+            the same block, with its lines updated in place
+        """
+        lines = input_block["lines"]
+        last_index = len(lines) - 1
+        prev_was_title = False
+        for index, line in enumerate(lines):
+            line_before = lines[index - 1] if index > 0 else None
+            line_after = lines[index + 1] if index < last_index else None
+            title_verdict, author_list_verdict = self._is_potential_title(
+                line,
+                line_before,
+                prev_was_title,
+                line_after,
+                input_block["avg_char_width"],
+                input_block["avg_char_height"],
+                input_block["median_font_size"],
+            )
+            line["is_title"] = title_verdict if title_verdict else False
+            prev_was_title = bool(title_verdict)
+            line["is_author_or_org_list"] = author_list_verdict if author_list_verdict else False
+        return input_block
+    def batch_detect_titles(self, pdf_dic):
+        """
+        Detect title lines on every page and flag blocks that are titles or author/organization lists.
+        For each entry whose key starts with "page_", line-level detection (self._detect_title) runs on the
+        page's "para_blocks" unless every block on the page has exactly one line; in that case detection is
+        skipped and the blocks are kept as they are (NOTE(review): the reason is not stated in the code).
+        Every block then receives "is_block_title" and "is_block_an_author_or_org_list" (0 or 1), and the
+        number of lines with a truthy "is_title" is stored in pdf_dic["statistics"]["num_titles"].
+        A block without any line is never flagged: all() over an empty sequence is True, which would
+        otherwise mark empty blocks as titles and author lists.
+        Parameters
+        ----------
+        pdf_dic : dict
+            result dictionary
+        Returns
+        -------
+        pdf_dic : dict
+            the same result dictionary, updated in place
+        """
+        num_titles = 0
+        for page_id, blocks in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+            para_blocks = []
+            if "para_blocks" in blocks:
+                para_blocks = blocks["para_blocks"]
+                all_single_line = all(len(block["lines"]) == 1 for block in para_blocks)
+                if all_single_line:
+                    para_blocks = list(para_blocks)
+                else:
+                    para_blocks = [self._detect_title(block) for block in para_blocks]
+                num_titles += sum(line.get("is_title", 0) for block in para_blocks for line in block["lines"])
+            # A page without "para_blocks" gets an empty list under that key.
+            blocks["para_blocks"] = para_blocks
+            for para_block in para_blocks:
+                lines = para_block["lines"]
+                has_lines = bool(lines)
+                all_titles = has_lines and all(safe_get(line, "is_title", False) for line in lines)
+                para_text_len = sum(len(line["text"]) for line in lines)
+                # total length of the paragraph must be less than 200, more than this should not be a title
+                para_block["is_block_title"] = 1 if all_titles and para_text_len < 200 else 0
+                all_author_or_org = has_lines and all(
+                    safe_get(line, "is_author_or_org_list", False) for line in lines
+                )
+                # Author/organization lists are only flagged on the first page.
+                is_author_block = all_author_or_org and page_id == "page_0"
+                para_block["is_block_an_author_or_org_list"] = 1 if is_author_block else 0
+        pdf_dic["statistics"]["num_titles"] = num_titles
+        return pdf_dic
+    def _recog_title_level(self, title_blocks):
+        """
+        Assign a title level (1, 2 or 3; 1 is the biggest) to every title block from its font size.
+        With mean and std taken over all "block_font_size" values (a missing value counts as 0):
+        a size of at least mean + std gives level 1, a size of at most mean - std gives level 3, and a size
+        in between gives level 2 when it reaches the threshold, else level 3. The threshold is the mean of
+        the sizes lying strictly between the two bounds, or the overall mean when there are none.
+        Parameters
+        ----------
+        title_blocks : list
+            entries whose "block" value is the title block to annotate
+        Returns
+        -------
+        title_blocks : list
+            the same list; each block gains the key "block_title_level"
+        """
+        sizes = np.array([safe_get(entry["block"], "block_font_size", 0) for entry in title_blocks])
+        # One standard deviation around the mean separates the extreme sizes from the middle ones.
+        size_mean = np.mean(sizes)
+        size_std = np.std(sizes)
+        lower_bound = size_mean - size_std  # type: ignore
+        upper_bound = size_mean + size_std  # type: ignore
+        middle_sizes = sizes[(sizes > lower_bound) & (sizes < upper_bound)]
+        threshold = np.mean(middle_sizes) if middle_sizes.size > 0 else size_mean
+        def level_for(font_size):
+            if font_size >= upper_bound:
+                return 1
+            if font_size <= lower_bound:
+                return 3
+            return 2 if float(font_size) >= float(threshold) else 3
+        for entry in title_blocks:
+            block = entry["block"]
+            block["block_title_level"] = level_for(safe_get(block, "block_font_size", 0))
+        return title_blocks
+    def batch_recog_title_level(self, pdf_dic):
+        """
+        Collect every block flagged as a title across all pages and assign title levels to them.
+        Parameters
+        ----------
+        pdf_dic : dict
+            result dictionary; pages are the entries whose key starts with "page_"
+        Returns
+        -------
+        pdf_dic : dict
+            the same result dictionary; title blocks are annotated in place by self._recog_title_level
+        """
+        # Gather all blocks with a truthy "is_block_title", remembering the page each came from.
+        title_blocks = [
+            {"page_id": page_id, "block": block}
+            for page_id, blocks in pdf_dic.items()
+            if page_id.startswith("page_")
+            for block in blocks.get("para_blocks", [])
+            if block.get("is_block_title")
+        ]
+        # Levels are only computed when at least one title was found.
+        if title_blocks:
+            self._recog_title_level(title_blocks)
+        return pdf_dic
+class BlockTerminationProcessor:
+    """
+    This class is used to process the block termination.
+    """
+    def __init__(self) -> None:
+        """Create the processor; no attributes are initialized here."""
+    def _is_consistent_lines(
+        self,
+        curr_line,
+        prev_line,
+        next_line,
+        consistent_direction,  # 0 for prev, 1 for next, 2 for both
+    ):
+        """
+        Compare the font of a line with the font of its neighbours.
+        Only the first span of each line is inspected: two lines are consistent when their first spans
+        have equal "size" and equal lower-cased "font".
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line, or a falsy value when there is none
+        next_line : dict
+            next line, or a falsy value when there is none
+        consistent_direction : int
+            0 to compare with the previous line, 1 with the next line, 2 with both
+        Returns
+        -------
+        bool
+            True if the line is consistent with the requested neighbour(s); False when a required
+            neighbour is missing or the direction is not 0, 1 or 2.
+        """
+        def first_span_style(line):
+            lead_span = line["spans"][0]
+            return lead_span["size"], lead_span["font"].lower()
+        def same_style(style_a, style_b):
+            return style_a[0] == style_b[0] and style_a[1] == style_b[1]
+        curr_style = first_span_style(curr_line)
+        if consistent_direction == 0:
+            if not prev_line:
+                return False
+            return same_style(curr_style, first_span_style(prev_line))
+        if consistent_direction == 1:
+            if not next_line:
+                return False
+            return same_style(curr_style, first_span_style(next_line))
+        if consistent_direction == 2:
+            if not (prev_line and next_line):
+                return False
+            prev_style = first_span_style(prev_line)
+            next_style = first_span_style(next_line)
+            return same_style(curr_style, prev_style) and same_style(curr_style, next_style)
+        return False
+    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
+        """
+        Evaluate spacing and alignment signals for a line and report whether any of them fires.
+        The result is True when at least one of the following holds:
+        - the distance to the previous line (bottom edge to bottom edge) or to the next line (top edge to
+          top edge) exceeds half of avg_line_height;
+        - neither the left edge is within half of avg_char_width of X0 nor the right edge within half of
+          avg_char_width of X1;
+        - the previous line's right edge differs from X1 by more than avg_char_width.
+        NOTE(review): these conditions read like paragraph-boundary signals rather than a "regular" line;
+        confirm the intended meaning of the name with the callers.
+        Parameters
+        ----------
+        curr_line_bbox : list
+            bbox of the current line
+        prev_line_bbox : list
+            bbox of the previous line, or a falsy value when there is none
+        next_line_bbox : list
+            bbox of the next line, or a falsy value when there is none
+        avg_char_width : float
+            average of char widths
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_line_height : float
+            average of line heights
+        Returns
+        -------
+        bool
+            True if any of the signals above fires, False otherwise. Always a real bool, also when there
+            is no previous line.
+        """
+        horizontal_thres = 0.5 * avg_char_width
+        vertical_thres = 0.5 * avg_line_height
+        x0, y0, x1, y1 = curr_line_bbox
+        x0_near_X0 = abs(x0 - X0) < horizontal_thres
+        x1_near_X1 = abs(x1 - X1) < horizontal_thres
+        has_prev = bool(prev_line_bbox)
+        has_next = bool(next_line_bbox)
+        # Guarding with a real bool keeps None / [] from leaking into the returned value.
+        prev_line_is_end_of_para = has_prev and abs(prev_line_bbox[2] - X1) > avg_char_width
+        sufficient_spacing_above = has_prev and (y1 - prev_line_bbox[3]) > vertical_thres
+        sufficient_spacing_below = has_next and (next_line_bbox[1] - y0) > vertical_thres
+        return bool(
+            (sufficient_spacing_above or sufficient_spacing_below)
+            or (not x0_near_X0 and not x1_near_X1)
+            or prev_line_is_end_of_para
+        )
+    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
+        """
+        This function checks if the line is a possible start of a paragraph
+        The line counts as a start when it has enough vertical distance to a
+        neighbouring line, is indented, spans the full width without being a
+        regular line, or follows a line that ends a paragraph.
+        Parameters
+        ----------
+        curr_line : dict
+            current line, read through its "bbox" entry
+        prev_line : dict or None
+            previous line, None for the first line
+        next_line : dict or None
+            next line, None for the last line
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_char_width : float
+            average of char widths
+        avg_font_size : float
+            average font size; scales the vertical spacing threshold and is
+            forwarded to `_is_regular_line` as its line height argument
+        Returns
+        -------
+        tuple
+            (is_start_of_para, start_confidence, decision_path): the bool verdict,
+            the accumulated confidence score and the names of the conditions that fired.
+        """
+        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
+        decision_path = []  # Names of the conditions that fired, in evaluation order
+        curr_line_bbox = curr_line["bbox"]
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        next_line_bbox = next_line["bbox"] if next_line else None
+        indent_ratio = 1
+        vertical_ratio = 1.5
+        vertical_thres = vertical_ratio * avg_font_size
+        left_horizontal_thres = 0.5 * avg_char_width
+        right_horizontal_thres = 2.5 * avg_char_width
+        x0, y0, x1, y1 = curr_line_bbox
+        indent_condition = x0 > X0 + indent_ratio * avg_char_width
+        if indent_condition:
+            start_confidence += 0.2
+            decision_path.append("indent_condition_met")
+        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
+        if x0_near_X0:
+            start_confidence += 0.1
+            decision_path.append("x0_near_X0")
+        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
+        if x1_near_X1:
+            start_confidence += 0.1
+            decision_path.append("x1_near_X1")
+        if prev_line is None:
+            prev_line_is_end_of_para = True
+            start_confidence += 0.2
+            decision_path.append("no_prev_line")
+        else:
+            # The line that follows prev_line is curr_line, so curr_line (not next_line)
+            # is the neighbour the end-of-paragraph test has to look at.
+            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, curr_line, X0, X1, avg_char_width)
+            if prev_line_is_end_of_para:
+                start_confidence += 0.1
+                decision_path.append("prev_line_is_end_of_para")
+        sufficient_spacing_above = False
+        if prev_line_bbox:
+            sufficient_spacing_above = (y1 - prev_line_bbox[3]) > vertical_thres
+            if sufficient_spacing_above:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_above")
+        sufficient_spacing_below = False
+        if next_line_bbox:
+            sufficient_spacing_below = (next_line_bbox[1] - y0) > vertical_thres
+            if sufficient_spacing_below:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_below")
+        is_regular_line = self._is_regular_line(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
+        )
+        if is_regular_line:
+            start_confidence += 0.1
+            decision_path.append("is_regular_line")
+        is_start_of_para = bool(
+            sufficient_spacing_above
+            or sufficient_spacing_below
+            or indent_condition
+            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
+            or prev_line_is_end_of_para
+        )
+        return (is_start_of_para, start_confidence, decision_path)
+    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
+        """
+        Estimate whether `curr_line` closes a paragraph.
+        The verdict requires the line text to end with punctuation (as judged by
+        `end_with_punctuation`) and, in addition, either a line that starts near
+        X0 but stops short of X1, or a line that is left-aligned but not
+        right-aligned with its neighbour.
+        Parameters
+        ----------
+        curr_line : dict
+            current line, read through its "bbox" and "text" entries
+        next_line : dict or None
+            next line, falsy if there is none
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_char_width : float
+            average of char widths
+        Returns
+        -------
+        tuple
+            (is_end_of_para, end_confidence, decision_path): the verdict, the
+            accumulated confidence score and the names of the conditions that fired.
+        """
+        line_box = curr_line["bbox"]
+        following_box = next_line["bbox"] if next_line else None
+        # Left and right tolerance are both half an average character width
+        edge_tolerance = 0.5 * avg_char_width
+        left_edge, _, right_edge, _ = line_box
+        following_left_edge, _, _, _ = following_box or (0, 0, 0, 0)
+        starts_near_left = abs(left_edge - X0) < edge_tolerance
+        stops_before_right = right_edge < X1 - edge_tolerance
+        following_opens_para = (
+            following_box
+            and (following_left_edge > X0 + edge_tolerance)
+            and (not is_line_left_aligned_from_neighbors(line_box, None, following_box, avg_char_width, direction=1))
+        )
+        left_aligned = is_line_left_aligned_from_neighbors(line_box, None, following_box, avg_char_width)
+        right_aligned = is_line_right_aligned_from_neighbors(line_box, None, following_box, avg_char_width)
+        # Every condition that holds adds its weight to the score and its name to the path
+        scored_conditions = (
+            (starts_near_left, 0.1, "x0_near_X0"),
+            (stops_before_right, 0.1, "x1_smaller_than_X1"),
+            (following_opens_para, 0.2, "next_line_is_start_of_para"),
+            (left_aligned, 0.1, "line_is_left_aligned_from_neighbors"),
+            (not right_aligned, 0.1, "line_is_not_right_aligned_from_neighbors"),
+        )
+        end_confidence = 0.5  # Initial confidence of the line being an end of a paragraph
+        decision_path = []
+        for holds, weight, label in scored_conditions:
+            if holds:
+                end_confidence += weight
+                decision_path.append(label)
+        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
+            (starts_near_left and stops_before_right) or (left_aligned and not right_aligned)
+        )
+        return (is_end_of_para, end_confidence, decision_path)
+    def _cut_paras_per_block(
+        self,
+        block,
+    ):
+        """
+        Split the lines of one block into paragraphs and store them in block["paras"].
+        Lines are grouped from a detected paragraph start up to the next detected
+        paragraph end (or the last line of the block). Lines that do not belong
+        to any detected range, whether they sit between two ranges or after the
+        last one, are kept as paragraphs of their own so that no line is lost.
+        Parameters
+        ----------
+        block : dict
+            Block to cut (presumably derived from PyMuPDF output). Reads "lines"
+            and, through safe_get, "X0", "X1", "avg_char_width", "avg_font_size",
+            "is_block_title" and "block_title_level".
+        Returns
+        -------
+        block : dict
+            The same dict with an added "paras" entry that maps "para_<n>" keys
+            to paragraph property dicts.
+        """
+        def _construct_para(lines, is_block_title, para_title_level):
+            """
+            Construct a paragraph from given lines.
+            """
+            spans = [span for line in lines for span in line["spans"]]
+            font_sizes = [span["size"] for span in spans]
+            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
+            font_colors = [span["color"] for span in spans]
+            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
+            # Weight every font type by the total bbox width of the spans that use it
+            font_type_lengths = {}
+            for span in spans:
+                bbox_width = span["bbox"][2] - span["bbox"][0]
+                font_type_lengths[span["font"]] = font_type_lengths.get(span["font"], 0) + bbox_width
+            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
+            return {
+                "para_bbox": calculate_para_bbox(lines),
+                "para_text": " ".join(line["text"] for line in lines),
+                "para_font_type": most_common_font_type,
+                "para_font_size": avg_font_size,
+                "para_font_color": most_common_font_color,
+                "is_para_title": is_block_title,
+                "para_title_level": para_title_level,
+            }
+        block_lines = block["lines"]
+        X0 = safe_get(block, "X0", 0)
+        X1 = safe_get(block, "X1", 0)
+        avg_char_width = safe_get(block, "avg_char_width", 0)
+        avg_font_size = safe_get(block, "avg_font_size", 0)
+        is_block_title = safe_get(block, "is_block_title", False)
+        para_title_level = safe_get(block, "block_title_level", 0)
+        # Pass 1: collect the inclusive (start, end) line index range of every paragraph
+        para_ranges = []
+        start_idx_of_para = None
+        last_line_index = len(block_lines) - 1
+        for line_index, curr_line in enumerate(block_lines):
+            prev_line = block_lines[line_index - 1] if line_index > 0 else None
+            next_line = block_lines[line_index + 1] if line_index < last_line_index else None
+            if start_idx_of_para is None:
+                is_start_of_para, _, _ = self._is_possible_start_of_para(
+                    curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
+                )
+                if not is_start_of_para:
+                    continue
+                start_idx_of_para = line_index
+            is_end_of_para, _, _ = self._is_possible_end_of_para(curr_line, next_line, X0, X1, avg_char_width)
+            # The last line always closes an open paragraph, so none stays open after the loop
+            if is_end_of_para or not next_line:
+                para_ranges.append((start_idx_of_para, line_index))
+                start_idx_of_para = None
+        # Pass 2: build the paragraphs in line order, including the lines no range covers
+        processed_paras = {}
+        def _add_para(lines):
+            processed_paras[f"para_{len(processed_paras)}"] = _construct_para(lines, is_block_title, para_title_level)
+        next_unassigned_idx = 0
+        for start_idx, end_idx in para_ranges:
+            if start_idx > next_unassigned_idx:
+                # Lines skipped between two detected paragraphs
+                _add_para(block_lines[next_unassigned_idx:start_idx])
+            _add_para(block_lines[start_idx : end_idx + 1])
+            next_unassigned_idx = end_idx + 1
+        if next_unassigned_idx < len(block_lines):
+            # Lines left over after the last detected paragraph
+            _add_para(block_lines[next_unassigned_idx:])
+        block["paras"] = processed_paras
+        return block
+    def batch_process_blocks(self, pdf_dict):
+        """
+        Cut the blocks of every page into paragraphs.
+        Every entry whose key starts with "page_" gets its "para_blocks" replaced
+        by the blocks returned from `_cut_paras_per_block`; a page without
+        "para_blocks" receives an empty list. The total number of paragraphs is
+        written to pdf_dict["statistics"]["num_paras"].
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        Returns
+        -------
+        result_dict : dict
+            The same dictionary, updated in place.
+        """
+        total_paras = 0
+        for page_key, page_content in pdf_dict.items():
+            if not page_key.startswith("page_"):
+                continue
+            if "para_blocks" in page_content:
+                cut_blocks = [self._cut_paras_per_block(raw_block) for raw_block in page_content["para_blocks"]]
+            else:
+                cut_blocks = []
+            total_paras += sum(len(cut_block["paras"]) for cut_block in cut_blocks)
+            page_content["para_blocks"] = cut_blocks
+        pdf_dict["statistics"]["num_paras"] = total_paras
+        return pdf_dict
+class BlockContinuationProcessor:
+    """
+    This class is used to process the blocks to detect block continuations.
+    """
+    def __init__(self) -> None:
+        """Create the processor; no instance attributes are set."""
+        return None
+    def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3):
+        """
+        Tell whether two font type names are similar.
+        Two names are similar when they are equal, or when their common prefix
+        is at least `prefix_length_ratio` times as long as the shorter name
+        (the required length is rounded down).
+        Parameters
+        ----------
+        font_type_1 : str or list
+            font type 1; a list is represented by its first entry, "" if it is empty
+        font_type_2 : str or list
+            font type 2; a list is represented by its first entry, "" if it is empty
+        prefix_length_ratio : float
+            minimum ratio of the common prefix length to the length of the shorter font type
+        Returns
+        -------
+        bool
+            True if the two font types are similar, False otherwise.
+        """
+        names = []
+        for candidate in (font_type_1, font_type_2):
+            if isinstance(candidate, list):
+                candidate = candidate[0] if candidate else ""
+            names.append(candidate)
+        first_name, second_name = names
+        if first_name == second_name:
+            return True
+        # os.path.commonprefix compares character by character, which is what is wanted here
+        shared_length = len(os.path.commonprefix(names))
+        shorter_length = min(len(first_name), len(second_name))
+        required_length = int(shorter_length * prefix_length_ratio)
+        return shared_length >= required_length
+    def __is_same_block_font(self, block_1, block_2):
+        """
+        Compare the font of two blocks.
+        The blocks match when their font types are similar, their font sizes
+        differ by less than 1 and their average char widths are relatively close.
+        Blocks with a zero average char width or without text never match.
+        Parameters
+        ----------
+        block_1 : dict
+            first block
+        block_2 : dict
+            second block
+        Returns
+        -------
+        is_same : bool
+            True if block_1 and block_2 have the same font, else False
+        """
+        font_type_1 = safe_get(block_1, "block_font_type", "")
+        font_size_1 = safe_get(block_1, "block_font_size", 0)
+        char_width_1 = safe_get(block_1, "avg_char_width", 0)
+        font_type_2 = safe_get(block_2, "block_font_type", "")
+        font_size_2 = safe_get(block_2, "block_font_size", 0)
+        char_width_2 = safe_get(block_2, "avg_char_width", 0)
+        # A list of font sizes is represented by its first entry, 0 if it is empty
+        if isinstance(font_size_1, list):
+            font_size_1 = font_size_1[0] if font_size_1 else 0
+        if isinstance(font_size_2, list):
+            font_size_2 = font_size_2[0] if font_size_2 else 0
+        text_1 = safe_get(block_1, "text", "")
+        text_2 = safe_get(block_2, "text", "")
+        if char_width_1 == 0 or char_width_2 == 0:
+            return False
+        if not text_1 or not text_2:
+            return False
+        # A second block much shorter than the first gets a looser char width tolerance
+        width_tolerance = 0.5 if len(text_2) / len(text_1) < 0.2 else 0.2
+        relative_width_gap = abs(char_width_1 - char_width_2) / min(char_width_1, char_width_2)
+        widths_match = relative_width_gap < width_tolerance
+        sizes_match = abs(font_size_1 - font_size_2) < 1
+        return self.__is_similar_font_type(font_type_1, font_type_2) and widths_match and sizes_match
+    def _is_alphabet_char(self, char):
+        """Return True if `char` compares within "A".."Z" or "a".."z", else False."""
+        is_upper_latin = "\u0041" <= char <= "\u005a"
+        is_lower_latin = "\u0061" <= char <= "\u007a"
+        return is_upper_latin or is_lower_latin
+    def _is_chinese_char(self, char):
+        """Return True if `char` compares within U+4E00..U+9FA5, else False."""
+        return "\u4e00" <= char <= "\u9fa5"
+    def _is_other_letter_char(self, char):
+        """
+        Return True for an upper or lower case letter (Unicode category "Lu" or
+        "Ll") that is neither a Latin alphabet char nor a Chinese char.
+        Input that unicodedata rejects with a TypeError prints a message and yields False.
+        """
+        try:
+            category = unicodedata.category(char)
+        except TypeError:
+            print("The input to the function must be a single character.")
+            return False
+        if category not in ("Lu", "Ll"):
+            return False
+        return not (self._is_alphabet_char(char) or self._is_chinese_char(char))
+    def _is_year(self, s: str):
+        """Return True if `s` parses as an integer between 1900 and 2099 inclusive, else False."""
+        try:
+            value = int(s)
+        except ValueError:
+            return False
+        return 1900 <= value <= 2099
+    def _match_brackets(self, text):
+        """
+        Return True if `text` starts with one of the bracket or quote characters
+        in the pattern below. Opening square and angle style brackets are not part of the set.
+        """
+        leading_bracket_or_quote = r"^[\(\)\]（）】{}｛｝>＞〕〙\"\'“”‘’]"
+        return re.match(leading_bracket_or_quote, text) is not None
+    def _is_para_font_consistent(self, para_1, para_2):
+        """
+        Compare the font of two paragraphs.
+        The paragraphs are consistent when their font types are similar and
+        their font sizes differ by less than 1.5. The font color is read but
+        does not take part in the decision.
+        Parameters
+        ----------
+        para_1 : dict or None
+            first paragraph
+        para_2 : dict or None
+            second paragraph
+        Returns
+        -------
+        is_same : bool
+            True if para_1 and para_2 have the same font, else False.
+            False as well when either paragraph is None.
+        """
+        def _most_frequent(value):
+            # A list of font types is represented by its most common entry
+            return max(set(value), key=value.count) if isinstance(value, list) else value
+        def _mean(value):
+            # A list of font sizes is represented by its average
+            return sum(value) / len(value) if isinstance(value, list) else value
+        if para_1 is None or para_2 is None:
+            return False
+        font_type_1 = safe_get(para_1, "para_font_type", "")
+        font_size_1 = safe_get(para_1, "para_font_size", 0)
+        _font_color_1 = safe_get(para_1, "para_font_color", "")
+        font_type_2 = safe_get(para_2, "para_font_type", "")
+        font_size_2 = safe_get(para_2, "para_font_size", 0)
+        _font_color_2 = safe_get(para_2, "para_font_color", "")
+        font_type_1 = _most_frequent(font_type_1)
+        font_type_2 = _most_frequent(font_type_2)
+        font_size_1 = _mean(font_size_1)
+        font_size_2 = _mean(font_size_2)
+        return self.__is_similar_font_type(font_type_1, font_type_2) and abs(font_size_1 - font_size_2) < 1.5
+    def _is_para_puncs_consistent(self, para_1, para_2):
+        """
+        This function determines whether para_1 and para_2 are originally from the same paragraph
+        by checking how para_1 (former) ends and how para_2 (latter) begins.
+        The rule depends on the last character of para_1:
+        a hyphen expects para_2 to start with a hyphen, a lower case Latin letter,
+        a Chinese char or another cased letter; sentence ending punctuation expects
+        a leading space and para_2 not indented past para_1; anything else accepts
+        a Latin letter, a year, a Chinese char, another cased letter, a closing
+        bracket or quote, or a matching right edge / an indented para_1.
+        Parameters
+        ----------
+        para_1 : dict
+            former paragraph; reads "para_text", "para_bbox" and "para_font_size"
+        para_2 : dict
+            latter paragraph; reads "para_text", "para_bbox" and "para_font_size"
+        Returns
+        -------
+        is_same : bool
+            True if para_1 and para_2 are from the same paragraph by using the puncs, else False
+        """
+        para_1_text = safe_get(para_1, "para_text", "").strip()
+        para_2_text = safe_get(para_2, "para_text", "").strip()
+        # Without text on both sides there is nothing to compare; decide before touching the geometry
+        if not para_1_text or not para_2_text:
+            return False
+        para_1_bboxes = safe_get(para_1, "para_bbox", [])
+        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
+        para_2_bboxes = safe_get(para_2, "para_bbox", [])
+        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
+        # para_1 is represented by its last bbox / font size, para_2 by its first
+        if is_nested_list(para_1_bboxes):
+            x0_1, _, x1_1, _ = para_1_bboxes[-1]
+        else:
+            x0_1, _, x1_1, _ = para_1_bboxes
+        if isinstance(para_1_font_sizes, (list, tuple)):
+            para_1_font_sizes = para_1_font_sizes[-1] if para_1_font_sizes else 0
+        if is_nested_list(para_2_bboxes):
+            x0_2, _, x1_2, _ = para_2_bboxes[0]
+        else:
+            x0_2, _, x1_2, _ = para_2_bboxes
+        if isinstance(para_2_font_sizes, (list, tuple)):
+            para_2_font_sizes = para_2_font_sizes[0] if para_2_font_sizes else 0
+        # One tolerance serves the right alignment and both indentation tests
+        alignment_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        are_two_paras_right_aligned = abs(x1_1 - x1_2) < alignment_threshold
+        is_para_1_indented_past_para_2 = x0_1 - x0_2 > alignment_threshold
+        is_para_2_indented_past_para_1 = x0_2 - x0_1 > alignment_threshold
+        # Define the end puncs for a sentence to end and hyphen
+        end_puncs = (".", "?", "!", "。", "？", "！", "…")
+        hyphens = ("-", "—")
+        last_char = para_1_text[-1]
+        first_char = para_2_text[0]
+        # NOTE(review): both texts are stripped above, so the `== " "` tests below cannot
+        # succeed at the moment; they are kept to document the rule. For the same reason
+        # the former "para_1 ends with a space" branch was unreachable and is gone.
+        if last_char in hyphens:
+            return bool(
+                first_char in hyphens
+                or (self._is_alphabet_char(first_char) and first_char.islower())
+                or self._is_chinese_char(first_char)
+                or self._is_other_letter_char(first_char)
+            )
+        if last_char in end_puncs:
+            return first_char == " " and not is_para_2_indented_past_para_1
+        return bool(
+            first_char == " "
+            or self._is_alphabet_char(first_char)
+            or self._is_year(para_2_text[0:4])
+            or are_two_paras_right_aligned
+            or is_para_1_indented_past_para_2
+            or self._is_chinese_char(first_char)
+            or self._is_other_letter_char(first_char)
+            or self._match_brackets(first_char)
+        )
+    def _is_block_consistent(self, block_1, block_2):
+        """
+        Decide whether two blocks originally belong to the same block.
+        The decision is delegated entirely to the block font comparison.
+        Parameters
+        ----------
+        block_1 : dict
+            first block
+        block_2 : dict
+            second block
+        Returns
+        -------
+        is_same : bool
+            True if block_1 and block_2 are from the same block, else False
+        """
+        fonts_match = self.__is_same_block_font(block_1, block_2)
+        return fonts_match
+    def _is_para_continued(self, para_1, para_2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph, else False
+        """
+        is_para_font_consistent = self._is_para_font_consistent(para_1, para_2)
+        is_para_puncs_consistent = self._is_para_puncs_consistent(para_1, para_2)
+        return is_para_font_consistent and is_para_puncs_consistent
+    def _are_boundaries_of_block_consistent(self, block_1, block_2):
+        """
+        Check whether block_1 ends in the same style as block_2 begins.
+        The first span of the last line of block_1 is compared with the first
+        span of the first line of block_2: the font types have to be similar
+        (case-insensitive), the sizes have to differ by less than 1 and the
+        flags have to be equal. The color is read but not compared.
+        Parameters
+        ----------
+        block_1 : dict
+            first block
+        block_2 : dict
+            second block
+        Returns
+        -------
+        is_consistent : bool
+            True if the boundaries of block_1 and block_2 are consistent, else False
+        """
+        def _style_of(span):
+            return span["font"].lower(), span["size"], span["color"], span["flags"]
+        tail_line = block_1["lines"][-1]
+        head_line = block_2["lines"][0]
+        tail_spans = tail_line["spans"]
+        head_spans = head_line["spans"]
+        tail_font, tail_size, _tail_color, tail_flags = _style_of(tail_spans[0])
+        head_font, head_size, _head_color, head_flags = _style_of(head_spans[0])
+        fonts_similar = self.__is_similar_font_type(tail_font, head_font)
+        return fonts_similar and abs(tail_size - head_size) < 1 and tail_flags == head_flags
+    def should_merge_next_para(self, curr_para, next_para):
+        """
+        Tell whether next_para should be merged into curr_para.
+        The answer is the truth value of `_is_para_continued` for the pair.
+        Parameters
+        ----------
+        curr_para : dict
+            The current paragraph.
+        next_para : dict
+            The next paragraph.
+        Returns
+        -------
+        bool
+            True if the next_para should be merged into the curr_para, False otherwise.
+        """
+        return bool(self._is_para_continued(curr_para, next_para))
+    def batch_tag_paras(self, pdf_dict):
+        """
+        This function tags the paragraphs in the pdf_dict.
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        Returns
+        -------
+        pdf_dict : dict
+            PDF dictionary with tagged paragraphs.
+        """
+        the_last_page_id = len(pdf_dict) - 1
+        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
+            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
+                para_blocks_of_curr_page = curr_page_content["para_blocks"]
+                next_page_idx = curr_page_idx + 1
+                next_page_id = f"page_{next_page_idx}"
+                next_page_content = pdf_dict.get(next_page_id, {})
+                for i, current_block in enumerate(para_blocks_of_curr_page):
+                    for para_id, curr_para in current_block["paras"].items():
+                        curr_para["curr_para_location"] = [
+                            curr_page_idx,
+                            current_block["block_id"],
+                            int(para_id.split("_")[-1]),
+                        ]
+                        curr_para["next_para_location"] = None  # 默认设置为None
+                        curr_para["merge_next_para"] = False  # 默认设置为False
+                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
+                    if next_block:
+                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
+                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
+                        next_block_first_para_key = list(next_block["paras"].keys())[0]
+                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
+                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
+                            curr_blk_last_para["next_para_location"] = [
+                                curr_page_idx,
+                                next_block["block_id"],
+                                int(next_block_first_para_key.split("_")[-1]),
+                            ]
+                            curr_blk_last_para["merge_next_para"] = True
+                    else:
+                        # Handle the case where the next block is in a different page
+                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
+                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
+                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
+                            next_page_idx += 1
+                            next_page_id = f"page_{next_page_idx}"
+                            next_page_content = pdf_dict.get(next_page_id, {})
+                        if next_page_content.get("para_blocks", []):
+                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
+                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
+                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
+                                curr_blk_last_para["next_para_location"] = [
+                                    next_page_idx,
+                                    next_page_content["para_blocks"][0]["block_id"],
+                                    int(next_blk_first_para_key.split("_")[-1]),
+                                ]
+                                curr_blk_last_para["merge_next_para"] = True
+        return pdf_dict
+    def find_block_by_id(self, para_blocks, block_id):
+        """
+        This function finds a block by its id.
+        Parameters
+        ----------
+        para_blocks : list
+            List of blocks.
+        block_id : int
+            Id of the block to find.
+        Returns
+        -------
+        block : dict
+            The block with the given id.
+        """
+        for blk_idx, block in enumerate(para_blocks):
+            if block.get("block_id") == block_id:
+                return block
+        return None
+    def batch_merge_paras(self, pdf_dict):
+        """
+        This function merges the paragraphs in the pdf_dict.
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        Returns
+        -------
+        pdf_dict : dict
+            PDF dictionary with merged paragraphs.
+        """
+        for page_id, page_content in pdf_dict.items():
+            if page_id.startswith("page_") and page_content.get("para_blocks", []):
+                para_blocks_of_page = page_content["para_blocks"]
+                for i in range(len(para_blocks_of_page)):
+                    current_block = para_blocks_of_page[i]
+                    paras = current_block["paras"]
+                    for para_id, curr_para in list(paras.items()):
+                        # print(f"current para_id: {para_id}")
+                        # 跳过标题段落
+                        if curr_para.get("is_para_title"):
+                            continue
+                        while curr_para.get("merge_next_para"):
+                            curr_para_location = curr_para.get("curr_para_location")
+                            next_para_location = curr_para.get("next_para_location")
+                            # print(f"curr_para_location: {curr_para_location}, next_para_location: {next_para_location}")
+                            if not next_para_location:
+                                break
+                            if curr_para_location == next_para_location:
+                                # print_red("The next para is in the same block as the current para.")
+                                curr_para["merge_next_para"] = False
+                                break
+                            next_page_idx, next_block_id, next_para_id = next_para_location
+                            next_page_id = f"page_{next_page_idx}"
+                            next_page_content = pdf_dict.get(next_page_id)
+                            if not next_page_content:
+                                break
+                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
+                            if not next_block:
+                                break
+                            next_para = next_block["paras"].get(f"para_{next_para_id}")
+                            if not next_para or next_para.get("is_para_title"):
+                                break
+                            # 合并段落文本
+                            curr_para_text = curr_para.get("para_text", "")
+                            next_para_text = next_para.get("para_text", "")
+                            curr_para["para_text"] = curr_para_text + " " + next_para_text
+                            # 更新 next_para_location
+                            curr_para["next_para_location"] = next_para.get("next_para_location")
+                            # 将下一个段落文本置为空，表示已被合并
+                            next_para["para_text"] = ""
+                            # 更新 merge_next_para 标记
+                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
+        return pdf_dict
+class DrawAnnos:
+    """
+    This class draws annotations on the pdf file
+    ----------------------------------------
+                Color Code
+    ----------------------------------------
+        Red: (1, 0, 0)
+        Green: (0, 1, 0)
+        Blue: (0, 0, 1)
+        Yellow: (1, 1, 0) - mix of red and green
+        Cyan: (0, 1, 1) - mix of green and blue
+        Magenta: (1, 0, 1) - mix of red and blue
+        White: (1, 1, 1) - red, green and blue full intensity
+        Black: (0, 0, 0) - no color component whatsoever
+        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
+        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
+    """
+    def __init__(self) -> None:
+        pass
+    def __is_nested_list(self, lst):
+        """
+        This function returns True if the given list is a nested list of any degree.
+        """
+        if isinstance(lst, list):
+            return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
+        return False
+    def __valid_rect(self, bbox):
+        # Ensure that the rectangle is not empty or invalid
+        if isinstance(bbox[0], list):
+            return False  # It's a nested list, hence it can't be valid rect
+        else:
+            return bbox[0] < bbox[2] and bbox[1] < bbox[3]
+    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
+        """
+        This function draws the nested boxes
+        Parameters
+        ----------
+        page : fitz.Page
+            page
+        nested_bbox : list
+            nested bbox
+        color : tuple
+            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
+        """
+        if self.__is_nested_list(nested_bbox):  # If it's a nested list
+            for bbox in nested_bbox:
+                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
+        elif self.__valid_rect(nested_bbox):  # If valid rectangle
+            para_rect = fitz.Rect(nested_bbox)
+            para_anno = page.add_rect_annot(para_rect)
+            para_anno.set_colors(stroke=color)  # draw with cyan color for combined paragraph
+            para_anno.set_border(width=1)
+            para_anno.update()
+    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
+        """
+        This function draws annotations on the pdf file.
+        Parameters
+        ----------
+        input_pdf_path : str
+            path to the input pdf file
+        pdf_dic : dict
+            pdf dictionary
+        output_pdf_path : str
+            path to the output pdf file
+        pdf_dic : dict
+            pdf dictionary
+        """
+        pdf_doc = open_pdf(input_pdf_path)
+        if pdf_dic is None:
+            pdf_dic = {}
+        if output_pdf_path is None:
+            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
+        for page_id, page in enumerate(pdf_doc):  # type: ignore
+            page_key = f"page_{page_id}"
+            for ele_key, ele_data in pdf_dic[page_key].items():
+                if ele_key == "para_blocks":
+                    para_blocks = ele_data
+                    for para_block in para_blocks:
+                        if "paras" in para_block.keys():
+                            paras = para_block["paras"]
+                            for para_key, para_content in paras.items():
+                                para_bbox = para_content["para_bbox"]
+                                # print(f"para_bbox: {para_bbox}")
+                                # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
+                                if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
+                                    color = (0, 1, 1)
+                                    self.__draw_nested_boxes(
+                                        page, para_bbox, color
+                                    )  # draw with cyan color for combined paragraph
+                                else:
+                                    if self.__valid_rect(para_bbox):
+                                        para_rect = fitz.Rect(para_bbox)
+                                        para_anno = page.add_rect_annot(para_rect)
+                                        para_anno.set_colors(stroke=(0, 1, 0))  # draw with green color for normal paragraph
+                                        para_anno.set_border(width=0.5)
+                                        para_anno.update()
+                                is_para_title = para_content["is_para_title"]
+                                if is_para_title:
+                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
+                                        color = (0, 0, 1)
+                                        self.__draw_nested_boxes(
+                                            page, para_content["para_bbox"], color
+                                        )  # draw with cyan color for combined title
+                                    else:
+                                        if self.__valid_rect(para_content["para_bbox"]):
+                                            para_rect = fitz.Rect(para_content["para_bbox"])
+                                            if self.__valid_rect(para_content["para_bbox"]):
+                                                para_anno = page.add_rect_annot(para_rect)
+                                                para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
+                                                para_anno.set_border(width=0.5)
+                                                para_anno.update()
+        pdf_doc.save(output_pdf_path)
+        pdf_doc.close()
+class ParaProcessPipeline:
+    def __init__(self) -> None:
+        pass
+    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
+        """
+        This function processes the paragraphs, including:
+        1. Read raw input json file into pdf_dic
+        2. Detect and replace equations
+        3. Combine spans into a natural line
+        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        5. Compute statistics for each block
+        6. Detect titles in the document
+        7. Detect paragraphs inside each block
+        8. Divide the level of the titles
+        9. Detect and combine paragraphs from different blocks into one paragraph
+        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
+        11. Draw annotations on the pdf file
+        Parameters
+        ----------
+        pdf_dic_json_fpath : str
+            path to the pdf dictionary json file.
+            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
+        input_pdf_doc : str
+            path to the input pdf file
+        output_pdf_path : str
+            path to the output pdf file
+        Returns
+        -------
+        pdf_dict : dict
+            result dictionary
+        """
+        error_info = None
+        output_json_file = ""
+        output_dir = ""
+        if input_pdf_path is not None:
+            input_pdf_path = os.path.abspath(input_pdf_path)
+            # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
+        if output_pdf_path is not None:
+            output_dir = os.path.dirname(output_pdf_path)
+            output_json_file = f"{output_dir}/pdf_dic.json"
+        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
+            """
+            Save the pdf_dic to a json file
+            """
+            output_pdf_file_name = os.path.basename(output_pdf_path)
+            # output_dir = os.path.dirname(output_pdf_path)
+            output_dir = "\\tmp\\pdf_parse"
+            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
+            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+            if para_debug_mode == "full":
+                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
+                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
+            # Validate the output already exists
+            if not os.path.exists(pdf_dic_json_fpath):
+                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
+                return None
+            else:
+                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
+            return pdf_dic_json_fpath
+        """
+        Preprocess the lines of block
+        """
+        # Combine spans into a natural line
+        rawBlockProcessor = RawBlockProcessor()
+        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
+        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
+        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        layoutFilter = LayoutFilterProcessor()
+        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
+        # Compute statistics for each block
+        blockStatisticsCalculator = BlockStatisticsCalculator()
+        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
+        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
+        # Compute statistics for all blocks(namely this pdf document)
+        docStatisticsCalculator = DocStatisticsCalculator()
+        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
+        # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
+        # Dump the first three stages of pdf_dic to a json file
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
+        """
+        Detect titles in the document
+        """
+        doc_statistics = pdf_dic["statistics"]
+        titleProcessor = TitleProcessor(doc_statistics)
+        pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
+        """
+        Detect and divide the level of the titles
+        """
+        titleProcessor = TitleProcessor()
+        pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
+        """
+        Detect and split paragraphs inside each block
+        """
+        blockInnerParasProcessor = BlockTerminationProcessor()
+        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
+        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
+        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
+        """
+        Detect and combine paragraphs from different blocks into one paragraph
+        """
+        blockContinuationProcessor = BlockContinuationProcessor()
+        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
+        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
+        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
+        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
+        """
+        Discard pdf files by checking exceptions and return the error info to the caller
+        """
+        discardByException = DiscardByException()
+        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
+            pdf_dic, exception=DenseSingleLineBlockException()
+        )
+        is_discard_by_title_detection = discardByException.discard_by_title_detection(
+            pdf_dic, exception=TitleDetectionException()
+        )
+        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
+        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
+        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
+        if is_discard_by_single_line_block is not None:
+            error_info = is_discard_by_single_line_block
+        elif is_discard_by_title_detection is not None:
+            error_info = is_discard_by_title_detection
+        elif is_discard_by_title_level is not None:
+            error_info = is_discard_by_title_level
+        elif is_discard_by_split_para is not None:
+            error_info = is_discard_by_split_para
+        elif is_discard_by_merge_para is not None:
+            error_info = is_discard_by_merge_para
+        if error_info is not None:
+            return pdf_dic, error_info
+        """
+        Dump the final pdf_dic to a json file
+        """
+        if para_debug_mode is not None:
+            with open(output_json_file, "w", encoding="utf-8") as f:
+                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+        """
+        Draw the annotations
+        """
+        if para_debug_mode is not None:
+            drawAnnos = DrawAnnos()
+            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
+        """
+        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
+        """
+        if para_debug_mode is not None:
+            for fpath in os.listdir(output_dir):
+                if fpath.endswith(".json") and "stage" in fpath:
+                    os.remove(os.path.join(output_dir, fpath))
+        return pdf_dic, error_info
+"""
+Run this script to test the function with Command: 
+python pdf2text_recogPara.py [pdf_path] [output_pdf_path]
+Params:
+- pdf_path: the path of the pdf file
+- output_pdf_path: the path of the output pdf file
+"""
+if __name__ == "__main__":
+    DEFAULT_PDF_PATH = (
+        "app/pdf_toolbox/test/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\test\\assets\\paper\\paper.pdf"
+    )
+    input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
+    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
+    output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_path.split(".")[0] + "_recogPara.json"
+    import stat
+    # Remove existing output file if it exists
+    if os.path.exists(output_pdf_path):
+        os.chmod(output_pdf_path, stat.S_IWRITE)
+        os.remove(output_pdf_path)
+    input_pdf_doc = open_pdf(input_pdf_path)
+    # postprocess the paragraphs
+    paraProcessPipeline = ParaProcessPipeline()
+    # parse paragraph and save to json file
+    pdf_dic = {}
+    blockInnerParasProcessor = BlockTerminationProcessor()
+    """
+    Construct the pdf dictionary.
+    """
+    for page_id, page in enumerate(input_pdf_doc):  # type: ignore
+        # print(f"Processing page {page_id}")
+        # print(f"page: {page}")
+        raw_blocks = page.get_text("dict")["blocks"]
+        # Save text blocks to "preproc_blocks"
+        preproc_blocks = []
+        for block in raw_blocks:
+            if block["type"] == 0:
+                preproc_blocks.append(block)
+        layout_bboxes = []
+        # Construct the pdf dictionary as schema above
+        page_dict = {
+            "para_blocks": None,
+            "preproc_blocks": preproc_blocks,
+            "images": None,
+            "tables": None,
+            "interline_equations": None,
+            "inline_equations": None,
+            "layout_bboxes": None,
+            "pymu_raw_blocks": None,
+            "global_statistic": None,
+            "droped_text_block": None,
+            "droped_image_block": None,
+            "droped_table_block": None,
+            "image_backup": None,
+            "table_backup": None,
+        }
+        pdf_dic[f"page_{page_id}"] = page_dict
+    # print(f"pdf_dic: {pdf_dic}")
+    with open(output_json_path, "w", encoding="utf-8") as f:
+        json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
+    pdf_dic = paraProcessPipeline.para_process_pipeline(output_json_path, input_pdf_doc, output_pdf_path)
--- a/pdf2text_recogPara_v2.py
+++ b/pdf2text_recogPara_v2.py
+import os
+import sys
+import json
+import re
+import math
+import unicodedata
+from collections import Counter
+import numpy as np
+from termcolor import cprint
+from libs.commons import fitz
+from libs.nlp_utils import NLPModels
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+def open_pdf(pdf_path):
+    try:
+        pdf_document = fitz.open(pdf_path)  # type: ignore
+        return pdf_document
+    except Exception as e:
+        print(f"无法打开PDF文件：{pdf_path}。原因是：{e}")
+        raise e
+def print_green_on_red(text):
+    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
+def print_green(text):
+    print()
+    cprint(text, "green", attrs=["bold"], end="\n\n")
+def print_red(text):
+    print()
+    cprint(text, "red", attrs=["bold"], end="\n\n")
+def print_yellow(text):
+    print()
+    cprint(text, "yellow", attrs=["bold"], end="\n\n")
+def safe_get(dict_obj, key, default):
+    val = dict_obj.get(key)
+    if val is None:
+        return default
+    else:
+        return val
+def is_bbox_overlap(bbox1, bbox2):
+    """
+    This function checks if bbox1 and bbox2 overlap or not
+    Parameters
+    ----------
+    bbox1 : list
+        bbox1
+    bbox2 : list
+        bbox2
+    Returns
+    -------
+    bool
+        True if bbox1 and bbox2 overlap, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+    if x0_1 > x1_2 or x0_2 > x1_1:
+        return False
+    if y0_1 > y1_2 or y0_2 > y1_1:
+        return False
+    return True
+def is_in_bbox(bbox1, bbox2):
+    """
+    This function checks if bbox1 is in bbox2
+    Parameters
+    ----------
+    bbox1 : list
+        bbox1
+    bbox2 : list
+        bbox2
+    Returns
+    -------
+    bool
+        True if bbox1 is in bbox2, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+    if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
+        return True
+    else:
+        return False
+def calculate_para_bbox(lines):
+    """
+    This function calculates the minimum bbox of the paragraph
+    Parameters
+    ----------
+    lines : list
+        lines
+    Returns
+    -------
+    para_bbox : list
+        bbox of the paragraph
+    """
+    x0 = min(line["bbox"][0] for line in lines)
+    y0 = min(line["bbox"][1] for line in lines)
+    x1 = max(line["bbox"][2] for line in lines)
+    y1 = max(line["bbox"][3] for line in lines)
+    return [x0, y0, x1, y1]
+def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    This function checks if the line is right aligned from its neighbors
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line
+    next_line_bbox : list
+        bbox of the next line
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 for prev, 1 for next, 2 for both
+    Returns
+    -------
+    bool
+        True if the line is right aligned from its neighbors, False otherwise.
+    """
+    horizontal_ratio = 0.5
+    horizontal_thres = horizontal_ratio * avg_char_width
+    _, _, x1, _ = curr_line_bbox
+    _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+    if direction == 0:
+        return abs(x1 - prev_x1) < horizontal_thres
+    elif direction == 1:
+        return abs(x1 - next_x1) < horizontal_thres
+    elif direction == 2:
+        return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
+    else:
+        return False
+def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    This function checks if the line is left aligned from its neighbors
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line
+    next_line_bbox : list
+        bbox of the next line
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 for prev, 1 for next, 2 for both
+    Returns
+    -------
+    bool
+        True if the line is left aligned from its neighbors, False otherwise.
+    """
+    horizontal_ratio = 0.5
+    horizontal_thres = horizontal_ratio * avg_char_width
+    x0, _, _, _ = curr_line_bbox
+    prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+    if direction == 0:
+        return abs(x0 - prev_x0) < horizontal_thres
+    elif direction == 1:
+        return abs(x0 - next_x0) < horizontal_thres
+    elif direction == 2:
+        return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
+    else:
+        return False
+def end_with_punctuation(line_text):
+    """
+    This function checks if the line ends with punctuation marks
+    """
+    english_end_puncs = [".", "?", "!"]
+    chinese_end_puncs = ["。", "？", "！"]
+    end_puncs = english_end_puncs + chinese_end_puncs
+    last_non_space_char = None
+    for ch in line_text[::-1]:
+        if not ch.isspace():
+            last_non_space_char = ch
+            break
+    if last_non_space_char is None:
+        return False
+    return last_non_space_char in end_puncs
+def is_nested_list(lst):
+    if isinstance(lst, list):
+        return any(isinstance(sub, list) for sub in lst)
+    return False
+class DenseSingleLineBlockException(Exception):
+    """
+    This class defines the exception type for dense single line-block.
+    """
+    def __init__(self, message="DenseSingleLineBlockException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class TitleDetectionException(Exception):
+    """
+    This class defines the exception type for title detection.
+    """
+    def __init__(self, message="TitleDetectionException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class TitleLevelException(Exception):
+    """
+    This class defines the exception type for title level.
+    """
+    def __init__(self, message="TitleLevelException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class ParaSplitException(Exception):
+    """
+    This class defines the exception type for paragraph splitting.
+    """
+    def __init__(self, message="ParaSplitException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class ParaMergeException(Exception):
+    """
+    This class defines the exception type for paragraph merging.
+    """
+    def __init__(self, message="ParaMergeException"):
+        self.message = message
+        super().__init__(self.message)
+    def __str__(self):
+        return f"{self.message}"
+    def __repr__(self):
+        return f"{self.message}"
+class DiscardByException:
+    """
+    This class discards pdf files by exception
+    """
+    def __init__(self) -> None:
+        pass
+    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
+        """
+        This function discards pdf files by single line block exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        exception_page_nums = 0
+        page_num = 0
+        for page_id, page in pdf_dic.items():
+            if page_id.startswith("page_"):
+                page_num += 1
+                if "preproc_blocks" in page.keys():
+                    preproc_blocks = page["preproc_blocks"]
+                    all_single_line_blocks = []
+                    for block in preproc_blocks:
+                        if len(block["lines"]) == 1:
+                            all_single_line_blocks.append(block)
+                    if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
+                        exception_page_nums += 1
+        if page_num == 0:
+            return None
+        if exception_page_nums / page_num > 0.1:  # Low ratio means basically, whenever this is the case, it is discarded
+            return exception.message
+        return None
+    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
+        """
+        This function discards pdf files by title detection exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
+        """
+        This function discards pdf files by title level exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
+        """
+        This function discards pdf files by split para exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
+        """
+        This function discards pdf files by merge para exception
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : str
+            exception message
+        Returns
+        -------
+        error_message : str
+        """
+        # return exception.message
+        return None
+class LayoutFilterProcessor:
+    def __init__(self) -> None:
+        pass
+    def batch_process_blocks(self, pdf_dict):
+        """
+        This function processes the blocks in batch.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        pdf_dict : dict
+            pdf dictionary
+        Returns
+        -------
+        pdf_dict : dict
+            pdf dictionary
+        """
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
+                    layout_bbox_objs = blocks["layout_bboxes"]
+                    if layout_bbox_objs is None:
+                        continue
+                    layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
+                    # Enlarge each value of x0, y0, x1, y1 for each layout_bbox to prevent loss of text.
+                    layout_bboxes = [
+                        [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
+                    ]
+                    para_blocks = blocks["para_blocks"]
+                    if para_blocks is None:
+                        continue
+                    for lb_bbox in layout_bboxes:
+                        for i, para_block in enumerate(para_blocks):
+                            para_bbox = para_block["bbox"]
+                            para_blocks[i]["in_layout"] = 0
+                            if is_in_bbox(para_bbox, lb_bbox):
+                                para_blocks[i]["in_layout"] = 1
+                    blocks["para_blocks"] = para_blocks
+        return pdf_dict
+class RawBlockProcessor:
+    def __init__(self) -> None:
+        self.y_tolerance = 2
+        self.pdf_dic = {}
+    def __span_flags_decomposer(self, span_flags):
+        """
+        Make font flags human readable.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        span_flags : int
+            span flags
+        Returns
+        -------
+        l : dict
+            decomposed flags
+        """
+        l = {
+            "is_superscript": False,
+            "is_italic": False,
+            "is_serifed": False,
+            "is_sans_serifed": False,
+            "is_monospaced": False,
+            "is_proportional": False,
+            "is_bold": False,
+        }
+        if span_flags & 2**0:
+            l["is_superscript"] = True  # 表示上标
+        if span_flags & 2**1:
+            l["is_italic"] = True  # 表示斜体
+        if span_flags & 2**2:
+            l["is_serifed"] = True  # 表示衬线字体
+        else:
+            l["is_sans_serifed"] = True  # 表示非衬线字体
+        if span_flags & 2**3:
+            l["is_monospaced"] = True  # 表示等宽字体
+        else:
+            l["is_proportional"] = True  # 表示比例字体
+        if span_flags & 2**4:
+            l["is_bold"] = True  # 表示粗体
+        return l
+    def __make_new_lines(self, raw_lines):
+        """
+        This function makes new lines.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        raw_lines : list
+            raw lines
+        Returns
+        -------
+        new_lines : list
+            new lines
+        """
+        new_lines = []
+        new_line = None
+        for raw_line in raw_lines:
+            raw_line_bbox = raw_line["bbox"]
+            raw_line_spans = raw_line["spans"]
+            raw_line_text = "".join([span["text"] for span in raw_line_spans])
+            raw_line_dir = raw_line.get("dir", None)
+            decomposed_line_spans = []
+            for span in raw_line_spans:
+                raw_flags = span["flags"]
+                decomposed_flags = self.__span_flags_decomposer(raw_flags)
+                span["decomposed_flags"] = decomposed_flags
+                decomposed_line_spans.append(span)
+            if new_line is None:  # Handle the first line
+                new_line = {
+                    "bbox": raw_line_bbox,
+                    "text": raw_line_text,
+                    "dir": raw_line_dir if raw_line_dir else (0, 0),
+                    "spans": decomposed_line_spans,
+                }
+            else:  # Handle the rest lines
+                if (
+                    abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
+                    and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
+                ):
+                    new_line["bbox"] = (
+                        min(new_line["bbox"][0], raw_line_bbox[0]),  # left
+                        new_line["bbox"][1],  # top
+                        max(new_line["bbox"][2], raw_line_bbox[2]),  # right
+                        raw_line_bbox[3],  # bottom
+                    )
+                    new_line["text"] += raw_line_text
+                    new_line["spans"].extend(raw_line_spans)
+                    new_line["dir"] = (
+                        new_line["dir"][0] + raw_line_dir[0],
+                        new_line["dir"][1] + raw_line_dir[1],
+                    )
+                else:
+                    new_lines.append(new_line)
+                    new_line = {
+                        "bbox": raw_line_bbox,
+                        "text": raw_line_text,
+                        "dir": raw_line_dir if raw_line_dir else (0, 0),
+                        "spans": raw_line_spans,
+                    }
+        if new_line:
+            new_lines.append(new_line)
+        return new_lines
+    def __make_new_block(self, raw_block):
+        """
+        This function makes a new block.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        ----------
+        raw_block : dict
+            a raw block
+        Returns
+        -------
+        new_block : dict
+        """
+        new_block = {}
+        block_id = raw_block["number"]
+        block_bbox = raw_block["bbox"]
+        block_text = "".join(span["text"] for line in raw_block["lines"] for span in line["spans"])
+        raw_lines = raw_block["lines"]
+        block_lines = self.__make_new_lines(raw_lines)
+        new_block["block_id"] = block_id
+        new_block["bbox"] = block_bbox
+        new_block["text"] = block_text
+        new_block["lines"] = block_lines
+        return new_block
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        ----------
+        blocks : list
+            Input block is a list of raw blocks.
+        Returns
+        -------
+        result_dict : dict
+            result dictionary
+        """
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "preproc_blocks" in blocks.keys():
+                    input_blocks = blocks["preproc_blocks"]
+                    for raw_block in input_blocks:
+                        new_block = self.__make_new_block(raw_block)
+                        para_blocks.append(new_block)
+                blocks["para_blocks"] = para_blocks
+        return pdf_dic
+class BlockStatisticsCalculator:
+    """
+    This class calculates the statistics of the block.
+    """
+    def __init__(self) -> None:
+        pass
+    def __calc_stats_of_new_lines(self, new_lines):
+        """
+        This function calculates the paragraph metrics
+        Parameters
+        ----------
+        combined_lines : list
+            combined lines
+        Returns
+        -------
+        X0 : float
+            Median of x0 values, which represents the left average boundary of the block
+        X1 : float
+            Median of x1 values, which represents the right average boundary of the block
+        avg_char_width : float
+            Average of char widths, which represents the average char width of the block
+        avg_char_height : float
+            Average of line heights, which represents the average line height of the block
+        """
+        x0_values = []
+        x1_values = []
+        char_widths = []
+        char_heights = []
+        block_font_types = []
+        block_font_sizes = []
+        block_directions = []
+        if len(new_lines) > 0:
+            for i, line in enumerate(new_lines):
+                line_bbox = line["bbox"]
+                line_text = line["text"]
+                line_spans = line["spans"]
+                num_chars = len([ch for ch in line_text if not ch.isspace()])
+                x0_values.append(line_bbox[0])
+                x1_values.append(line_bbox[2])
+                if num_chars > 0:
+                    char_width = (line_bbox[2] - line_bbox[0]) / num_chars
+                    char_widths.append(char_width)
+                for span in line_spans:
+                    block_font_types.append(span["font"])
+                    block_font_sizes.append(span["size"])
+                if "dir" in line:
+                    block_directions.append(line["dir"])
+                # line_font_types = [span["font"] for span in line_spans]
+                char_heights = [span["size"] for span in line_spans]
+        X0 = np.median(x0_values) if x0_values else 0
+        X1 = np.median(x1_values) if x1_values else 0
+        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
+        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
+        # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
+        max_span_length = 0
+        max_span_font_type = None
+        for line in new_lines:
+            line_spans = line["spans"]
+            for span in line_spans:
+                span_length = span["bbox"][2] - span["bbox"][0]
+                if span_length > max_span_length:
+                    max_span_length = span_length
+                    max_span_font_type = span["font"]
+        max_freq_font_type = max_span_font_type
+        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
+        avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0
+        avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0
+        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
+        return (
+            X0,
+            X1,
+            avg_char_width,
+            avg_char_height,
+            max_freq_font_type,
+            avg_font_size,
+            (avg_dir_horizontal, avg_dir_vertical),
+            median_font_size,
+        )
+    def __make_new_block(self, input_block):
+        new_block = {}
+        raw_lines = input_block["lines"]
+        stats = self.__calc_stats_of_new_lines(raw_lines)
+        block_id = input_block["block_id"]
+        block_bbox = input_block["bbox"]
+        block_text = input_block["text"]
+        block_lines = raw_lines
+        block_avg_left_boundary = stats[0]
+        block_avg_right_boundary = stats[1]
+        block_avg_char_width = stats[2]
+        block_avg_char_height = stats[3]
+        block_font_type = stats[4]
+        block_font_size = stats[5]
+        block_direction = stats[6]
+        block_median_font_size = stats[7]
+        new_block["block_id"] = block_id
+        new_block["bbox"] = block_bbox
+        new_block["text"] = block_text
+        new_block["dir"] = block_direction
+        new_block["X0"] = block_avg_left_boundary
+        new_block["X1"] = block_avg_right_boundary
+        new_block["avg_char_width"] = block_avg_char_width
+        new_block["avg_char_height"] = block_avg_char_height
+        new_block["block_font_type"] = block_font_type
+        new_block["block_font_size"] = block_font_size
+        new_block["lines"] = block_lines
+        new_block["median_font_size"] = block_median_font_size
+        return new_block
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+        Parameters
+        ----------
+        self : object
+            The instance of the class.
+        ----------
+        blocks : list
+            Input block is a list of raw blocks.
+            Schema can refer to the value of key ""preproc_blocks".
+        Returns
+        -------
+        result_dict : dict
+            result dictionary
+        """
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "para_blocks" in blocks.keys():
+                    input_blocks = blocks["para_blocks"]
+                    for input_block in input_blocks:
+                        new_block = self.__make_new_block(input_block)
+                        para_blocks.append(new_block)
+                blocks["para_blocks"] = para_blocks
+        return pdf_dic
+class DocStatisticsCalculator:
+    """
+    This class calculates the statistics of the document.
+    """
+    def __init__(self) -> None:
+        pass
+    def calc_stats_of_doc(self, pdf_dict):
+        """
+        This function computes the statistics of the document
+        Parameters
+        ----------
+        result_dict : dict
+            result dictionary
+        Returns
+        -------
+        statistics : dict
+            statistics of the document
+        """
+        total_text_length = 0
+        total_num_blocks = 0
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "para_blocks" in blocks.keys():
+                    para_blocks = blocks["para_blocks"]
+                    for para_block in para_blocks:
+                        total_text_length += len(para_block["text"])
+                        total_num_blocks += 1
+        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
+        font_list = []
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "para_blocks" in blocks.keys():
+                    input_blocks = blocks["para_blocks"]
+                    for input_block in input_blocks:
+                        block_text_length = len(input_block.get("text", ""))
+                        if block_text_length < avg_text_length * 0.5:
+                            continue
+                        block_font_type = safe_get(input_block, "block_font_type", "")
+                        block_font_size = safe_get(input_block, "block_font_size", 0)
+                        font_list.append((block_font_type, block_font_size))
+        font_counter = Counter(font_list)
+        most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
+        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
+        statistics = {
+            "num_pages": 0,
+            "num_blocks": 0,
+            "num_paras": 0,
+            "num_titles": 0,
+            "num_header_blocks": 0,
+            "num_footer_blocks": 0,
+            "num_watermark_blocks": 0,
+            "num_vertical_margin_note_blocks": 0,
+            "most_common_font_type": most_common_font[0][0],
+            "most_common_font_size": most_common_font[0][1],
+            "number_of_most_common_font": most_common_font[1],
+            "second_most_common_font_type": second_most_common_font[0][0],
+            "second_most_common_font_size": second_most_common_font[0][1],
+            "number_of_second_most_common_font": second_most_common_font[1],
+            "avg_text_length": avg_text_length,
+        }
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                blocks = pdf_dict[page_id]["para_blocks"]
+                statistics["num_pages"] += 1
+                for block_id, block_data in enumerate(blocks):
+                    statistics["num_blocks"] += 1
+                    if "paras" in block_data.keys():
+                        statistics["num_paras"] += len(block_data["paras"])
+                    for line in block_data["lines"]:
+                        if line.get("is_title", 0):
+                            statistics["num_titles"] += 1
+                    if block_data.get("is_header", 0):
+                        statistics["num_header_blocks"] += 1
+                    if block_data.get("is_footer", 0):
+                        statistics["num_footer_blocks"] += 1
+                    if block_data.get("is_watermark", 0):
+                        statistics["num_watermark_blocks"] += 1
+                    if block_data.get("is_vertical_margin_note", 0):
+                        statistics["num_vertical_margin_note_blocks"] += 1
+        pdf_dict["statistics"] = statistics
+        return pdf_dict
+class TitleProcessor:
+    """
+    This class processes the title.
+    """
+    def __init__(self, *doc_statistics) -> None:
+        """
+        Set up the title processor.
+        Parameters
+        ----------
+        doc_statistics : tuple
+            optional positional arguments; when at least one is given, the first one is
+            stored as ``self.doc_statistics``. When none is given the attribute is NOT
+            created at all.
+        """
+        if len(doc_statistics) > 0:
+            self.doc_statistics = doc_statistics[0]
+        self.nlp_model = NLPModels()
+        self.MAX_TITLE_LEVEL = 3
+        # Pattern for lines that start with a numbering prefix followed by more text.
+        # It is written with inline "#" comments, so it presumably has to be compiled with
+        # re.VERBOSE (under which unescaped whitespace in the pattern is ignored) — confirm at the use site.
+        self.numbered_title_pattern = r"""
+            ^                                 # 行首
+            (                                 # 开始捕获组
+                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
+                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
+                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
+                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
+                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
+                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
+                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
+                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
+                |[\（|\(][一二三四五六七八九十百千]+[\）|\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
+                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
+                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
+                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
+                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
+                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
+                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
+                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
+                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
+            )                                 # 结束捕获组
+            .+                                # 标题的其余部分
+        """
+        self.stage = (
+            0  # Distinguishes the stage of title detection; the number refers to a step of the paragraph processing pipeline
+        )
+    def _is_potential_title(
+        self,
+        curr_line,
+        prev_line,
+        prev_line_is_title,
+        next_line,
+        avg_char_width,
+        avg_char_height,
+        median_font_size,
+    ):
+        """
+        This function checks if the line is a potential title.
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        avg_char_width : float
+            average of char widths
+        avg_char_height : float
+            average of line heights
+        Returns
+        -------
+        bool
+            True if the line is a potential title, False otherwise.
+        """
+        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
+            """
+            This function checks if the line is centered on the page
+            Parameters
+            ----------
+            line_bbox : list
+                bbox of the line
+            page_bbox : list
+                bbox of the page
+            avg_char_width : float
+                average of char widths
+            Returns
+            -------
+            bool
+                True if the line is centered on the page, False otherwise.
+            """
+            horizontal_ratio = 0.5
+            horizontal_thres = horizontal_ratio * avg_char_width
+            x0, _, x1, _ = line_bbox
+            _, _, page_x1, _ = page_bbox
+            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
+        def __is_bold_font_line(line):
+            """
+            Check if a line contains any bold font style.
+            """
+            def _is_bold_span(span):
+                # if span text is empty or only contains space, return False
+                if not span["text"].strip():
+                    return False
+                return bool(span["flags"] & 2**4)  # Check if the font is bold
+            for span in line["spans"]:
+                if not _is_bold_span(span):
+                    return False
+            return True
+        def __is_italic_font_line(line):
+            """
+            Check if a line contains any italic font style.
+            """
+            def __is_italic_span(span):
+                return bool(span["flags"] & 2**1)  # Check if the font is italic
+            for span in line["spans"]:
+                if not __is_italic_span(span):
+                    return False
+            return True
+        def __is_punctuation_heavy(line_text):
+            """
+            Check if the line contains a high ratio of punctuation marks, which may indicate
+            that the line is not a title.
+            Parameters:
+            line_text (str): Text of the line.
+            Returns:
+            bool: True if the line is heavy with punctuation, False otherwise.
+            """
+            # Pattern for common title format like "X.Y. Title"
+            pattern = r"\b\d+\.\d+\..*\b"
+            # If the line matches the title format, return False
+            if re.match(pattern, line_text.strip()):
+                return False
+            # Find all punctuation marks in the line
+            punctuation_marks = re.findall(r"[^\w\s]", line_text)
+            number_of_punctuation_marks = len(punctuation_marks)
+            text_length = len(line_text)
+            if text_length == 0:
+                return False
+            punctuation_ratio = number_of_punctuation_marks / text_length
+            if punctuation_ratio >= 0.1:
+                return True
+            return False
+        def __has_mixed_font_styles(spans, strict_mode=False):
+            """
+            This function checks if the line has mixed font styles, the strict mode will compare the font types
+            Parameters
+            ----------
+            spans : list
+                spans of the line
+            strict_mode : bool
+                True for strict mode, the font types will be fully compared
+                False for non-strict mode, the font types will be compared by the most longest common prefix
+            Returns
+            -------
+            bool
+                True if the line has mixed font styles, False otherwise.
+            """
+            if strict_mode:
+                font_styles = set()
+                for span in spans:
+                    font_style = span["font"].lower()
+                    font_styles.add(font_style)
+                return len(font_styles) > 1
+            else:  # non-strict mode
+                font_styles = []
+                for span in spans:
+                    font_style = span["font"].lower()
+                    font_styles.append(font_style)
+                if len(font_styles) > 1:
+                    longest_common_prefix = os.path.commonprefix(font_styles)
+                    if len(longest_common_prefix) > 0:
+                        return False
+                    else:
+                        return True
+                else:
+                    return False
+        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
+            """
+            This function checks if the current line has a different font type from the previous and next lines
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line
+            prev_line_font_type : str
+                font type of the previous line
+            next_line_font_type : str
+                font type of the next line
+            Returns
+            -------
+            bool
+                True if the current line has a different font type from the previous and next lines, False otherwise.
+            """
+            return all(
+                curr_line_font_type != other_font_type.lower()
+                for other_font_type in [prev_line_font_type, next_line_font_type]
+                if other_font_type is not None
+            )
+        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
+            """
+            This function checks if the current line has a larger font size than the previous and next lines
+            Parameters
+            ----------
+            curr_line_font_size : float
+                font size of the current line
+            prev_line_font_size : float
+                font size of the previous line
+            next_line_font_size : float
+                font size of the next line
+            Returns
+            -------
+            bool
+                True if the current line has a larger font size than the previous and next lines, False otherwise.
+            """
+            return all(
+                curr_line_font_size > other_font_size * 1.2
+                for other_font_size in [prev_line_font_size, next_line_font_size]
+                if other_font_size is not None
+            )
+        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
+            """
+            This function checks if the current line is similar to the previous line
+            Parameters
+            ----------
+            curr_line : dict
+                current line
+            prev_line : dict
+                previous line
+            Returns
+            -------
+            bool
+                True if the current line is similar to the previous line, False otherwise.
+            """
+            if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
+                return True
+            else:
+                return False
+        def __is_same_font_type_of_docAvg(curr_line_font_type):
+            """
+            This function checks if the current line has the same font type as the document average font type
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line
+            Returns
+            -------
+            bool
+                True if the current line has the same font type as the document average font type, False otherwise.
+            """
+            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
+            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
+            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
+        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
+            """
+            This function checks if the current line has a large enough font size
+            Parameters
+            ----------
+            curr_line_font_size : float
+                font size of the current line
+            ratio : float
+                ratio of the current line font size to the document average font size
+            Returns
+            -------
+            bool
+                True if the current line has a large enough font size, False otherwise.
+            """
+            doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
+            doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
+            doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
+            return curr_line_font_size >= doc_avg_font_size * ratio
+        def __is_sufficient_spacing_above_and_below(
+            curr_line_bbox,
+            prev_line_bbox,
+            next_line_bbox,
+            avg_char_height,
+            median_font_size,
+        ):
+            """
+            This function checks if the current line has sufficient spacing above and below
+            Parameters
+            ----------
+            curr_line_bbox : list
+                bbox of the current line
+            prev_line_bbox : list
+                bbox of the previous line
+            next_line_bbox : list
+                bbox of the next line
+            avg_char_width : float
+                average of char widths
+            avg_char_height : float
+                average of line heights
+            Returns
+            -------
+            bool
+                True if the current line has sufficient spacing above and below, False otherwise.
+            """
+            vertical_ratio = 1.25
+            vertical_thres = vertical_ratio * median_font_size
+            _, y0, _, y1 = curr_line_bbox
+            sufficient_spacing_above = False
+            if prev_line_bbox:
+                vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
+                sufficient_spacing_above = vertical_spacing_above > vertical_thres
+            else:
+                sufficient_spacing_above = True
+            sufficient_spacing_below = False
+            if next_line_bbox:
+                vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
+                sufficient_spacing_below = vertical_spacing_below > vertical_thres
+            else:
+                sufficient_spacing_below = True
+            return (sufficient_spacing_above, sufficient_spacing_below)
+        def __is_word_list_line_by_rules(curr_line_text):
+            """
+            This function checks if the current line is a word list
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the current line is a name list, False otherwise.
+            """
+            # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[，,;；\s]|$)"
+            name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
+            compiled_pattern = re.compile(name_list_pattern)
+            if compiled_pattern.search(curr_line_text):
+                return True
+            else:
+                return False
+        def __get_text_catgr_by_nlp(curr_line_text):
+            """
+            This function checks if the current line is a name list using nlp model, such as spacy
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the current line is a name list, False otherwise.
+            """
+            result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
+            return result
+        def __is_numbered_title(curr_line_text):
+            """
+            This function checks if the current line is a numbered list
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the current line is a numbered list, False otherwise.
+            """
+            compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
+            if compiled_pattern.search(curr_line_text):
+                return True
+            else:
+                return False
+        def __is_end_with_ending_puncs(line_text):
+            """
+            This function checks if the current line ends with a ending punctuation mark
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the current line ends with a punctuation mark, False otherwise.
+            """
+            end_puncs = [".", "?", "!", "。", "？", "！", "…"]
+            line_text = line_text.rstrip()
+            if line_text[-1] in end_puncs:
+                return True
+            return False
+        def __contains_only_no_meaning_symbols(line_text):
+            """
+            This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
+            Situation contains:
+            1. Only have punctuation marks
+            2. Only have other non-meaning symbols
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+            Returns
+            -------
+            bool
+                True if the current line contains only symbols that have no meaning, False otherwise.
+            """
+            punctuation_marks = re.findall(r"[^\w\s]", line_text)  # find all punctuation marks
+            number_of_punctuation_marks = len(punctuation_marks)
+            text_length = len(line_text)
+            if text_length == 0:
+                return False
+            punctuation_ratio = number_of_punctuation_marks / text_length
+            if punctuation_ratio >= 0.9:
+                return True
+            return False
+        def __is_equation(line_text):
+            """
+            This function checks if the current line is an equation.
+            Parameters
+            ----------
+            line_text : str
+            Returns
+            -------
+            bool
+                True if the current line is an equation, False otherwise.
+            """
+            equation_reg = r"\$.*?\\overline.*?\$"  # to match interline equations
+            if re.search(equation_reg, line_text):
+                return True
+            else:
+                return False
+        def __is_title_by_len(text, max_length=200):
+            """
+            This function checks if the current line is a title by length.
+            Parameters
+            ----------
+            text : str
+                text of the current line
+            max_length : int
+                max length of the title
+            Returns
+            -------
+            bool
+                True if the current line is a title, False otherwise.
+            """
+            text = text.strip()
+            return len(text) <= max_length
+        def __compute_line_font_type_and_size(curr_line):
+            """
+            This function computes the font type and font size of the line.
+            Parameters
+            ----------
+            line : dict
+                line
+            Returns
+            -------
+            font_type : str
+                font type of the line
+            font_size : float
+                font size of the line
+            """
+            spans = curr_line["spans"]
+            max_accumulated_length = 0
+            max_span_font_size = curr_line["spans"][0]["size"]  # default value, float type
+            max_span_font_type = curr_line["spans"][0]["font"].lower()  # default value, string type
+            for span in spans:
+                if span["text"].isspace():
+                    continue
+                span_length = span["bbox"][2] - span["bbox"][0]
+                if span_length > max_accumulated_length:
+                    max_accumulated_length = span_length
+                    max_span_font_size = span["size"]
+                    max_span_font_type = span["font"].lower()
+            return max_span_font_type, max_span_font_size
+        def __is_a_consistent_sub_title(pre_line, curr_line):
+            """
+            This function checks if the current line is a consistent sub title.
+            Parameters
+            ----------
+            pre_line : dict
+                previous line
+            curr_line : dict
+                current line
+            Returns
+            -------
+            bool
+                True if the current line is a consistent sub title, False otherwise.
+            """
+            if pre_line is None:
+                return False
+            start_letter_of_pre_line = pre_line["text"][0]
+            start_letter_of_curr_line = curr_line["text"][0]
+            has_same_prefix_digit = (
+                start_letter_of_pre_line.isdigit()
+                and start_letter_of_curr_line.isdigit()
+                and start_letter_of_pre_line == start_letter_of_curr_line
+            )
+            # prefix text of curr_line satisfies the following title format: x.x
+            prefix_text_pattern = r"^\d+\.\d+"
+            subtitle_format_match = re.match(prefix_text_pattern, curr_line["text"])
+            if subtitle_format_match:
+                has_subtitle_format = True
+            else:
+                has_subtitle_format = False
+            if has_same_prefix_digit or has_subtitle_format:
+                print("is a consistent sub title")
+                return True
+        """
+        Title detecting main Process.
+        """
+        """
+        Basic features about the current line.
+        """
+        curr_line_bbox = curr_line["bbox"]
+        curr_line_text = curr_line["text"]
+        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
+        if len(curr_line_text.strip()) == 0:  # skip empty lines
+            return False, False
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        if prev_line:
+            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
+        else:
+            prev_line_font_type, prev_line_font_size = None, None
+        next_line_bbox = next_line["bbox"] if next_line else None
+        if next_line:
+            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
+        else:
+            next_line_font_type, next_line_font_size = None, None
+        """
+        Aggregated features about the current line.
+        """
+        is_italc_font = __is_italic_font_line(curr_line)
+        is_bold_font = __is_bold_font_line(curr_line)
+        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
+        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
+        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
+        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
+        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
+        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
+        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
+        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
+        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
+        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
+        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
+            curr_line_font_size, prev_line_font_size, next_line_font_size
+        )
+        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
+            curr_line_font_type, prev_line_font_type, next_line_font_type
+        )
+        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
+        )
+        is_similar_to_pre_line = __is_similar_to_pre_line(
+            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
+        )
+        is_consistent_sub_title = __is_a_consistent_sub_title(prev_line, curr_line)
+        """
+        Further aggregated features about the current line.
+        Attention:
+            Features that start with __ are for internal use.
+        """
+        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
+        )
+        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
+        is_a_left_inline_title = (
+            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
+        )
+        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
+        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
+        is_title_by_check_pre_and_next_line = (
+            (prev_line is not None or next_line is not None)
+            and has_sufficient_spaces_above
+            and has_sufficient_spaces_below
+            and is_potential_title_font
+        )
+        is_numbered_title = __is_numbered_title(curr_line_text) and (
+            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
+        )
+        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
+        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
+        is_equation = __is_equation(curr_line_text)
+        is_title_by_len = __is_title_by_len(curr_line_text)
+        """
+        Decide if the line is a title.
+        """
+        is_title = (
+            is_not_end_with_ending_puncs  # not end with ending punctuation marks
+            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
+            and is_title_by_len  # is a title by length, default max length is 200
+            and not is_equation  # an interline equation should never be a title
+            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
+            and (
+                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
+                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                or (
+                    is_much_larger_font_than_doc_avg
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+                or (
+                    is_font_size_little_less_than_doc_avg
+                    and is_bold_font
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+            )  # not the same font type as the document average font type, which includes the most common font type and the second most common font type
+            and (
+                (
+                    not is_person_or_org_list_line_by_nlp
+                    and (
+                        is_much_larger_font_than_doc_avg
+                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
+                    )
+                )
+                or (
+                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
+                    and not is_a_left_inline_title
+                    and not is_punctuation_heavy
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+                or (
+                    is_person_or_org_list_line_by_nlp
+                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                )
+                or (is_numbered_title and not is_a_left_inline_title)
+            )
+            # )
+        ) or (prev_line_is_title and is_consistent_sub_title)
+        is_name_or_org_list_to_be_removed = (
+            (is_person_or_org_list_line_by_nlp)
+            and is_punctuation_heavy
+            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+        ) and not is_title
+        if is_name_or_org_list_to_be_removed:
+            is_author_or_org_list = True
+        else:
+            is_author_or_org_list = False
+        # return is_title, is_author_or_org_list
+        # """
+        """
+        # print reason why the line is a title
+        if is_title:
+            print_green("This line is a title.")
+            print_green("↓" * 10)
+            print()
+            print("curr_line_text: ", curr_line_text)
+            print()
+            print(f"prev_line_is_title: {prev_line_is_title}")
+            print()
+            print(f"is_consistent_sub_title: {is_consistent_sub_title}")
+        """
+        # print reason why the line is not a title
+        # line_text = curr_line_text.strip()
+        # test_text = "Career/Personal Life"
+        # text_content_condition = line_text == test_text
+        # if not is_title and text_content_condition: # Print specific line
+        """
+        if not is_title: # Print each line
+            print_red("This line is not a title.")
+            print_red("↓" * 10)
+            print()
+            print("curr_line_text: ", curr_line_text)
+            print()
+            if is_not_end_with_ending_puncs:
+                print_green(f"is_not_end_with_ending_puncs")
+            else:
+                print_red(f"is_end_with_ending_puncs")
+            if is_not_only_no_meaning_symbols:
+                print_green(f"is_not_only_no_meaning_symbols")
+            else:
+                print_red(f"is_only_no_meaning_symbols")
+            if is_title_by_len:
+                print_green(f"is_title_by_len: {is_title_by_len}")
+            else:
+                print_red(f"is_not_title_by_len: {is_title_by_len}")
+            if is_equation:
+                print_red(f"is_equation")
+            else:
+                print_green(f"is_not_equation")
+            if is_potential_title_font:
+                print_green(f"is_potential_title_font")
+            else:
+                print_red(f"is_not_potential_title_font")
+            if is_punctuation_heavy:
+                print_red("is_punctuation_heavy")
+            else:
+                print_green("is_not_punctuation_heavy")
+            if is_bold_font:
+                print_green(f"is_bold_font")
+            else:
+                print_red(f"is_not_bold_font")
+            if is_font_size_not_less_than_doc_avg:
+                print_green(f"is_larger_font_than_doc_avg")
+            else:
+                print_red(f"is_not_larger_font_than_doc_avg")
+            if is_much_larger_font_than_doc_avg:
+                print_green(f"is_much_larger_font_than_doc_avg")
+            else:
+                print_red(f"is_not_much_larger_font_than_doc_avg")
+            if is_not_same_font_type_of_docAvg:
+                print_green(f"is_not_same_font_type_of_docAvg")
+            else:
+                print_red(f"is_same_font_type_of_docAvg")
+            if is_word_list_line_by_rules:
+                print_red("is_word_list_line_by_rules")
+            else:
+                print_green("is_not_name_list_by_rules")
+            if is_person_or_org_list_line_by_nlp:
+                print_red("is_person_or_org_list_line_by_nlp")
+            else:
+                print_green("is_not_person_or_org_list_line_by_nlp")
+            if not is_numbered_title:
+                print_red("is_not_numbered_title")
+            else:
+                print_green("is_numbered_title")
+            if is_a_left_inline_title:
+                print_red("is_a_left_inline_title")
+            else:
+                print_green("is_not_a_left_inline_title")
+            if not is_title_by_check_prev_line:
+                print_red("is_not_title_by_check_prev_line")
+            else:
+                print_green("is_title_by_check_prev_line")
+            if not is_title_by_check_next_line:
+                print_red("is_not_title_by_check_next_line")
+            else:
+                print_green("is_title_by_check_next_line")
+            if not is_title_by_check_pre_and_next_line:
+                print_red("is_not_title_by_check_pre_and_next_line")
+            else:
+                print_green("is_title_by_check_pre_and_next_line")
+        # print_green("Common features:")
+        # print_green("↓" * 10)
+        # print(f"    curr_line_font_type: {curr_line_font_type}")
+        # print(f"    curr_line_font_size: {curr_line_font_size}")
+        # print()
+        """
+        # """
+        return is_title, is_author_or_org_list
+    def _detect_title(self, curr_block, pre_block):
+        """
+        Use the functions 'is_potential_title' to detect titles of each paragraph block.
+        If a line is a title, then the value of key 'is_title' of the line will be set to True.
+        """
+        raw_lines = curr_block["lines"]
+        blk_avg_char_width = curr_block["avg_char_width"]
+        blk_avg_char_height = curr_block["avg_char_height"]
+        blk_media_font_size = curr_block["median_font_size"]
+        if self.stage == 0:
+            is_prev_line_a_title = False
+            for i, curr_line in enumerate(raw_lines):
+                prev_line = raw_lines[i - 1] if i > 0 else None
+                next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
+                is_line_a_title, is_line_an_entities_list = self._is_potential_title(
+                    curr_line,
+                    prev_line,
+                    is_prev_line_a_title,
+                    next_line,
+                    blk_avg_char_width,
+                    blk_avg_char_height,
+                    blk_media_font_size,
+                )
+                if is_line_a_title:
+                    curr_line["is_title"] = is_line_a_title
+                    is_prev_line_a_title = True  # set the flag to True for the next line
+                else:
+                    curr_line["is_title"] = False
+                    is_prev_line_a_title = False  # set the flag to False for the next line
+                if is_line_an_entities_list:
+                    curr_line["is_author_or_org_list"] = is_line_an_entities_list
+                else:
+                    curr_line["is_author_or_org_list"] = False
+            return curr_block
+        if self.stage == 1:  # Check the block consistent titles.
+            if pre_block and "paras" in pre_block.keys():
+                print_red(f"Checking cross block title...")
+                last_para_content = None
+                paras_of_pre_block = pre_block["paras"]
+                last_key = sorted(paras_of_pre_block.keys())[-1]
+                last_para_content = paras_of_pre_block[last_key]
+                if last_para_content is not None:
+                    last_line_of_last_para_of_last_block = pre_block["lines"][-1]
+                    first_line_of_curr_block = raw_lines[0]
+                    next_line_of_curr_block = raw_lines[1] if len(raw_lines) > 1 else None
+                    is_line_a_title, is_line_an_entities_list = self._is_potential_title(
+                        first_line_of_curr_block,
+                        last_line_of_last_para_of_last_block,
+                        last_line_of_last_para_of_last_block["is_title"],
+                        next_line_of_curr_block,
+                        blk_avg_char_width,
+                        blk_avg_char_height,
+                        blk_media_font_size,
+                    )
+                    if is_line_a_title:
+                        first_line_of_curr_block["is_title"] = is_line_a_title
+                    else:
+                        first_line_of_curr_block["is_title"] = False
+                    if is_line_an_entities_list:
+                        first_line_of_curr_block["is_author_or_org_list"] = is_line_an_entities_list
+                    else:
+                        first_line_of_curr_block["is_author_or_org_list"] = False
+                    # print(f"first_line_of_curr_block: {first_line_of_curr_block['text']}")
+                    # print(f"last_line_of_pre_block: {last_line_of_last_para['text']}")
+                    return curr_block
+                else:
+                    print_red(f"last_para_content is None")
+    def batch_detect_titles(self, pdf_dic):
+        """
+        Run title detection over the paragraph blocks of every page and summarise the result.
+        For each key of `pdf_dic` that starts with "page_", the page's "para_blocks" are passed
+        through `_detect_title`, unless every block on the page has exactly one line, in which
+        case the blocks are kept as they are. Each non-None block then receives two flags:
+        * "is_block_title" - 1 when all of its lines are titles and the summed length of the
+          line texts is below 200, otherwise 0.
+        * "is_block_an_author_or_org_list" - 1 when all of its lines are author/organisation
+          list lines and the page id is "page_0", otherwise 0.
+        The count of lines flagged "is_title" is stored in pdf_dic["statistics"]["num_titles"].
+        Parameters
+        ----------
+        pdf_dic : dict
+            result dictionary; modified in place
+        Returns
+        -------
+        pdf_dic : dict
+            the same dictionary that was passed in
+        """
+        num_titles = 0
+        for page_id, page_content in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "para_blocks" in page_content.keys():
+                    para_blocks = page_content["para_blocks"]
+                    # Collect the single-line blocks to decide whether detection runs for this page.
+                    all_single_line_blocks = []
+                    for block in para_blocks:
+                        if len(block["lines"]) == 1:
+                            all_single_line_blocks.append(block)
+                    new_para_blocks = []
+                    if not len(all_single_line_blocks) == len(para_blocks):  # Not all blocks are single line blocks.
+                        for para_idx, para_block in enumerate(para_blocks):
+                            print(f"______________________________________________________")
+                            print(f"page_id: {page_id}")
+                            print(f"para_block id: {para_block['block_id']}")
+                            print(f"para_idx: {para_idx}")
+                            pre_block = para_blocks[para_idx - 1] if para_idx > 0 else None
+                            curr_block = para_block
+                            print_yellow(f"text of current block: {curr_block['text'] if curr_block else None}")
+                            print_green(f"text of previous block: {pre_block['text'] if pre_block else None}")
+                            new_block = self._detect_title(curr_block, pre_block)
+                            # The result is stored as-is, even when it is None.
+                            new_para_blocks.append(new_block)
+                            # Guard: a None result contributes no titles.
+                            if new_block is not None:
+                                num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
+                            else:
+                                num_titles += 0
+                    else:  # All blocks are single line blocks.
+                        for para_block in para_blocks:
+                            new_para_blocks.append(para_block)
+                            num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
+                    para_blocks = new_para_blocks
+                # Written even when the page had no "para_blocks" key (then an empty list).
+                page_content["para_blocks"] = para_blocks
+                for para_block in para_blocks:
+                    if para_block is not None:
+                        all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
+                        para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
+                        if (
+                            all_titles and para_text_len < 200
+                        ):  # total length of the paragraph is less than 200, more than this should not be a title
+                            para_block["is_block_title"] = 1
+                        else:
+                            para_block["is_block_title"] = 0
+                        all_name_or_org_list_to_be_removed = all(
+                            safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
+                        )
+                        # Author/organisation lists are only flagged on the first page.
+                        if all_name_or_org_list_to_be_removed and page_id == "page_0":
+                            para_block["is_block_an_author_or_org_list"] = 1
+                        else:
+                            para_block["is_block_an_author_or_org_list"] = 0
+                    else:
+                        # None entries get no flags; this assignment is not read afterwards.
+                        all_titles = False
+        pdf_dic["statistics"]["num_titles"] = num_titles
+        return pdf_dic
+    def _recog_title_level(self, title_blocks):
+        """
+        This function determines the title level based on the font size of the title.
+        Parameters
+        ----------
+        title_blocks : list
+        Returns
+        -------
+        title_blocks : list
+        """
+        font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
+        # Use the mean and std of font sizes to remove extreme values
+        mean_font_size = np.mean(font_sizes)
+        std_font_size = np.std(font_sizes)
+        min_extreme_font_size = mean_font_size - std_font_size  # type: ignore
+        max_extreme_font_size = mean_font_size + std_font_size  # type: ignore
+        # Compute the threshold for title level
+        middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
+        if middle_font_sizes.size > 0:
+            middle_mean_font_size = np.mean(middle_font_sizes)
+            level_threshold = middle_mean_font_size
+        else:
+            level_threshold = mean_font_size
+        for tb in title_blocks:
+            title_block = tb["block"]
+            title_font_size = safe_get(title_block, "block_font_size", 0)
+            current_level = 1  # Initialize title level, the biggest level is 1
+            # print(f"Before adjustment by font size, {current_level}")
+            if title_font_size >= max_extreme_font_size:
+                current_level = 1
+            elif title_font_size <= min_extreme_font_size:
+                current_level = 3
+            elif float(title_font_size) >= float(level_threshold):
+                current_level = 2
+            else:
+                current_level = 3
+            # print(f"After adjustment by font size, {current_level}")
+            title_block["block_title_level"] = current_level
+        return title_blocks
+    def batch_recog_title_level(self, pdf_dic):
+        """
+        This function batch process the blocks to recognize title level.
+        Parameters
+        ----------
+        pdf_dict : dict
+            result dictionary
+        Returns
+        -------
+        pdf_dict : dict
+            result dictionary
+        """
+        title_blocks = []
+        # Collect all titles
+        for page_id, blocks in pdf_dic.items():
+            if page_id.startswith("page_"):
+                para_blocks = blocks.get("para_blocks", [])
+                for block in para_blocks:
+                    if block.get("is_block_title"):
+                        title_obj = {"page_id": page_id, "block": block}
+                        title_blocks.append(title_obj)
+        # Determine title level
+        if title_blocks:
+            # Determine title level based on font size
+            title_blocks = self._recog_title_level(title_blocks)
+        return pdf_dic
+class BlockTerminationProcessor:
+    """
+    This class is used to process the block termination.
+    """
+    def __init__(self) -> None:
+        """Create the processor; there is no state to set up."""
+        pass
+    def _is_consistent_lines(
+        self,
+        curr_line,
+        prev_line,
+        next_line,
+        consistent_direction,  # 0 for prev, 1 for next, 2 for both
+    ):
+        """
+        This function checks if the line is consistent with its neighbors
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        consistent_direction : int
+            0 for prev, 1 for next, 2 for both
+        Returns
+        -------
+        bool
+            True if the line is consistent with its neighbors, False otherwise.
+        """
+        curr_line_font_size = curr_line["spans"][0]["size"]
+        curr_line_font_type = curr_line["spans"][0]["font"].lower()
+        if consistent_direction == 0:
+            if prev_line:
+                prev_line_font_size = prev_line["spans"][0]["size"]
+                prev_line_font_type = prev_line["spans"][0]["font"].lower()
+                return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
+            else:
+                return False
+        elif consistent_direction == 1:
+            if next_line:
+                next_line_font_size = next_line["spans"][0]["size"]
+                next_line_font_type = next_line["spans"][0]["font"].lower()
+                return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
+            else:
+                return False
+        elif consistent_direction == 2:
+            if prev_line and next_line:
+                prev_line_font_size = prev_line["spans"][0]["size"]
+                prev_line_font_type = prev_line["spans"][0]["font"].lower()
+                next_line_font_size = next_line["spans"][0]["size"]
+                next_line_font_type = next_line["spans"][0]["font"].lower()
+                return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
+                    curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
+                )
+            else:
+                return False
+        else:
+            return False
+    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
+        """
+        This function checks if the line is a regular line
+        Parameters
+        ----------
+        curr_line_bbox : list
+            bbox of the current line
+        prev_line_bbox : list
+            bbox of the previous line
+        next_line_bbox : list
+            bbox of the next line
+        avg_char_width : float
+            average of char widths
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_line_height : float
+            average of line heights
+        Returns
+        -------
+        bool
+            True if the line is a regular line, False otherwise.
+        """
+        horizontal_ratio = 0.5
+        vertical_ratio = 0.5
+        horizontal_thres = horizontal_ratio * avg_char_width
+        vertical_thres = vertical_ratio * avg_line_height
+        x0, y0, x1, y1 = curr_line_bbox
+        x0_near_X0 = abs(x0 - X0) < horizontal_thres
+        x1_near_X1 = abs(x1 - X1) < horizontal_thres
+        prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
+        sufficient_spacing_above = False
+        if prev_line_bbox:
+            vertical_spacing_above = y1 - prev_line_bbox[3]
+            sufficient_spacing_above = vertical_spacing_above > vertical_thres
+        sufficient_spacing_below = False
+        if next_line_bbox:
+            vertical_spacing_below = next_line_bbox[1] - y0
+            sufficient_spacing_below = vertical_spacing_below > vertical_thres
+        return (
+            (sufficient_spacing_above or sufficient_spacing_below)
+            or (not x0_near_X0 and not x1_near_X1)
+            or prev_line_is_end_of_para
+        )
+    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
+        """
+        This function checks if the line is a possible start of a paragraph
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_char_width : float
+            average of char widths
+        avg_line_height : float
+            average of line heights
+        Returns
+        -------
+        bool
+            True if the line is a possible start of a paragraph, False otherwise.
+        """
+        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
+        decision_path = []  # Record the decision path
+        curr_line_bbox = curr_line["bbox"]
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        next_line_bbox = next_line["bbox"] if next_line else None
+        indent_ratio = 1
+        vertical_ratio = 1.5
+        vertical_thres = vertical_ratio * avg_font_size
+        left_horizontal_ratio = 0.5
+        left_horizontal_thres = left_horizontal_ratio * avg_char_width
+        right_horizontal_ratio = 2.5
+        right_horizontal_thres = right_horizontal_ratio * avg_char_width
+        x0, y0, x1, y1 = curr_line_bbox
+        indent_condition = x0 > X0 + indent_ratio * avg_char_width
+        if indent_condition:
+            start_confidence += 0.2
+            decision_path.append("indent_condition_met")
+        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
+        if x0_near_X0:
+            start_confidence += 0.1
+            decision_path.append("x0_near_X0")
+        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
+        if x1_near_X1:
+            start_confidence += 0.1
+            decision_path.append("x1_near_X1")
+        if prev_line is None:
+            prev_line_is_end_of_para = True
+            start_confidence += 0.2
+            decision_path.append("no_prev_line")
+        else:
+            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
+            if prev_line_is_end_of_para:
+                start_confidence += 0.1
+                decision_path.append("prev_line_is_end_of_para")
+        sufficient_spacing_above = False
+        if prev_line_bbox:
+            vertical_spacing_above = y1 - prev_line_bbox[3]
+            sufficient_spacing_above = vertical_spacing_above > vertical_thres
+            if sufficient_spacing_above:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_above")
+        sufficient_spacing_below = False
+        if next_line_bbox:
+            vertical_spacing_below = next_line_bbox[1] - y0
+            sufficient_spacing_below = vertical_spacing_below > vertical_thres
+            if sufficient_spacing_below:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_below")
+        is_regular_line = self._is_regular_line(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
+        )
+        if is_regular_line:
+            start_confidence += 0.1
+            decision_path.append("is_regular_line")
+        is_start_of_para = (
+            (sufficient_spacing_above or sufficient_spacing_below)
+            or (indent_condition)
+            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
+            or prev_line_is_end_of_para
+        )
+        return (is_start_of_para, start_confidence, decision_path)
+    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
+        """
+        This function checks if the line is a possible end of a paragraph
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        next_line : dict
+            next line
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_char_width : float
+            average of char widths
+        Returns
+        -------
+        bool
+            True if the line is a possible end of a paragraph, False otherwise.
+        """
+        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
+        decision_path = []  # Record the decision path
+        curr_line_bbox = curr_line["bbox"]
+        next_line_bbox = next_line["bbox"] if next_line else None
+        left_horizontal_ratio = 0.5
+        right_horizontal_ratio = 0.5
+        x0, _, x1, y1 = curr_line_bbox
+        next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+        x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
+        if x0_near_X0:
+            end_confidence += 0.1
+            decision_path.append("x0_near_X0")
+        x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
+        if x1_smaller_than_X1:
+            end_confidence += 0.1
+            decision_path.append("x1_smaller_than_X1")
+        next_line_is_start_of_para = (
+            next_line_bbox
+            and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
+            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
+        )
+        if next_line_is_start_of_para:
+            end_confidence += 0.2
+            decision_path.append("next_line_is_start_of_para")
+        is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
+            curr_line_bbox, None, next_line_bbox, avg_char_width
+        )
+        if is_line_left_aligned_from_neighbors_bool:
+            end_confidence += 0.1
+            decision_path.append("line_is_left_aligned_from_neighbors")
+        is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
+            curr_line_bbox, None, next_line_bbox, avg_char_width
+        )
+        if not is_line_right_aligned_from_neighbors_bool:
+            end_confidence += 0.1
+            decision_path.append("line_is_not_right_aligned_from_neighbors")
+        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
+            (x0_near_X0 and x1_smaller_than_X1)
+            or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
+        )
+        return (is_end_of_para, end_confidence, decision_path)
+    def _cut_paras_per_block(
+        self,
+        block,
+    ):
+        """
+        Processes a raw block from PyMuPDF and returns the processed block.
+        Parameters
+        ----------
+        raw_block : dict
+            A raw block from pymupdf.
+        Returns
+        -------
+        processed_block : dict
+        """
+        def _construct_para(lines, is_block_title, para_title_level):
+            """
+            Construct a paragraph from given lines.
+            """
+            font_sizes = [span["size"] for line in lines for span in line["spans"]]
+            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
+            font_colors = [span["color"] for line in lines for span in line["spans"]]
+            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
+            font_type_lengths = {}
+            for line in lines:
+                for span in line["spans"]:
+                    font_type = span["font"]
+                    bbox_width = span["bbox"][2] - span["bbox"][0]
+                    if font_type in font_type_lengths:
+                        font_type_lengths[font_type] += bbox_width
+                    else:
+                        font_type_lengths[font_type] = bbox_width
+            # get the font type with the longest bbox width
+            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
+            para_bbox = calculate_para_bbox(lines)
+            para_text = " ".join(line["text"] for line in lines)
+            return {
+                "para_bbox": para_bbox,
+                "para_text": para_text,
+                "para_font_type": most_common_font_type,
+                "para_font_size": avg_font_size,
+                "para_font_color": most_common_font_color,
+                "is_para_title": is_block_title,
+                "para_title_level": para_title_level,
+            }
+        block_bbox = block["bbox"]
+        block_text = block["text"]
+        block_lines = block["lines"]
+        X0 = safe_get(block, "X0", 0)
+        X1 = safe_get(block, "X1", 0)
+        avg_char_width = safe_get(block, "avg_char_width", 0)
+        avg_char_height = safe_get(block, "avg_char_height", 0)
+        avg_font_size = safe_get(block, "avg_font_size", 0)
+        is_block_title = safe_get(block, "is_block_title", False)
+        para_title_level = safe_get(block, "block_title_level", 0)
+        # Segment into paragraphs
+        para_ranges = []
+        in_paragraph = False
+        start_idx_of_para = None
+        # Create the processed paragraphs
+        processed_paras = {}
+        para_bboxes = []
+        end_idx_of_para = 0
+        for line_index, line in enumerate(block_lines):
+            curr_line = line
+            prev_line = block_lines[line_index - 1] if line_index > 0 else None
+            next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
+            """
+            Start processing paragraphs.
+            """
+            # Check if the line is the start of a paragraph
+            is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
+                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
+            )
+            if not in_paragraph and is_start_of_para:
+                in_paragraph = True
+                start_idx_of_para = line_index
+                # print_green(">>> Start of a paragraph")
+                # print("    curr_line_text: ", curr_line["text"])
+                # print("    start_confidence: ", start_confidence)
+                # print("    decision_path: ", decision_path)
+            # Check if the line is the end of a paragraph
+            is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
+                curr_line, next_line, X0, X1, avg_char_width
+            )
+            if in_paragraph and (is_end_of_para or not next_line):
+                para_ranges.append((start_idx_of_para, line_index))
+                start_idx_of_para = None
+                in_paragraph = False
+                # print_red(">>> End of a paragraph")
+                # print("    curr_line_text: ", curr_line["text"])
+                # print("    end_confidence: ", end_confidence)
+                # print("    decision_path: ", decision_path)
+        # Add the last paragraph if it is not added
+        if in_paragraph and start_idx_of_para is not None:
+            para_ranges.append((start_idx_of_para, len(block_lines) - 1))
+        # Process the matched paragraphs
+        for para_index, (start_idx, end_idx) in enumerate(para_ranges):
+            matched_lines = block_lines[start_idx : end_idx + 1]
+            para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
+            para_key = f"para_{len(processed_paras)}"
+            processed_paras[para_key] = para_properties
+            para_bboxes.append(para_properties["para_bbox"])
+            end_idx_of_para = end_idx + 1
+        # Deal with the remaining lines
+        if end_idx_of_para < len(block_lines):
+            unmatched_lines = block_lines[end_idx_of_para:]
+            unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
+            unmatched_key = f"para_{len(processed_paras)}"
+            processed_paras[unmatched_key] = unmatched_properties
+            para_bboxes.append(unmatched_properties["para_bbox"])
+        block["paras"] = processed_paras
+        return block
+    def batch_process_blocks(self, pdf_dict):
+        """
+        Parses the blocks of all pages.
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        filter_blocks : list
+            List of bounding boxes to filter.
+        Returns
+        -------
+        result_dict : dict
+            Result dictionary.
+        """
+        num_paras = 0
+        for page_id, page in pdf_dict.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "para_blocks" in page.keys():
+                    input_blocks = page["para_blocks"]
+                    for input_block in input_blocks:
+                        new_block = self._cut_paras_per_block(input_block)
+                        para_blocks.append(new_block)
+                        num_paras += len(new_block["paras"])
+                page["para_blocks"] = para_blocks
+        pdf_dict["statistics"]["num_paras"] = num_paras
+        return pdf_dict
+class BlockContinuationProcessor:
+    """
+    This class is used to process the blocks to detect block continuations.
+    """
+    def __init__(self) -> None:
+        pass
+    def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3):
+        """
+        This function checks if the two font types are similar.
+        Definition of similar font types: the two font types have a common prefix,
+        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
+        Parameters
+        ----------
+        font_type1 : str
+            font type 1
+        font_type2 : str
+            font type 2
+        prefix_length_ratio : float
+            minimum ratio of the common prefix length to the length of the shorter font type
+        Returns
+        -------
+        bool
+            True if the two font types are similar, False otherwise.
+        """
+        if isinstance(font_type_1, list):
+            font_type_1 = font_type_1[0] if font_type_1 else ""
+        if isinstance(font_type_2, list):
+            font_type_2 = font_type_2[0] if font_type_2 else ""
+        if font_type_1 == font_type_2:
+            return True
+        # Find the length of the common prefix
+        common_prefix_length = len(os.path.commonprefix([font_type_1, font_type_2]))
+        # Calculate the minimum prefix length based on the ratio
+        min_prefix_length = int(min(len(font_type_1), len(font_type_2)) * prefix_length_ratio)
+        return common_prefix_length >= min_prefix_length
+    def __is_same_block_font(self, block_1, block_2):
+        """
+        This function compares the font of block1 and block2
+        Parameters
+        ----------
+        block1 : dict
+            block1
+        block2 : dict
+            block2
+        Returns
+        -------
+        is_same : bool
+            True if block1 and block2 have the same font, else False
+        """
+        block_1_font_type = safe_get(block_1, "block_font_type", "")
+        block_1_font_size = safe_get(block_1, "block_font_size", 0)
+        block_1_avg_char_width = safe_get(block_1, "avg_char_width", 0)
+        block_2_font_type = safe_get(block_2, "block_font_type", "")
+        block_2_font_size = safe_get(block_2, "block_font_size", 0)
+        block_2_avg_char_width = safe_get(block_2, "avg_char_width", 0)
+        if isinstance(block_1_font_size, list):
+            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
+        if isinstance(block_2_font_size, list):
+            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
+        block_1_text = safe_get(block_1, "text", "")
+        block_2_text = safe_get(block_2, "text", "")
+        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
+            return False
+        if not block_1_text or not block_2_text:
+            return False
+        else:
+            text_len_ratio = len(block_2_text) / len(block_1_text)
+            if text_len_ratio < 0.2:
+                avg_char_width_condition = (
+                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
+                    < 0.5
+                )
+            else:
+                avg_char_width_condition = (
+                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
+                    < 0.2
+                )
+        block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1
+        return (
+            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
+            and avg_char_width_condition
+            and block_font_size_condition
+        )
+    def _is_alphabet_char(self, char):
+        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
+            return True
+        else:
+            return False
+    def _is_chinese_char(self, char):
+        if char >= "\u4e00" and char <= "\u9fa5":
+            return True
+        else:
+            return False
+    def _is_other_letter_char(self, char):
+        try:
+            cat = unicodedata.category(char)
+            if cat == "Lu" or cat == "Ll":
+                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
+        except TypeError:
+            print("The input to the function must be a single character.")
+        return False
+    def _is_year(self, s: str):
+        try:
+            number = int(s)
+            return 1900 <= number <= 2099
+        except ValueError:
+            return False
+    def _match_brackets(self, text):
+        # pattern = r"^[\(\)\[\]（）【】{}｛｝<>＜＞〔〕〘〙\"\'“”‘’]"
+        pattern = r"^[\(\)\]（）】{}｛｝>＞〕〙\"\'“”‘’]"
+        return bool(re.match(pattern, text))
+    def _is_para_font_consistent(self, para_1, para_2):
+        """
+        This function compares the font of para1 and para2
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 have the same font, else False
+        """
+        if para_1 is None or para_2 is None:
+            return False
+        para_1_font_type = safe_get(para_1, "para_font_type", "")
+        para_1_font_size = safe_get(para_1, "para_font_size", 0)
+        para_1_font_color = safe_get(para_1, "para_font_color", "")
+        para_2_font_type = safe_get(para_2, "para_font_type", "")
+        para_2_font_size = safe_get(para_2, "para_font_size", 0)
+        para_2_font_color = safe_get(para_2, "para_font_color", "")
+        if isinstance(para_1_font_type, list):  # get the most common font type
+            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
+        if isinstance(para_2_font_type, list):
+            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
+        if isinstance(para_1_font_size, list):  # compute average font type
+            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
+        if isinstance(para_2_font_size, list):  # compute average font type
+            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
+        return (
+            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
+            and abs(para_1_font_size - para_2_font_size) < 1.5
+            # and para_font_color1 == para_font_color2
+        )
+    def _is_para_puncs_consistent(self, para_1, para_2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph by using the puncs, else False
+        """
+        para_1_text = safe_get(para_1, "para_text", "").strip()
+        para_2_text = safe_get(para_2, "para_text", "").strip()
+        para_1_bboxes = safe_get(para_1, "para_bbox", [])
+        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
+        para_2_bboxes = safe_get(para_2, "para_bbox", [])
+        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
+        # print_yellow("    Features of determine puncs_consistent:")
+        # print(f"    para_1_text: {para_1_text}")
+        # print(f"    para_2_text: {para_2_text}")
+        # print(f"    para_1_bboxes: {para_1_bboxes}")
+        # print(f"    para_2_bboxes: {para_2_bboxes}")
+        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
+        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
+        if is_nested_list(para_1_bboxes):
+            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
+        else:
+            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
+        if is_nested_list(para_2_bboxes):
+            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
+            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
+        else:
+            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
+        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
+        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
+        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
+        # Check if either para_text1 or para_text2 is empty
+        if not para_1_text or not para_2_text:
+            return False
+        # Define the end puncs for a sentence to end and hyphen
+        end_puncs = [".", "?", "!", "。", "？", "！", "…"]
+        hyphen = ["-", "—"]
+        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
+        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
+        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
+        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
+        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
+        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
+        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
+        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
+        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
+        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
+            # print_red(f"para_1 is end with hyphen.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] in hyphen
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
+            # print_red(f"para_1 is end with end_punc.")
+            para_2_is_consistent = (
+                para_2_text
+                and (
+                    para_2_text[0]
+                    == " "
+                    # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
+                    # or (self._is_chinese_char(para_2_text[0]))
+                    # or (self._is_other_letter_char(para_2_text[0]))
+                )
+                and not is_para2_left_indent_than_papa1
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
+            # print_red(f"para_1 is NOT end with end_punc.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] == " "
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_alphabet_char(para_2_text[0]))
+                or (self._is_year(para_2_text[0:4]))
+                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+                or (self._match_brackets(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+        elif para_1_end_with_space:  # If para_text1 ends with space
+            # print_red(f"para_1 is end with space.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] == " "
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                pass
+                # print(f"para_2 is not consistent.\n")
+        return False
+    def _is_block_consistent(self, block_1, block_2):
+        """
+        This function determines whether block1 and block2 are originally from the same block
+        Parameters
+        ----------
+        block1 : dict
+            block1s
+        block2 : dict
+            block2
+        Returns
+        -------
+        is_same : bool
+            True if block1 and block2 are from the same block, else False
+        """
+        return self.__is_same_block_font(block_1, block_2)
+    def _is_para_continued(self, para_1, para_2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph, else False
+        """
+        is_para_font_consistent = self._is_para_font_consistent(para_1, para_2)
+        is_para_puncs_consistent = self._is_para_puncs_consistent(para_1, para_2)
+        return is_para_font_consistent and is_para_puncs_consistent
+    def _are_boundaries_of_block_consistent(self, block_1, block_2):
+        """
+        This function checks if the boundaries of block1 and block2 are consistent
+        Parameters
+        ----------
+        block1 : dict
+            block1
+        block2 : dict
+            block2
+        Returns
+        -------
+        is_consistent : bool
+            True if the boundaries of block1 and block2 are consistent, else False
+        """
+        last_line_of_block_1 = block_1["lines"][-1]
+        first_line_of_block_2 = block_2["lines"][0]
+        spans_of_last_line_of_block_1 = last_line_of_block_1["spans"]
+        spans_of_first_line_of_block_2 = first_line_of_block_2["spans"]
+        font_type_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["font"].lower()
+        font_size_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["size"]
+        font_color_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["color"]
+        font_flags_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["flags"]
+        font_type_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["font"].lower()
+        font_size_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["size"]
+        font_color_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["color"]
+        font_flags_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["flags"]
+        return (
+            self.__is_similar_font_type(font_type_of_last_line_of_block_1, font_type_of_first_line_of_block_2)
+            and abs(font_size_of_last_line_of_block_1 - font_size_of_first_line_of_block_2) < 1
+            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
+            and font_flags_of_last_line_of_block_1 == font_flags_of_first_line_of_block_2
+        )
+    def should_merge_next_para(self, curr_para, next_para):
+        """
+        This function checks if the next_para should be merged into the curr_para.
+        Parameters
+        ----------
+        curr_para : dict
+            The current paragraph.
+        next_para : dict
+            The next paragraph.
+        Returns
+        -------
+        bool
+            True if the next_para should be merged into the curr_para, False otherwise.
+        """
+        if self._is_para_continued(curr_para, next_para):
+            return True
+        else:
+            return False
+    def batch_tag_paras(self, pdf_dict):
+        """
+        This function tags the paragraphs in the pdf_dict.
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        Returns
+        -------
+        pdf_dict : dict
+            PDF dictionary with tagged paragraphs.
+        """
+        the_last_page_id = len(pdf_dict) - 1
+        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
+            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
+                para_blocks_of_curr_page = curr_page_content["para_blocks"]
+                next_page_idx = curr_page_idx + 1
+                next_page_id = f"page_{next_page_idx}"
+                next_page_content = pdf_dict.get(next_page_id, {})
+                for i, current_block in enumerate(para_blocks_of_curr_page):
+                    for para_id, curr_para in current_block["paras"].items():
+                        curr_para["curr_para_location"] = [
+                            curr_page_idx,
+                            current_block["block_id"],
+                            int(para_id.split("_")[-1]),
+                        ]
+                        curr_para["next_para_location"] = None  # 默认设置为None
+                        curr_para["merge_next_para"] = False  # 默认设置为False
+                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
+                    if next_block:
+                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
+                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
+                        next_block_first_para_key = list(next_block["paras"].keys())[0]
+                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
+                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
+                            curr_blk_last_para["next_para_location"] = [
+                                curr_page_idx,
+                                next_block["block_id"],
+                                int(next_block_first_para_key.split("_")[-1]),
+                            ]
+                            curr_blk_last_para["merge_next_para"] = True
+                    else:
+                        # Handle the case where the next block is in a different page
+                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
+                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
+                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
+                            next_page_idx += 1
+                            next_page_id = f"page_{next_page_idx}"
+                            next_page_content = pdf_dict.get(next_page_id, {})
+                        if next_page_content.get("para_blocks", []):
+                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
+                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
+                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
+                                curr_blk_last_para["next_para_location"] = [
+                                    next_page_idx,
+                                    next_page_content["para_blocks"][0]["block_id"],
+                                    int(next_blk_first_para_key.split("_")[-1]),
+                                ]
+                                curr_blk_last_para["merge_next_para"] = True
+        return pdf_dict
+    def find_block_by_id(self, para_blocks, block_id):
+        """
+        This function finds a block by its id.
+        Parameters
+        ----------
+        para_blocks : list
+            List of blocks.
+        block_id : int
+            Id of the block to find.
+        Returns
+        -------
+        block : dict
+            The block with the given id.
+        """
+        for block in para_blocks:
+            if block.get("block_id") == block_id:
+                return block
+        return None
+    def batch_merge_paras(self, pdf_dict):
+        """
+        This function merges the paragraphs in the pdf_dict.
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        Returns
+        -------
+        pdf_dict : dict
+            PDF dictionary with merged paragraphs.
+        """
+        for page_id, page_content in pdf_dict.items():
+            if page_id.startswith("page_") and page_content.get("para_blocks", []):
+                para_blocks_of_page = page_content["para_blocks"]
+                for i in range(len(para_blocks_of_page)):
+                    current_block = para_blocks_of_page[i]
+                    paras = current_block["paras"]
+                    for para_id, curr_para in list(paras.items()):
+                        # 跳过标题段落
+                        if curr_para.get("is_para_title"):
+                            continue
+                        while curr_para.get("merge_next_para"):
+                            next_para_location = curr_para.get("next_para_location")
+                            if not next_para_location:
+                                break
+                            next_page_idx, next_block_id, next_para_id = next_para_location
+                            next_page_id = f"page_{next_page_idx}"
+                            next_page_content = pdf_dict.get(next_page_id)
+                            if not next_page_content:
+                                break
+                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
+                            if not next_block:
+                                break
+                            next_para = next_block["paras"].get(f"para_{next_para_id}")
+                            if not next_para or next_para.get("is_para_title"):
+                                break
+                            # 合并段落文本
+                            curr_para_text = curr_para.get("para_text", "")
+                            next_para_text = next_para.get("para_text", "")
+                            curr_para["para_text"] = curr_para_text + " " + next_para_text
+                            # 更新 next_para_location
+                            curr_para["next_para_location"] = next_para.get("next_para_location")
+                            # 将下一个段落文本置为空，表示已被合并
+                            next_para["para_text"] = ""
+                            # 更新 merge_next_para 标记
+                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
+        return pdf_dict
+class DrawAnnos:
+    """
+    This class draws annotations on the pdf file
+    ----------------------------------------
+                Color Code
+    ----------------------------------------
+        Red: (1, 0, 0)
+        Green: (0, 1, 0)
+        Blue: (0, 0, 1)
+        Yellow: (1, 1, 0) - mix of red and green
+        Cyan: (0, 1, 1) - mix of green and blue
+        Magenta: (1, 0, 1) - mix of red and blue
+        White: (1, 1, 1) - red, green and blue full intensity
+        Black: (0, 0, 0) - no color component whatsoever
+        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
+        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
+    """
+    def __init__(self) -> None:
+        pass
+    def __is_nested_list(self, lst):
+        """
+        This function returns True if the given list is a nested list of any degree.
+        """
+        if isinstance(lst, list):
+            return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
+        return False
+    def __valid_rect(self, bbox):
+        # Ensure that the rectangle is not empty or invalid
+        if isinstance(bbox[0], list):
+            return False  # It's a nested list, hence it can't be valid rect
+        else:
+            return bbox[0] < bbox[2] and bbox[1] < bbox[3]
+    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
+        """
+        This function draws the nested boxes
+        Parameters
+        ----------
+        page : fitz.Page
+            page
+        nested_bbox : list
+            nested bbox
+        color : tuple
+            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
+        """
+        if self.__is_nested_list(nested_bbox):  # If it's a nested list
+            for bbox in nested_bbox:
+                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
+        elif self.__valid_rect(nested_bbox):  # If valid rectangle
+            para_rect = fitz.Rect(nested_bbox)
+            para_anno = page.add_rect_annot(para_rect)
+            para_anno.set_colors(stroke=color)  # draw with cyan color for combined paragraph
+            para_anno.set_border(width=1)
+            para_anno.update()
+    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
+        """
+        This function draws annotations on the pdf file.
+        Parameters
+        ----------
+        input_pdf_path : str
+            path to the input pdf file
+        pdf_dic : dict
+            pdf dictionary
+        output_pdf_path : str
+            path to the output pdf file
+        pdf_dic : dict
+            pdf dictionary
+        """
+        pdf_doc = open_pdf(input_pdf_path)
+        if pdf_dic is None:
+            pdf_dic = {}
+        if output_pdf_path is None:
+            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
+        for page_id, page in enumerate(pdf_doc):  # type: ignore
+            page_key = f"page_{page_id}"
+            for ele_key, ele_data in pdf_dic[page_key].items():
+                if ele_key == "para_blocks":
+                    para_blocks = ele_data
+                    for para_block in para_blocks:
+                        if "paras" in para_block.keys():
+                            paras = para_block["paras"]
+                            for para_key, para_content in paras.items():
+                                para_bbox = para_content["para_bbox"]
+                                # print(f"para_bbox: {para_bbox}")
+                                # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
+                                if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
+                                    color = (0, 1, 1)
+                                    self.__draw_nested_boxes(
+                                        page, para_bbox, color
+                                    )  # draw with cyan color for combined paragraph
+                                else:
+                                    if self.__valid_rect(para_bbox):
+                                        para_rect = fitz.Rect(para_bbox)
+                                        para_anno = page.add_rect_annot(para_rect)
+                                        para_anno.set_colors(stroke=(0, 1, 0))  # draw with green color for normal paragraph
+                                        para_anno.set_border(width=0.5)
+                                        para_anno.update()
+                                is_para_title = para_content["is_para_title"]
+                                if is_para_title:
+                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
+                                        color = (0, 0, 1)
+                                        self.__draw_nested_boxes(
+                                            page, para_content["para_bbox"], color
+                                        )  # draw with cyan color for combined title
+                                    else:
+                                        if self.__valid_rect(para_content["para_bbox"]):
+                                            para_rect = fitz.Rect(para_content["para_bbox"])
+                                            if self.__valid_rect(para_content["para_bbox"]):
+                                                para_anno = page.add_rect_annot(para_rect)
+                                                para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
+                                                para_anno.set_border(width=0.5)
+                                                para_anno.update()
+        pdf_doc.save(output_pdf_path)
+        pdf_doc.close()
+class ParaProcessPipeline:
+    def __init__(self) -> None:
+        pass
+    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
+        """
+        This function processes the paragraphs, including:
+        1. Read raw input json file into pdf_dic
+        2. Detect and replace equations
+        3. Combine spans into a natural line
+        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        5. Compute statistics for each block
+        6. Detect titles in the document
+        7. Detect paragraphs inside each block
+        8. Divide the level of the titles
+        9. Detect and combine paragraphs from different blocks into one paragraph
+        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
+        11. Draw annotations on the pdf file
+        Parameters
+        ----------
+        pdf_dic_json_fpath : str
+            path to the pdf dictionary json file.
+            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
+        input_pdf_doc : str
+            path to the input pdf file
+        output_pdf_path : str
+            path to the output pdf file
+        Returns
+        -------
+        pdf_dict : dict
+            result dictionary
+        """
+        error_info = None
+        output_json_file = ""
+        output_dir = ""
+        if input_pdf_path is not None:
+            input_pdf_path = os.path.abspath(input_pdf_path)
+            # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
+        if output_pdf_path is not None:
+            output_dir = os.path.dirname(output_pdf_path)
+            output_json_file = f"{output_dir}/pdf_dic.json"
+        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
+            """
+            Save the pdf_dic to a json file
+            """
+            output_pdf_file_name = os.path.basename(output_pdf_path)
+            # output_dir = os.path.dirname(output_pdf_path)
+            output_dir = "\\tmp\\pdf_parse"
+            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
+            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+            if para_debug_mode == "full":
+                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
+                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
+            # Validate the output already exists
+            if not os.path.exists(pdf_dic_json_fpath):
+                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
+                return None
+            else:
+                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
+            return pdf_dic_json_fpath
+        """
+        Preprocess the lines of block
+        """
+        # Combine spans into a natural line
+        rawBlockProcessor = RawBlockProcessor()
+        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
+        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
+        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        layoutFilter = LayoutFilterProcessor()
+        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
+        # Compute statistics for each block
+        blockStatisticsCalculator = BlockStatisticsCalculator()
+        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
+        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
+        # Compute statistics for all blocks(namely this pdf document)
+        docStatisticsCalculator = DocStatisticsCalculator()
+        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
+        # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
+        # Dump the first three stages of pdf_dic to a json file
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
+        """
+        Detect titles in the document
+        """
+        doc_statistics = pdf_dic["statistics"]
+        titleProcessor = TitleProcessor(doc_statistics)
+        titleProcessor.stage = 0
+        pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
+        """
+        Detect and divide the level of the titles
+        """
+        titleProcessor = TitleProcessor()
+        pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
+        """
+        Detect and split paragraphs inside each block
+        """
+        blockInnerParasProcessor = BlockTerminationProcessor()
+        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
+        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
+        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
+        """
+        Detect and combine paragraphs from different blocks into one paragraph
+        """
+        blockContinuationProcessor = BlockContinuationProcessor()
+        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
+        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
+        """
+        Detect titles in the document again
+        """
+        doc_statistics = pdf_dic["statistics"]
+        titleProcessor = TitleProcessor(doc_statistics)
+        titleProcessor.stage = 1
+        # pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
+        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
+        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
+        """
+        Discard pdf files by checking exceptions and return the error info to the caller
+        """
+        discardByException = DiscardByException()
+        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
+            pdf_dic, exception=DenseSingleLineBlockException()
+        )
+        is_discard_by_title_detection = discardByException.discard_by_title_detection(
+            pdf_dic, exception=TitleDetectionException()
+        )
+        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
+        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
+        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
+        if is_discard_by_single_line_block is not None:
+            error_info = is_discard_by_single_line_block
+        elif is_discard_by_title_detection is not None:
+            error_info = is_discard_by_title_detection
+        elif is_discard_by_title_level is not None:
+            error_info = is_discard_by_title_level
+        elif is_discard_by_split_para is not None:
+            error_info = is_discard_by_split_para
+        elif is_discard_by_merge_para is not None:
+            error_info = is_discard_by_merge_para
+        if error_info is not None:
+            return pdf_dic, error_info
+        """
+        Dump the final pdf_dic to a json file
+        """
+        if para_debug_mode is not None:
+            with open(output_json_file, "w", encoding="utf-8") as f:
+                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+        """
+        Draw the annotations
+        """
+        if para_debug_mode is not None:
+            drawAnnos = DrawAnnos()
+            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
+        """
+        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
+        """
+        if para_debug_mode is not None:
+            for fpath in os.listdir(output_dir):
+                if fpath.endswith(".json") and "stage" in fpath:
+                    os.remove(os.path.join(output_dir, fpath))
+        return pdf_dic, error_info
+"""
+Run this script to test the function with Command: 
+python pdf2text_recogPara.py [pdf_path] [output_pdf_path]
+Params:
+- pdf_path: the path of the pdf file
+- output_pdf_path: the path of the output pdf file
+"""
+if __name__ == "__main__":
+    DEFAULT_PDF_PATH = (
+        "app/pdf_toolbox/test/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\test\\assets\\paper\\paper.pdf"
+    )
+    input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
+    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
+    output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_path.split(".")[0] + "_recogPara.json"
+    import stat
+    # Remove existing output file if it exists
+    if os.path.exists(output_pdf_path):
+        os.chmod(output_pdf_path, stat.S_IWRITE)
+        os.remove(output_pdf_path)
+    input_pdf_doc = open_pdf(input_pdf_path)
+    # postprocess the paragraphs
+    paraProcessPipeline = ParaProcessPipeline()
+    # parse paragraph and save to json file
+    pdf_dic = {}
+    blockInnerParasProcessor = BlockTerminationProcessor()
+    """
+    Construct the pdf dictionary.
+    """
+    for page_id, page in enumerate(input_pdf_doc):  # type: ignore
+        # print(f"Processing page {page_id}")
+        # print(f"page: {page}")
+        raw_blocks = page.get_text("dict")["blocks"]
+        # Save text blocks to "preproc_blocks"
+        preproc_blocks = []
+        for block in raw_blocks:
+            if block["type"] == 0:
+                preproc_blocks.append(block)
+        layout_bboxes = []
+        # Construct the pdf dictionary as schema above
+        page_dict = {
+            "para_blocks": None,
+            "preproc_blocks": preproc_blocks,
+            "images": None,
+            "tables": None,
+            "interline_equations": None,
+            "inline_equations": None,
+            "layout_bboxes": None,
+            "pymu_raw_blocks": None,
+            "global_statistic": None,
+            "droped_text_block": None,
+            "droped_image_block": None,
+            "droped_table_block": None,
+            "image_backup": None,
+            "table_backup": None,
+        }
+        pdf_dic[f"page_{page_id}"] = page_dict
+    # print(f"pdf_dic: {pdf_dic}")
+    with open(output_json_path, "w", encoding="utf-8") as f:
+        json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
+    pdf_dic = paraProcessPipeline.para_process_pipeline(output_json_path, input_pdf_doc, output_pdf_path)
--- a/pdf2text_recogTable.py
+++ b/pdf2text_recogTable.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #--------- 通过json_from_DocXchain来获取 table ---------#
+    table_bbox_from_DocXChain = []
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    for xf in xf_json['layout_dets']:
+    # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
+    #  13: 'embedding',     # 嵌入公式
+    #  14: 'isolated'}      # 单行公式
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 7 and xf['score'] >= 0.3:
+            table_bbox_from_DocXChain.append((L, U, R, D))
+    table_final_names = []
+    table_final_bboxs = []
+    table_ID = 0
+    for L, U, R, D in table_bbox_from_DocXChain:
+        # cur_table = page.get_pixmap(clip=(L,U,R,D))
+        new_table_name = "table_{}_{}.png".format(page_ID, table_ID)      # 表格name
+        # cur_table.save(res_dir_path + '/' + new_table_name)        # 把表格存出在新建的文件夹，并命名
+        table_final_names.append(new_table_name)                      # 把表格的名字存在list中，方便在md中插入引用
+        table_final_bboxs.append((L, U, R, D))
+        table_ID += 1
+    table_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    curPage_all_table_bboxs = table_final_bboxs
+    return curPage_all_table_bboxs
--- a/pdf2text_recogTitle.py
+++ b/pdf2text_recogTitle.py
+import os                   
+import collections      # 统计库
+import re               # 正则
+from libs.commons import fitz             # pyMuPDF库
+import json             # json
+def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    pageL = 0
+    pageR = int(pix.w)
+    pageU = 0
+    pageD = int(pix.h)
+    #--------- 通过json_from_DocXchain来获取 title ---------#
+    title_bbox_from_DocXChain = []
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+    # {0: 'title',  # 标题
+    # 1: 'figure', # 图片
+    #  2: 'plain text',  # 文本
+    #  3: 'header',      # 页眉
+    #  4: 'page number', # 页码
+    #  5: 'footnote',    # 脚注
+    #  6: 'footer',      # 页脚
+    #  7: 'table',       # 表格
+    #  8: 'table caption',  # 表格描述
+    #  9: 'figure caption', # 图片描述
+    #  10: 'equation',      # 公式
+    #  11: 'full column',   # 单栏
+    #  12: 'sub column',    # 多栏
+    #  13: 'embedding',     # 嵌入公式
+    #  14: 'isolated'}      # 单行公式
+    for xf in xf_json['layout_dets']:
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 0 and xf['score'] >= 0.3:
+            title_bbox_from_DocXChain.append((L, U, R, D))
+    title_final_names = []
+    title_final_bboxs = []
+    title_ID = 0
+    for L, U, R, D in title_bbox_from_DocXChain:
+        # cur_title = page.get_pixmap(clip=(L,U,R,D))
+        new_title_name = "title_{}_{}.png".format(page_ID, title_ID)    # 标题name
+        # cur_title.save(res_dir_path + '/' + new_title_name)           # 把标题存储在新建的文件夹，并命名
+        title_final_names.append(new_title_name)                        # 把标题的名字存在list中
+        title_final_bboxs.append((L, U, R, D))
+        title_ID += 1
+    title_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    curPage_all_title_bboxs = title_final_bboxs
+    return curPage_all_title_bboxs
--- a/pdf_parse_by_model.py
+++ b/pdf_parse_by_model.py
+import time
+# from anyio import Path
+from libs.commons import fitz, get_delta_time, get_img_s3_client
+import json
+import os
+import math
+from loguru import logger
+from layout.bbox_sort import (
+    prepare_bboxes_for_layout_split,
+)
+from layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
+from libs.drop_reason import DropReason
+from libs.markdown_utils import escape_special_markdown_char
+from libs.safe_filename import sanitize_filename
+from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
+from pdf2text_recogFigure import parse_images
+from pdf2text_recogFootnoteLine import remove_headder_footer_one_page  # 获取figures的bbox
+from pdf2text_recogTable import parse_tables  # 获取tables的bbox
+from pdf2text_recogEquation import parse_equations  # 获取equations的bbox
+from pdf2text_recogHeader import parse_headers  # 获取headers的bbox
+from pdf2text_recogPageNo import parse_pageNos  # 获取pageNos的bbox
+from pdf2text_recogFootnote import parse_footnotes_by_model, parse_footnotes_by_rule  # 获取footnotes的bbox
+from pdf2text_recogFooter import parse_footers  # 获取footers的bbox
+from pdf2text_recogPara import (
+    ParaProcessPipeline,
+    TitleDetectionException,
+    TitleLevelException,
+    ParaSplitException,
+    ParaMergeException,
+    DenseSingleLineBlockException,
+)
+from pre_proc.main_text_font import get_main_text_font
+from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
+'''
+from para.para_pipeline import ParaProcessPipeline
+from para.exceptions import (
+    TitleDetectionException,
+    TitleLevelException,
+    ParaSplitException,
+    ParaMergeException,
+    DenseSingleLineBlockException,
+)
+'''
+from libs.commons import read_file, join_path
+from libs.pdf_image_tools import save_images_by_bboxes
+from post_proc.footnote_remove import merge_footnote_blocks, remove_footnote_blocks
+from pre_proc.citationmarker_remove import remove_citation_marker
+from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
+from pre_proc.pdf_filter import pdf_filter
+from pre_proc.detect_footer_header import drop_footer_header
+from pre_proc.construct_paras import construct_page_component
+from pre_proc.image_fix import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
+from post_proc.pdf_post_filter import pdf_post_filter
+from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
+from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
+from pre_proc.table_fix import fix_table_text_block, fix_tables, include_table_title
+# Messages of the paragraph-pipeline exceptions, captured once at import time by
+# instantiating each exception class with no arguments. parse_pdf_by_model compares
+# the error_info returned by the paragraph pipeline against these strings.
+denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
+titleDetectionException_msg = TitleDetectionException().message
+titleLevelException_msg = TitleLevelException().message
+paraSplitException_msg = ParaSplitException().message
+paraMergeException_msg = ParaMergeException().message
+def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
+    if isinstance(pdf_model_output, str):
+        model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")  # 模型输出的页面编号从1开始的
+        if os.path.exists(model_output_json_path):
+            json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
+            model_output_json = json.loads(json_from_docx)
+        else:
+            try:
+                model_output_json_path = join_path(pdf_model_output, "model.json")
+                with open(model_output_json_path, "r", encoding="utf-8") as f:
+                    model_output_json = json.load(f)
+                    model_output_json = model_output_json["doc_layout_result"][page_id]
+            except:
+                s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
+                s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
+                #s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
+                # logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
+                s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
+                return json.loads(s)
+    elif isinstance(pdf_model_output, list):
+        model_output_json = pdf_model_output[page_id]
+    return model_output_json
+def parse_pdf_by_model(
+    s3_pdf_path,
+    s3_pdf_profile,
+    pdf_model_output,
+    save_path,
+    book_name,
+    pdf_model_profile=None,
+    image_s3_config=None,
+    start_page_id=0,
+    end_page_id=None,
+    junk_img_bojids=[],
+    debug_mode=False,
+):
+    pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "..", "tmp", "unittest")
+    md_bookname_save_path = ""
+    book_name = sanitize_filename(book_name)
+    if debug_mode:
+        save_path = join_path(save_tmp_path, "md")
+        pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
+        if not os.path.exists(os.path.dirname(pdf_local_path)):
+            # 如果目录不存在，创建它
+            os.makedirs(os.path.dirname(pdf_local_path))
+        md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
+        if not os.path.exists(md_bookname_save_path):
+            # 如果目录不存在，创建它
+            os.makedirs(md_bookname_save_path)
+        with open(pdf_local_path + ".pdf", "wb") as pdf_file:
+            pdf_file.write(pdf_bytes)
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    pdf_info_dict = {}
+    img_s3_client = get_img_s3_client(save_path, image_s3_config)  # 更改函数名和参数，避免歧义
+    # img_s3_client = "img_s3_client"  #不创建这个对象，直接用字符串占位
+    start_time = time.time()
+    """通过统计pdf全篇文字,识别正文字体"""
+    main_text_font = get_main_text_font(pdf_docs)
+    end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    for page_id in range(start_page_id, end_page_id + 1):
+        page = pdf_docs[page_id]
+        page_width = page.rect.width
+        page_height = page.rect.height
+        if debug_mode:
+            time_now = time.time()
+            logger.info(f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}")
+            start_time = time_now
+        """
+        # 通过一个规则，过滤掉单页超过1500非junkimg的pdf
+        # 对单页面非重复id的img数量做统计,如果当前页超过1500则直接return need_drop
+        """
+        page_imgs = page.get_images()
+        img_counts = 0
+        for img in page_imgs:
+            img_bojid = img[0]
+            if img_bojid in junk_img_bojids:  # 判断这个图片在不在junklist中
+                continue  # 如果在junklist就不用管了，跳过
+            else:
+                recs = page.get_image_rects(img, transform=True)
+                if recs:  # 如果这张图在当前页面有展示
+                    img_counts += 1
+        if img_counts >= 1500:  # 如果去除了junkimg的影响，单页img仍然超过1500的话，就排除当前pdf
+            logger.warning(
+                f"page_id: {page_id}, img_counts: {img_counts}, drop this pdf: {book_name}, drop_reason: {DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}"
+            )
+            result = {"need_drop": True, "drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}
+            if not debug_mode:
+                return result
+        """
+        ==================================================================================================================================
+        首先获取基本的block数据，对pdf进行分解，获取图片、表格、公式、text的bbox
+        """
+        # 解析pdf原始文本block
+        text_raw_blocks = page.get_text(
+            "dict",
+            flags=fitz.TEXTFLAGS_TEXT,
+        )["blocks"]
+        model_output_json = get_docx_model_output(pdf_model_output, pdf_model_profile, page_id)
+        # 解析图片
+        image_bboxes = parse_images(page_id, page, model_output_json, junk_img_bojids)
+        image_bboxes = fix_image_vertical(image_bboxes, text_raw_blocks)  # 修正图片的位置
+        image_bboxes = fix_seperated_image(image_bboxes)  # 合并有边重合的图片
+        image_bboxes = include_img_title(text_raw_blocks, image_bboxes)  # 向图片上方和下方寻找title，使用规则进行匹配，暂时只支持英文规则
+        """此时image_bboxes中可能出现这种情况，水平并列的2个图片，下方分别有各自的子标题，2个子标题下方又有大标题（形如Figxxx)，会出现2个图片的bbox都包含了这个大标题，这种情况需要把图片合并"""
+        image_bboxes = combine_images(image_bboxes)  # 合并图片
+        # 解析表格并对table_bboxes进行位置的微调,防止表格周围的文字被截断
+        table_bboxes = parse_tables(page_id, page, model_output_json)
+        table_bboxes = fix_tables(page, table_bboxes, include_table_title=True, scan_line_num=2)  # 修正
+        table_bboxes = fix_table_text_block(text_raw_blocks, table_bboxes)  # 修正与text block的关系,某些table修正与pymupdf获取到的table内textblock没有完全包含，因此要进行一次修正。
+        #debug_show_bbox(pdf_docs, page_id, table_bboxes, [], [b['bbox'] for b in text_raw_blocks], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7)
+        table_bboxes = include_table_title(text_raw_blocks, table_bboxes)  # 向table上方和下方寻找title，使用规则进行匹配，暂时只支持英文规则
+        # 解析公式
+        equations_inline_bboxes, equations_interline_bboxes = parse_equations(page_id, page, model_output_json)
+        """
+        ==================================================================================================================================
+        进入预处理-1阶段
+        -------------------
+        # # 解析标题
+        # title_bboxs = parse_titles(page_id, page, model_output_json)
+        # # 评估Layout是否规整、简单
+        # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json)
+        接下来开始进行预处理过程
+        """
+        """去掉每页的页码、页眉、页脚"""
+        page_no_bboxs = parse_pageNos(page_id, page, model_output_json)
+        header_bboxs = parse_headers(page_id, page, model_output_json)
+        footer_bboxs = parse_footers(page_id, page, model_output_json)
+        image_bboxes, table_bboxes, remain_text_blocks, removed_hdr_foot_txt_block, removed_hdr_foot_img_block, removed_hdr_foot_table = remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, page_no_bboxs, page_width, page_height)
+        """去除页面上半部分长条色块内的文本块"""
+        remain_text_blocks, removed_colored_narrow_strip_background_text_block = remove_colored_strip_textblock(remain_text_blocks, page)
+        #debug_show_bbox(pdf_docs, page_id, footnote_bboxes_by_model, [b['bbox'] for b in remain_text_blocks], header_bboxs, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7)
+        """去掉旋转的文字：水印、垂直排列的文字"""
+        remain_text_blocks, removed_non_horz_text_block = remove_rotate_side_textblock(
+            remain_text_blocks, page_width, page_height
+        )  # 去掉水印，非水平文字
+        remain_text_blocks, removed_empty_side_block = remove_side_blank_block(remain_text_blocks, page_width, page_height) # 删除页面四周可能会留下的完全空白的textblock，这种block形成原因未知
+        """出现在图片、表格上的文字块去掉，把层叠的图片单独分离出来，不参与layout的计算"""
+        (
+            image_bboxes,
+            table_bboxes,
+            equations_interline_bboxes,
+            equations_inline_bboxes,
+            remain_text_blocks,
+            text_block_on_image_removed,
+            images_overlap_backup,
+            interline_eq_temp_text_block
+        ) = resolve_bbox_overlap_conflict(
+            image_bboxes, table_bboxes, equations_interline_bboxes, equations_inline_bboxes, remain_text_blocks
+        )
+        # """去掉footnote, 从文字和图片中"""
+        # # 通过模型识别到的footnote
+        # footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json, md_bookname_save_path,
+        #                                                     debug_mode=debug_mode)
+        # # 通过规则识别到的footnote
+        # footnote_bboxes_by_rule = parse_footnotes_by_rule(remain_text_blocks, page_height, page_id)
+        """
+        ==================================================================================================================================
+        """
+        if debug_mode:  # debugmode截图到本地
+            save_path = join_path(save_tmp_path, "md")
+        # 把图、表、公式都进行截图，保存到存储上，返回图片路径作为内容
+        image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = save_images_by_bboxes(
+            book_name,
+            page_id,
+            page,
+            save_path,
+            image_bboxes,
+            images_overlap_backup,
+            table_bboxes,
+            equations_inline_bboxes,
+            equations_interline_bboxes,
+            # 传入img_s3_client
+            img_s3_client,
+        )  # 只要表格和图片的截图
+        """"以下进入到公式替换环节 """
+        char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)['blocks']
+        remain_text_blocks = combine_chars_to_pymudict(remain_text_blocks, char_level_text_blocks)# 合并chars
+        remain_text_blocks = remove_citation_marker(remain_text_blocks) # 先把角标去掉
+        remain_text_blocks = replace_equations_in_textblock(remain_text_blocks, inline_eq_info, interline_eq_info)
+        remain_text_blocks = remove_chars_in_text_blocks(remain_text_blocks) # 减少中间态数据体积
+        #debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3)
+        """去掉footnote, 从文字和图片中(先去角标再去footnote试试)"""
+        # 通过模型识别到的footnote
+        footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json, md_bookname_save_path, debug_mode=debug_mode)
+        # 通过规则识别到的footnote
+        footnote_bboxes_by_rule = parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font)
+        """进入pdf过滤器，去掉一些不合理的pdf"""
+        is_good_pdf, err = pdf_filter(page, remain_text_blocks, table_bboxes, image_bboxes)
+        if not is_good_pdf:
+            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {err}")
+            if not debug_mode:
+                return err
+        """
+        ==================================================================================================================================
+        进行版面布局切分和过滤
+        """
+        """在切分之前，先检查一下bbox是否有左右重叠的情况，如果有，那么就认为这个pdf暂时没有能力处理好，这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
+        is_text_block_horz_overlap = check_text_block_horizontal_overlap(remain_text_blocks, header_bboxs, footer_bboxs)
+        if is_text_block_horz_overlap:
+            # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in remain_text_blocks], [], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 0)
+            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
+            result = {"need_drop": True, "drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP}
+            if not debug_mode:
+                return result
+        """统一格式化成一个数据结构用于计算layout"""
+        page_y0 = 0 if len(header_bboxs) == 0 else max([b[3] for b in header_bboxs])
+        page_y1 = page_height if len(footer_bboxs) == 0 else min([b[1] for b in footer_bboxs])
+        left_x, right_x = get_side_boundry(removed_non_horz_text_block, page_width, page_height)
+        page_boundry = [math.floor(left_x), page_y0 + 1, math.ceil(right_x), page_y1 - 1]
+        # 返回的是一个数组，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是段落的内容
+        all_bboxes = prepare_bboxes_for_layout_split(
+            image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, remain_text_blocks, page_boundry, page)
+        #debug_show_bbox(pdf_docs, page_id, [], [], all_bboxes, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 1)
+        """page_y0, page_y1能够过滤掉页眉和页脚，不会算作layout内"""
+        layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
+        if len(remain_text_blocks)>0 and len(all_bboxes)>0 and len(layout_bboxes)==0:
+            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
+            result = {"need_drop": True, "drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}
+            if not debug_mode:
+                return result
+        """以下去掉复杂的布局和超过2列的布局"""
+        if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]):  # 复杂的布局
+            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.COMPLICATED_LAYOUT}")
+            result = {"need_drop": True, "drop_reason": DropReason.COMPLICATED_LAYOUT}
+            if not debug_mode:
+                return result
+        layout_column_width = get_columns_cnt_of_layout(layout_tree)
+        if layout_column_width > 2:  # 去掉超过2列的布局pdf
+            logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
+            result = {
+                "need_drop": True,
+                "drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS,
+                "extra_info": {"column_cnt": layout_column_width},
+            }
+            if not debug_mode:
+                return result
+        """
+        ==================================================================================================================================
+        构造出下游需要的数据结构
+        """
+        remain_text_blocks = remain_text_blocks + interline_eq_temp_text_block # 把计算layout时候临时删除的行间公式再放回去，防止行间公式替换的时候丢失。
+        removed_text_blocks = []
+        removed_text_blocks.extend(removed_hdr_foot_txt_block)
+        # removed_text_blocks.extend(removed_footnote_text_block)
+        removed_text_blocks.extend(text_block_on_image_removed)
+        removed_text_blocks.extend(removed_non_horz_text_block)
+        removed_text_blocks.extend(removed_colored_narrow_strip_background_text_block)
+        removed_images = []
+        # removed_images.extend(footnote_imgs)
+        removed_images.extend(removed_hdr_foot_img_block)
+        images_backup = []
+        images_backup.extend(image_backup_info)
+        remain_text_blocks = escape_special_markdown_char(remain_text_blocks) # 转义span里的text
+        sorted_text_remain_text_block = sort_text_block(remain_text_blocks, layout_bboxes)
+        footnote_bboxes_tmp = []
+        footnote_bboxes_tmp.extend(footnote_bboxes_by_model)
+        footnote_bboxes_tmp.extend(footnote_bboxes_by_rule)
+        page_info = construct_page_component(
+            page_id,
+            image_info,
+            table_info,
+            sorted_text_remain_text_block,
+            layout_bboxes,
+            inline_eq_info,
+            interline_eq_info,
+            page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"],
+            removed_text_blocks=removed_text_blocks,
+            removed_image_blocks=removed_images,
+            images_backup=images_backup,
+            droped_table_block=[],
+            table_backup=[],
+            layout_tree=layout_tree,
+            page_w=page.rect.width,
+            page_h=page.rect.height,
+            footnote_bboxes_tmp=footnote_bboxes_tmp
+        )
+        pdf_info_dict[f"page_{page_id}"] = page_info
+    # end page for
+    '''计算后处理阶段耗时'''
+    start_time = time.time()
+    """
+    ==================================================================================================================================
+    去掉页眉和页脚，这里需要用到一定的统计量，所以放到最后
+    页眉和页脚主要从文本box和图片box中去除，位于页面的四周。
+    下面函数会直接修改pdf_info_dict,从文字块中、图片中删除属于页眉页脚的内容，删除内容做相对应记录
+    """
+    # 去页眉页脚
+    header, footer = drop_footer_header(pdf_info_dict)
+    """对单个layout内footnote和他下面的所有textbbox合并"""
+    for page_key, page_info in pdf_info_dict.items():
+        page_info = merge_footnote_blocks(page_info, main_text_font)
+        page_info = remove_footnote_blocks(page_info)
+        pdf_info_dict[page_key] = page_info
+    """进入pdf后置过滤器，去掉一些不合理的pdf"""
+    i = 0
+    for page_info in pdf_info_dict.values():
+        is_good_pdf, err = pdf_post_filter(page_info)
+        if not is_good_pdf:
+            logger.warning(f"page_id: {i}, drop this pdf: {book_name}, reason: {err}")
+            if not debug_mode:
+                return err
+        i += 1
+    if debug_mode:
+        params_file_save_path = join_path(save_tmp_path, "md", book_name, "preproc_out.json")
+        page_draw_rect_save_path = join_path(save_tmp_path, "md", book_name, "layout.pdf")
+        # dir_path = os.path.dirname(page_draw_rect_save_path)
+        # if not os.path.exists(dir_path):
+        #     # 如果目录不存在，创建它
+        #     os.makedirs(dir_path)
+        with open(params_file_save_path, "w", encoding="utf-8") as f:
+            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+        # 先检测本地 page_draw_rect_save_path 是否存在，如果存在则删除
+        if os.path.exists(page_draw_rect_save_path):
+            os.remove(page_draw_rect_save_path)
+        # 绘制bbox和layout到pdf
+        draw_bbox_on_page(pdf_docs, pdf_info_dict, page_draw_rect_save_path)
+        draw_layout_bbox_on_page(pdf_docs, pdf_info_dict, header, footer, page_draw_rect_save_path)
+    if debug_mode:
+        # 打印后处理阶段耗时
+        logger.info(f"post_processing_time: {get_delta_time(start_time)}")
+    """
+    ==================================================================================================================================
+    进入段落处理-2阶段
+    """
+    start_time = time.time()
+    para_process_pipeline = ParaProcessPipeline()
+    def _deal_with_text_exception(error_info):
+        logger.warning(f"page_id: {page_id}, drop this pdf: {book_name}, reason: {error_info}")
+        if error_info == denseSingleLineBlockException_msg:
+            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}")
+            result = {"need_drop": True, "drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK}
+            return result
+        if error_info == titleDetectionException_msg:
+            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_DETECTION_FAILED}")
+            result = {"need_drop": True, "drop_reason": DropReason.TITLE_DETECTION_FAILED}
+            return result
+        elif error_info == titleLevelException_msg:
+            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_LEVEL_FAILED}")
+            result = {"need_drop": True, "drop_reason": DropReason.TITLE_LEVEL_FAILED}
+            return result
+        elif error_info == paraSplitException_msg:
+            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.PARA_SPLIT_FAILED}")
+            result = {"need_drop": True, "drop_reason": DropReason.PARA_SPLIT_FAILED}
+            return result
+        elif error_info == paraMergeException_msg:
+            logger.warning(f"Drop this pdf: {book_name}, reason: {DropReason.PARA_MERGE_FAILED}")
+            result = {"need_drop": True, "drop_reason": DropReason.PARA_MERGE_FAILED}
+            return result
+    if debug_mode:
+        input_pdf_file = f"{pdf_local_path}.pdf"
+        output_dir = f"{save_path}/{book_name}"
+        output_pdf_file = f"{output_dir}/pdf_annos.pdf"
+        """
+        Call the para_process_pipeline function to process the pdf_info_dict.
+        Parameters:
+        para_debug_mode: str or None
+            If para_debug_mode is None, the para_process_pipeline will not keep any intermediate results.
+            If para_debug_mode is "simple", the para_process_pipeline will only keep the annos on the pdf and the final results as a json file.
+            If para_debug_mode is "full", the para_process_pipeline will keep all the intermediate results generated during each step.
+        """
+        pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline(
+            pdf_info_dict,
+            para_debug_mode="simple",
+            input_pdf_path=input_pdf_file,
+            output_pdf_path=output_pdf_file,
+        )
+        # 打印段落处理阶段耗时
+        logger.info(f"para_process_time: {get_delta_time(start_time)}")
+        # debug的时候不return drop信息
+        if error_info is not None:
+            _deal_with_text_exception(error_info)
+        return pdf_info_dict
+    else:
+        pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline(pdf_info_dict)
+        if error_info is not None:
+            return _deal_with_text_exception(error_info)
+    return pdf_info_dict
--- a/post_proc/__init__.py
+++ b/post_proc/__init__.py
--- a/post_proc/footnote_remove.py
+++ b/post_proc/footnote_remove.py
+from libs.boxbase import _is_in
+from pdf2text_recogFootnoteLine import remove_footnote_text, remove_footnote_image
+import collections      # 统计库
+def is_below(bbox1, bbox2):
+    # 如果block1的上边y坐标大于block2的下边y坐标，那么block1在block2下面
+    return bbox1[1] > bbox2[3]
+def merge_bboxes(bboxes):
+    # 找出所有blocks的最小x0，最大y1，最大x1，最小y0，这就是合并后的bbox
+    x0 = min(bbox[0] for bbox in bboxes)
+    y0 = min(bbox[1] for bbox in bboxes)
+    x1 = max(bbox[2] for bbox in bboxes)
+    y1 = max(bbox[3] for bbox in bboxes)
+    return [x0, y0, x1, y1]
+def merge_footnote_blocks(page_info, main_text_font):
+    page_info['merged_bboxes'] = []
+    for layout in page_info['layout_bboxes']:
+        # 找出layout中的所有footnote blocks和preproc_blocks
+        footnote_bboxes = [block for block in page_info['footnote_bboxes_tmp'] if _is_in(block, layout['layout_bbox'])]
+        # 如果没有footnote_blocks，就跳过这个layout
+        if not footnote_bboxes:
+            continue
+        preproc_blocks = [block for block in page_info['preproc_blocks'] if _is_in(block['bbox'], layout['layout_bbox'])]
+        # preproc_bboxes = [block['bbox'] for block in preproc_blocks]
+        font_names = collections.Counter()
+        if len(preproc_blocks) > 0:
+            # 存储每一行的文本块大小的列表
+            line_sizes = []
+            # 存储每个文本块的平均行大小
+            block_sizes = []
+            for block in preproc_blocks:
+                block_line_sizes = []
+                block_fonts = collections.Counter()
+                for line in block['lines']:
+                    # 提取每个span的size属性，并计算行大小
+                    span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
+                    if span_sizes:
+                        line_size = sum(span_sizes) / len(span_sizes)
+                        line_sizes.append(line_size)
+                        block_line_sizes.append(line_size)
+                    span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
+                                 'font' in span and len(span['text']) > 0]
+                    if span_font:
+                        # # todo main_text_font应该用基于字数最多的字体而不是span级别的统计
+                        # font_names.append(font_name for font_name in span_font)
+                        # block_fonts.append(font_name for font_name in span_font)
+                        for font, count in span_font:
+                            # font_names.extend([font] * count)
+                            # block_fonts.extend([font] * count)
+                            font_names[font] += count
+                            block_fonts[font] += count
+                if block_line_sizes:
+                    # 计算文本块的平均行大小
+                    block_size = sum(block_line_sizes) / len(block_line_sizes)
+                    block_font = block_fonts.most_common(1)[0][0]
+                    block_sizes.append((block, block_size, block_font))
+            # 计算main_text_size
+            # main_text_font = font_names.most_common(1)[0][0]
+            main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]
+        else:
+            continue
+        need_merge_bboxes = []
+        # 任何一个下面有正文block的footnote bbox都是假footnote
+        for footnote_bbox in footnote_bboxes:
+            # 检测footnote下面是否有正文block(正文block需满足，block平均size大于等于main_text_size，且block行数大于等于5)
+            main_text_bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if
+                                      is_below(block['bbox'], footnote_bbox) and
+                                      sum([size >= main_text_size,
+                                           len(block['lines']) >= 5,
+                                           block_font == main_text_font]) >= 2]
+            # 如果main_text_bboxes_below不为空，说明footnote下面有正文block，这个footnote不成立，跳过
+            if len(main_text_bboxes_below) > 0:
+                continue
+            else:
+                # 否则，说明footnote下面没有正文block，这个footnote成立，添加到待merge的footnote_bboxes中
+                need_merge_bboxes.append(footnote_bbox)
+        if len(need_merge_bboxes) == 0:
+            continue
+        # 找出最靠上的footnote block
+        top_footnote_bbox = min(need_merge_bboxes, key=lambda bbox: bbox[1])
+        # 找出所有在top_footnote_block下面的preproc_blocks，并确保这些preproc_blocks的平均行大小小于main_text_size
+        bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if is_below(block['bbox'], top_footnote_bbox)]
+        # # 找出所有在top_footnote_block下面的preproc_blocks
+        # bboxes_below = [bbox for bbox in preproc_bboxes if is_below(bbox, top_footnote_bbox)]
+        # 合并top_footnote_block和blocks_below
+        merged_bbox = merge_bboxes([top_footnote_bbox] + bboxes_below)
+        # 添加到新的footnote_bboxes_tmp中
+        page_info['merged_bboxes'].append(merged_bbox)
+    return page_info
+def remove_footnote_blocks(page_info):
+    if page_info.get('merged_bboxes'):
+        # 从文字中去掉footnote
+        remain_text_blocks, removed_footnote_text_blocks = remove_footnote_text(page_info['preproc_blocks'], page_info['merged_bboxes'])
+        # 从图片中去掉footnote
+        image_blocks, removed_footnote_imgs_blocks = remove_footnote_image(page_info['images'], page_info['merged_bboxes'])
+        # 更新page_info
+        page_info['preproc_blocks'] = remain_text_blocks
+        page_info['images'] = image_blocks
+        page_info['droped_text_block'].extend(removed_footnote_text_blocks)
+        page_info['droped_image_block'].extend(removed_footnote_imgs_blocks)
+        # 删除footnote_bboxes_tmp和merged_bboxes
+        del page_info['merged_bboxes']
+    del page_info['footnote_bboxes_tmp']
+    return page_info
--- a/post_proc/pdf_post_filter.py
+++ b/post_proc/pdf_post_filter.py
+from loguru import logger
+from layout.layout_sort import get_columns_cnt_of_layout
+from libs.drop_reason import DropReason
+def __is_pseudo_single_column(page_info) -> bool:
+    """
+    判断一个页面是否伪单列。
+    Args:
+        page_info (dict): 页面信息字典，包括'_layout_tree'和'preproc_blocks'。
+    Returns:
+        Tuple[bool, Optional[str]]: 如果页面伪单列返回(True, extra_info)，否则返回(False, None)。
+    """
+    layout_tree = page_info['_layout_tree']
+    layout_column_width = get_columns_cnt_of_layout(layout_tree)
+    if layout_column_width == 1:
+        text_blocks = page_info['preproc_blocks']
+        # 遍历每一个text_block
+        for text_block in text_blocks:
+            lines = text_block['lines']
+            num_lines = len(lines)
+            num_satisfying_lines = 0
+            for i in range(num_lines - 1):
+                current_line = lines[i]
+                next_line = lines[i + 1]
+                # 获取当前line和下一个line的bbox属性
+                current_bbox = current_line['bbox']
+                next_bbox = next_line['bbox']
+                # 检查是否满足条件
+                if next_bbox[0] > current_bbox[2] or next_bbox[2] < current_bbox[0]:
+                    num_satisfying_lines += 1
+            # 如果有一半以上的line满足条件，就drop
+            # print("num_satisfying_lines:", num_satisfying_lines, "num_lines:", num_lines)
+            if num_lines > 20:
+                radio = num_satisfying_lines / num_lines
+                if radio >= 0.5:
+                    extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}"
+                    block_text = []
+                    for line in lines:
+                        if line['spans']:
+                            for span in line['spans']:
+                                block_text.append(span['text'])
+                    logger.warning(f"pseudo_single_column block_text: {block_text}")
+                    return True, extra_info
+    return False, None
+def pdf_post_filter(page_info) -> tuple:
+    """
+    return:(True|False, err_msg)
+        True, 如果pdf符合要求
+        False, 如果pdf不符合要求
+    """
+    bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
+    if bool_is_pseudo_single_column:
+        return False, {"need_drop": True, "drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
+    return True, None
\ No newline at end of file