Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...
Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
2df265c8 · zhougaofeng · 826086d2 · 2df265c8 · 2df265c8 · 2df265c8
Commit 2df265c8 authored Nov 12, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/para/__init__.py
+++ b/magic_pdf/para/__init__.py
--- a/magic_pdf/para/block_continuation_processor.py
+++ b/magic_pdf/para/block_continuation_processor.py
+import os
+import unicodedata
+
+from magic_pdf.para.commons import *
+
+
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class BlockContinuationProcessor:
+    """
+    This class is used to process the blocks to detect block continuations.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
+        """
+        This function checks if the two font types are similar.
+        Definition of similar font types: the two font types have a common prefix,
+        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
+
+        Parameters
+        ----------
+        font_type1 : str
+            font type 1
+        font_type2 : str
+            font type 2
+        prefix_length_ratio : float
+            minimum ratio of the common prefix length to the length of the shorter font type
+
+        Returns
+        -------
+        bool
+            True if the two font types are similar, False otherwise.
+        """
+
+        if isinstance(font_type1, list):
+            font_type1 = font_type1[0] if font_type1 else ""
+        if isinstance(font_type2, list):
+            font_type2 = font_type2[0] if font_type2 else ""
+
+        if font_type1 == font_type2:
+            return True
+
+        # Find the length of the common prefix
+        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
+
+        # Calculate the minimum prefix length based on the ratio
+        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)
+
+        return common_prefix_length >= min_prefix_length
+
+    def __is_same_block_font(self, block1, block2):
+        """
+        This function compares the font of block1 and block2
+
+        Parameters
+        ----------
+        block1 : dict
+            block1
+        block2 : dict
+            block2
+
+        Returns
+        -------
+        is_same : bool
+            True if block1 and block2 have the same font, else False
+        """
+        block_1_font_type = safe_get(block1, "block_font_type", "")
+        block_1_font_size = safe_get(block1, "block_font_size", 0)
+        block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)
+
+        block_2_font_type = safe_get(block2, "block_font_type", "")
+        block_2_font_size = safe_get(block2, "block_font_size", 0)
+        block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)
+
+        if isinstance(block_1_font_size, list):
+            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
+        if isinstance(block_2_font_size, list):
+            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
+
+        block_1_text = safe_get(block1, "text", "")
+        block_2_text = safe_get(block2, "text", "")
+
+        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
+            return False
+
+        if not block_1_text or not block_2_text:
+            return False
+        else:
+            text_len_ratio = len(block_2_text) / len(block_1_text)
+            if text_len_ratio < 0.2:
+                avg_char_width_condition = (
+                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
+                    < 0.5
+                )
+            else:
+                avg_char_width_condition = (
+                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
+                    < 0.2
+                )
+
+        block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1
+
+        return (
+            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
+            and avg_char_width_condition
+            and block_font_size_condtion
+        )
+
+    def _is_alphabet_char(self, char):
+        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
+            return True
+        else:
+            return False
+
+    def _is_chinese_char(self, char):
+        if char >= "\u4e00" and char <= "\u9fa5":
+            return True
+        else:
+            return False
+
+    def _is_other_letter_char(self, char):
+        try:
+            cat = unicodedata.category(char)
+            if cat == "Lu" or cat == "Ll":
+                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
+        except TypeError:
+            print("The input to the function must be a single character.")
+        return False
+
+    def _is_year(self, s: str):
+        try:
+            number = int(s)
+            return 1900 <= number <= 2099
+        except ValueError:
+            return False
+
+    def __is_para_font_consistent(self, para_1, para_2):
+        """
+        This function compares the font of para1 and para2
+
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 have the same font, else False
+        """
+        if para_1 is None or para_2 is None:
+            return False
+
+        para_1_font_type = safe_get(para_1, "para_font_type", "")
+        para_1_font_size = safe_get(para_1, "para_font_size", 0)
+        para_1_font_color = safe_get(para_1, "para_font_color", "")
+
+        para_2_font_type = safe_get(para_2, "para_font_type", "")
+        para_2_font_size = safe_get(para_2, "para_font_size", 0)
+        para_2_font_color = safe_get(para_2, "para_font_color", "")
+
+        if isinstance(para_1_font_type, list):  # get the most common font type
+            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
+        if isinstance(para_2_font_type, list):
+            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
+        if isinstance(para_1_font_size, list):  # compute average font type
+            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
+        if isinstance(para_2_font_size, list):  # compute average font type
+            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
+
+        return (
+            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
+            and abs(para_1_font_size - para_2_font_size) < 1.5
+            # and para_font_color1 == para_font_color2
+        )
+
+    def _is_para_puncs_consistent(self, para_1, para_2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
+
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph by using the puncs, else False
+        """
+        para_1_text = safe_get(para_1, "para_text", "").strip()
+        para_2_text = safe_get(para_2, "para_text", "").strip()
+
+        para_1_bboxes = safe_get(para_1, "para_bbox", [])
+        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
+
+        para_2_bboxes = safe_get(para_2, "para_bbox", [])
+        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
+
+        # print_yellow("    Features of determine puncs_consistent:")
+        # print(f"    para_1_text: {para_1_text}")
+        # print(f"    para_2_text: {para_2_text}")
+        # print(f"    para_1_bboxes: {para_1_bboxes}")
+        # print(f"    para_2_bboxes: {para_2_bboxes}")
+        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
+        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
+
+        if is_nested_list(para_1_bboxes):
+            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
+        else:
+            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
+
+        if is_nested_list(para_2_bboxes):
+            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
+            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
+        else:
+            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
+
+        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
+
+        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
+        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
+
+        # Check if either para_text1 or para_text2 is empty
+        if not para_1_text or not para_2_text:
+            return False
+
+        # Define the end puncs for a sentence to end and hyphen
+        end_puncs = [".", "?", "!", "。", "？", "！", "…"]
+        hyphen = ["-", "—"]
+
+        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
+        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
+        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
+        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
+        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
+
+        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
+        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
+        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
+        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
+
+        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
+            # print_red(f"para_1 is end with hyphen.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] in hyphen
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+
+        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
+            # print_red(f"para_1 is end with end_punc.")
+            para_2_is_consistent = (
+                para_2_text
+                and (
+                    para_2_text[0] == " "
+                    or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
+                    or (self._is_chinese_char(para_2_text[0]))
+                    or (self._is_other_letter_char(para_2_text[0]))
+                )
+                and not is_para2_left_indent_than_papa1
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+
+        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
+            # print_red(f"para_1 is NOT end with end_punc.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] == " "
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_alphabet_char(para_2_text[0]))
+                or (self._is_year(para_2_text[0:4]))
+                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+
+        elif para_1_end_with_space:  # If para_text1 ends with space
+            # print_red(f"para_1 is end with space.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] == " "
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                pass
+                # print(f"para_2 is not consistent.\n")
+
+        return False
+
+    def _is_block_consistent(self, block1, block2):
+        """
+        This function determines whether block1 and block2 are originally from the same block
+
+        Parameters
+        ----------
+        block1 : dict
+            block1s
+        block2 : dict
+            block2
+
+        Returns
+        -------
+        is_same : bool
+            True if block1 and block2 are from the same block, else False
+        """
+        return self.__is_same_block_font(block1, block2)
+
+    def _is_para_continued(self, para1, para2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph
+
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph, else False
+        """
+        is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
+        is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)
+
+        return is_para_font_consistent and is_para_puncs_consistent
+
+    def _are_boundaries_of_block_consistent(self, block1, block2):
+        """
+        This function checks if the boundaries of block1 and block2 are consistent
+
+        Parameters
+        ----------
+        block1 : dict
+            block1
+
+        block2 : dict
+            block2
+
+        Returns
+        -------
+        is_consistent : bool
+            True if the boundaries of block1 and block2 are consistent, else False
+        """
+
+        last_line_of_block1 = block1["lines"][-1]
+        first_line_of_block2 = block2["lines"][0]
+
+        spans_of_last_line_of_block1 = last_line_of_block1["spans"]
+        spans_of_first_line_of_block2 = first_line_of_block2["spans"]
+
+        font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
+        font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
+        font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
+        font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]
+
+        font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
+        font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
+        font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
+        font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]
+
+        return (
+            self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
+            and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
+            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
+            and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
+        )
+
+    def _get_last_paragraph(self, block):
+        """
+        Retrieves the last paragraph from a block.
+
+        Parameters
+        ----------
+        block : dict
+            The block from which to retrieve the paragraph.
+
+        Returns
+        -------
+        dict
+            The last paragraph of the block.
+        """
+        if block["paras"]:
+            last_para_key = list(block["paras"].keys())[-1]
+            return block["paras"][last_para_key]
+        else:
+            return None
+
+    def _get_first_paragraph(self, block):
+        """
+        Returns the leading paragraph stored in a block.
+
+        Parameters
+        ----------
+        block : dict
+            Block whose "paras" mapping is inspected.
+
+        Returns
+        -------
+        dict or None
+            The value of the first entry of block["paras"], or None when that mapping is empty.
+        """
+        paras = block["paras"]
+        if not paras:
+            return None
+        return next(iter(paras.values()))
+
+    def should_merge_next_para(self, curr_para, next_para):
+        """
+        Tells whether next_para should be merged into curr_para.
+
+        The decision is delegated to self._is_para_continued; its result is normalised to a
+        strict True/False.
+        """
+        return bool(self._is_para_continued(curr_para, next_para))
+
+    def batch_tag_paras(self, pdf_dict):
+        """
+        Tags every paragraph with its own location and links the last paragraph of each block
+        to the paragraph it should be merged with.
+
+        Every paragraph receives three keys: "curr_para_location" ([page index, block id,
+        paragraph number]), "next_para_location" (same layout, or None) and "merge_next_para"
+        (bool). Only the last paragraph of a block can be linked: to the first paragraph of the
+        next block on the same page or, for the last block of a page, to the first paragraph of
+        the first block of the next page that has para blocks. The link is set only when
+        self.should_merge_next_para accepts the pair.
+
+        Blocks whose "paras" mapping is empty are tolerated: nothing is linked from or to them.
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary; entries whose key starts with "page_" are tagged in place.
+
+        Returns
+        -------
+        pdf_dict : dict
+            The same dictionary, with the paragraphs tagged.
+        """
+        # NOTE(review): page indices come from the enumeration order of pdf_dict, so this assumes
+        # the keys appear as page_0, page_1, ... ahead of any non-page key -- confirm with caller.
+        the_last_page_id = len(pdf_dict) - 1
+
+        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
+            if not (curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", [])):
+                continue
+
+            para_blocks_of_curr_page = curr_page_content["para_blocks"]
+            next_page_idx = curr_page_idx + 1
+            next_page_content = pdf_dict.get(f"page_{next_page_idx}", {})
+
+            for i, current_block in enumerate(para_blocks_of_curr_page):
+                curr_paras = current_block["paras"]
+                for para_id, curr_para in curr_paras.items():
+                    curr_para["curr_para_location"] = [
+                        curr_page_idx,
+                        current_block["block_id"],
+                        int(para_id.split("_")[-1]),
+                    ]
+                    curr_para["next_para_location"] = None  # default: no successor
+                    curr_para["merge_next_para"] = False  # default: do not merge
+
+                if not curr_paras:
+                    # No last paragraph to link from; indexing [-1] below would raise IndexError.
+                    continue
+
+                if i < len(para_blocks_of_curr_page) - 1:
+                    # The successor is the next block of the same page.
+                    target_page_idx = curr_page_idx
+                    target_block = para_blocks_of_curr_page[i + 1]
+                else:
+                    # Last block of the page: the successor is the first block of the next page
+                    # that has para blocks, so skip pages without any.
+                    while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
+                        next_page_idx += 1
+                        next_page_content = pdf_dict.get(f"page_{next_page_idx}", {})
+
+                    target_page_idx = next_page_idx
+                    next_page_blocks = next_page_content.get("para_blocks", [])
+                    target_block = next_page_blocks[0] if next_page_blocks else None
+
+                if not target_block or not target_block["paras"]:
+                    # No successor block, or it holds no paragraph to link to.
+                    continue
+
+                curr_blk_last_para = curr_paras[list(curr_paras)[-1]]
+                target_paras = target_block["paras"]
+                target_first_para_key = next(iter(target_paras))
+
+                if self.should_merge_next_para(curr_blk_last_para, target_paras[target_first_para_key]):
+                    curr_blk_last_para["next_para_location"] = [
+                        target_page_idx,
+                        target_block["block_id"],
+                        int(target_first_para_key.split("_")[-1]),
+                    ]
+                    curr_blk_last_para["merge_next_para"] = True
+
+        return pdf_dict
+
+    def find_block_by_id(self, para_blocks, block_id):
+        """
+        Looks up a block by its "block_id".
+
+        Parameters
+        ----------
+        para_blocks : list
+            Blocks (dicts) to search, in order.
+        block_id
+            Value compared against each block's "block_id" entry.
+
+        Returns
+        -------
+        dict or None
+            The first block whose "block_id" equals block_id, or None when there is none.
+        """
+        matching_blocks = (blk for blk in para_blocks if blk.get("block_id") == block_id)
+        return next(matching_blocks, None)
+
+    def batch_merge_paras(self, pdf_dict):
+        """
+        Merges the text of paragraphs that were tagged as continuing into the next paragraph.
+
+        For every non-title paragraph whose "merge_next_para" flag is set, the paragraph named
+        by "next_para_location" ([page index, block id, paragraph number]) is appended to it,
+        separated by one space, and the absorbed paragraph's "para_text" is set to "". The
+        absorbing paragraph then takes over the absorbed paragraph's "next_para_location" and
+        "merge_next_para", so a whole chain is collected into its first paragraph. Following a
+        chain stops at a missing page, block or paragraph and at a title paragraph.
+
+        Empty texts are left out of the join, so a paragraph that was already absorbed stays
+        exactly "" when it is visited later instead of turning into a lone space.
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary; entries whose key starts with "page_" are processed in place.
+
+        Returns
+        -------
+        pdf_dict : dict
+            The same dictionary, with the paragraph texts merged.
+        """
+        for page_id, page_content in pdf_dict.items():
+            if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
+                continue
+
+            for current_block in page_content["para_blocks"]:
+                for curr_para in list(current_block["paras"].values()):
+                    # Title paragraphs never absorb what follows them.
+                    if curr_para.get("is_para_title"):
+                        continue
+
+                    while curr_para.get("merge_next_para"):
+                        next_para_location = curr_para.get("next_para_location")
+                        if not next_para_location:
+                            break
+
+                        next_page_idx, next_block_id, next_para_id = next_para_location
+                        next_page_content = pdf_dict.get(f"page_{next_page_idx}")
+                        if not next_page_content:
+                            break
+
+                        next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
+                        if not next_block:
+                            break
+
+                        next_para = next_block["paras"].get(f"para_{next_para_id}")
+                        if not next_para or next_para.get("is_para_title"):
+                            break
+
+                        # Merge the texts; empty parts are skipped so no stray space is produced.
+                        text_parts = (curr_para.get("para_text", ""), next_para.get("para_text", ""))
+                        curr_para["para_text"] = " ".join(part for part in text_parts if part)
+
+                        # Continue the chain from wherever the absorbed paragraph pointed.
+                        curr_para["next_para_location"] = next_para.get("next_para_location")
+
+                        # An empty text marks the paragraph as merged away.
+                        next_para["para_text"] = ""
+
+                        # Keep merging only if the absorbed paragraph was itself continued.
+                        curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
+
+        return pdf_dict
--- a/magic_pdf/para/block_termination_processor.py
+++ b/magic_pdf/para/block_termination_processor.py
+from magic_pdf.para.commons import *
+
+
+# Switch stdout to UTF-8 where the stream supports it. A replaced stdout (for example in a
+# notebook or GUI front end) or a missing one has no reconfigure() method, so the call is
+# skipped there instead of failing the import with AttributeError.
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+
+class BlockTerminationProcessor:
+    def __init__(self) -> None:
+        """Creates the processor; no instance state is initialised."""
+
+    def _is_consistent_lines(
+        self,
+        curr_line,
+        prev_line,
+        next_line,
+        consistent_direction,  # 0 for prev, 1 for next, 2 for both
+    ):
+        """
+        Checks whether a line uses the same font as its neighbours.
+
+        Only the first span of each line is looked at; two lines match when the span sizes are
+        equal and the lower-cased span font names are equal.
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line, may be None
+        next_line : dict
+            next line, may be None
+        consistent_direction : int
+            0 to compare with the previous line, 1 with the next line, 2 with both
+
+        Returns
+        -------
+        bool
+            True if every requested neighbour exists and matches the current line. False if a
+            requested neighbour is missing, does not match, or the direction is not 0, 1 or 2.
+        """
+
+        def _font_signature(line):
+            first_span = line["spans"][0]
+            return first_span["size"], first_span["font"].lower()
+
+        curr_size, curr_font = _font_signature(curr_line)
+
+        if consistent_direction == 0:
+            neighbours = (prev_line,)
+        elif consistent_direction == 1:
+            neighbours = (next_line,)
+        elif consistent_direction == 2:
+            neighbours = (prev_line, next_line)
+        else:
+            return False
+
+        if not all(neighbours):
+            return False
+
+        # Read every neighbour before comparing, then require each of them to match.
+        signatures = [_font_signature(neighbour) for neighbour in neighbours]
+        return all(curr_size == size and curr_font == font for size, font in signatures)
+
+    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
+        """
+        This function checks if the line is a regular line
+
+        The result is True when at least one of the following holds:
+        the vertical distance to the previous or to the next line exceeds half of
+        avg_line_height; neither horizontal edge of the line lies within half of avg_char_width
+        of X0 / X1; or the previous line stops more than avg_char_width away from X1.
+
+        Parameters
+        ----------
+        curr_line_bbox : list
+            bbox of the current line
+        prev_line_bbox : list
+            bbox of the previous line, may be None
+        next_line_bbox : list
+            bbox of the next line, may be None
+        avg_char_width : float
+            average of char widths
+        X0 : float
+            reference left boundary the line's x0 is compared with
+        X1 : float
+            reference right boundary the line's x1 is compared with
+        avg_line_height : float
+            average of line heights
+
+        Returns
+        -------
+        bool
+            True if the line is a regular line, False otherwise. Always a strict bool, also
+            when there is no previous line.
+        """
+        horizontal_thres = 0.5 * avg_char_width
+        vertical_thres = 0.5 * avg_line_height
+
+        x0, y0, x1, y1 = curr_line_bbox
+        has_prev_line = bool(prev_line_bbox)
+        has_next_line = bool(next_line_bbox)
+
+        x0_near_X0 = abs(x0 - X0) < horizontal_thres
+        x1_near_X1 = abs(x1 - X1) < horizontal_thres
+
+        # Guarded with a real bool so a missing previous line yields False rather than None.
+        prev_line_is_end_of_para = has_prev_line and abs(prev_line_bbox[2] - X1) > avg_char_width
+
+        # NOTE(review): both distances are taken between like edges (y1 to the previous y1, the
+        # next y0 to y0), not as the gap between the lines -- confirm this is intended.
+        sufficient_spacing_above = has_prev_line and (y1 - prev_line_bbox[3]) > vertical_thres
+        sufficient_spacing_below = has_next_line and (next_line_bbox[1] - y0) > vertical_thres
+
+        return (
+            sufficient_spacing_above
+            or sufficient_spacing_below
+            or (not x0_near_X0 and not x1_near_X1)
+            or prev_line_is_end_of_para
+        )
+
+    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
+        """
+        This function checks if the line is a possible start of a paragraph
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line, may be None
+        next_line : dict
+            next line, may be None
+        X0 : float
+            reference left boundary the line's x0 is compared with
+        X1 : float
+            reference right boundary the line's x1 is compared with
+        avg_char_width : float
+            average of char widths
+        avg_font_size : float
+            average font size; scales the vertical spacing threshold and is also passed to
+            _is_regular_line as its line-height argument
+
+        Returns
+        -------
+        tuple
+            (is_start_of_para, start_confidence, decision_path): the decision, a score that
+            starts at 0.5 and grows with every satisfied condition, and the names of the
+            satisfied conditions. The score is informational only; it does not influence the
+            decision.
+        """
+        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
+        decision_path = []  # Record the decision path
+
+        curr_line_bbox = curr_line["bbox"]
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        next_line_bbox = next_line["bbox"] if next_line else None
+
+        indent_ratio = 1
+
+        vertical_ratio = 1.5
+        vertical_thres = vertical_ratio * avg_font_size
+
+        left_horizontal_ratio = 0.5
+        left_horizontal_thres = left_horizontal_ratio * avg_char_width
+
+        right_horizontal_ratio = 2.5
+        right_horizontal_thres = right_horizontal_ratio * avg_char_width
+
+        x0, y0, x1, y1 = curr_line_bbox
+
+        indent_condition = x0 > X0 + indent_ratio * avg_char_width
+        if indent_condition:
+            start_confidence += 0.2
+            decision_path.append("indent_condition_met")
+
+        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
+        if x0_near_X0:
+            start_confidence += 0.1
+            decision_path.append("x0_near_X0")
+
+        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
+        if x1_near_X1:
+            start_confidence += 0.1
+            decision_path.append("x1_near_X1")
+
+        if prev_line is None:
+            prev_line_is_end_of_para = True
+            start_confidence += 0.2
+            decision_path.append("no_prev_line")
+        else:
+            # The line that follows prev_line is the current line, so that is the neighbour the
+            # end-of-paragraph check must look at (not the line after the current one).
+            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, curr_line, X0, X1, avg_char_width)
+            if prev_line_is_end_of_para:
+                start_confidence += 0.1
+                decision_path.append("prev_line_is_end_of_para")
+
+        sufficient_spacing_above = False
+        if prev_line_bbox:
+            # Distance between the bottom edges of the previous and the current line.
+            vertical_spacing_above = y1 - prev_line_bbox[3]
+            sufficient_spacing_above = vertical_spacing_above > vertical_thres
+            if sufficient_spacing_above:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_above")
+
+        sufficient_spacing_below = False
+        if next_line_bbox:
+            # Distance between the top edges of the current and the next line.
+            vertical_spacing_below = next_line_bbox[1] - y0
+            sufficient_spacing_below = vertical_spacing_below > vertical_thres
+            if sufficient_spacing_below:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_below")
+
+        is_regular_line = self._is_regular_line(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
+        )
+        if is_regular_line:
+            start_confidence += 0.1
+            decision_path.append("is_regular_line")
+
+        is_start_of_para = (
+            (sufficient_spacing_above or sufficient_spacing_below)
+            or (indent_condition)
+            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
+            or prev_line_is_end_of_para
+        )
+        return (is_start_of_para, start_confidence, decision_path)
+
+    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
+        """
+        Checks whether a line may be the last line of a paragraph.
+
+        The line counts as an end when its text ends with sentence punctuation and either it
+        starts near X0 while stopping short of X1, or it is left aligned but not right aligned
+        according to the alignment helpers.
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        next_line : dict
+            next line, may be None
+        X0 : float
+            reference left boundary the line's x0 is compared with
+        X1 : float
+            reference right boundary the line's x1 is compared with
+        avg_char_width : float
+            average of char widths
+
+        Returns
+        -------
+        tuple
+            (is_end_of_para, end_confidence, decision_path): the decision, a score that starts
+            at 0.5 and grows with every satisfied condition, and the names of the satisfied
+            conditions. The score does not influence the decision.
+        """
+        curr_line_bbox = curr_line["bbox"]
+        next_line_bbox = next_line["bbox"] if next_line else None
+
+        # Every horizontal tolerance in this check is half an average character width.
+        half_char_width = 0.5 * avg_char_width
+
+        x0, _, x1, _ = curr_line_bbox
+        next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+
+        x0_near_X0 = abs(x0 - X0) < half_char_width
+        x1_smaller_than_X1 = x1 < X1 - half_char_width
+
+        # The next line opens a paragraph when it is indented and not left aligned with this one.
+        next_line_is_start_of_para = (
+            next_line_bbox
+            and (next_x0 > X0 + half_char_width)
+            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
+        )
+
+        # NOTE(review): these two calls use the helpers' default direction (both neighbours)
+        # although no previous bbox is passed -- confirm that direction=1 was not intended.
+        left_aligned = is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width)
+        right_aligned = is_line_right_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width)
+
+        # (condition, confidence gain, label), evaluated in this order.
+        evidence = [
+            (x0_near_X0, 0.1, "x0_near_X0"),
+            (x1_smaller_than_X1, 0.1, "x1_smaller_than_X1"),
+            (next_line_is_start_of_para, 0.2, "next_line_is_start_of_para"),
+            (left_aligned, 0.1, "line_is_left_aligned_from_neighbors"),
+            (not right_aligned, 0.1, "line_is_not_right_aligned_from_neighbors"),
+        ]
+
+        end_confidence = 0.5  # Initial confidence of the line being the end of a paragraph
+        decision_path = []  # Names of the conditions that held
+        for holds, gain, label in evidence:
+            if holds:
+                end_confidence += gain
+                decision_path.append(label)
+
+        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
+            (x0_near_X0 and x1_smaller_than_X1) or (left_aligned and not right_aligned)
+        )
+
+        return (is_end_of_para, end_confidence, decision_path)
+
+    def _cut_paras_per_block(
+        self,
+        block,
+    ):
+        """
+        Splits the lines of a block into paragraphs and stores them in block["paras"].
+
+        A paragraph opens at a line accepted by _is_possible_start_of_para and closes at a line
+        accepted by _is_possible_end_of_para or at the last line of the block. Lines that fall
+        outside every detected paragraph (before the first one, between two of them, or after
+        the last one) are never discarded: each such run becomes a paragraph of its own, so the
+        paragraphs always cover all lines of the block in order.
+
+        Parameters
+        ----------
+        block : dict
+            Block to process. "lines" is required; "X0", "X1", "avg_char_width",
+            "avg_font_size", "is_block_title" and "block_title_level" are optional and default
+            to 0 / False.
+
+        Returns
+        -------
+        block : dict
+            The same block, with "paras" set to a dict that maps "para_<n>" to the paragraph
+            properties (bbox, text, dominant font type, average font size, most common font
+            colour, title flag and title level).
+        """
+
+        def _construct_para(lines, is_block_title, para_title_level):
+            """
+            Construct a paragraph from given lines.
+            """
+            spans = [span for line in lines for span in line["spans"]]
+
+            font_sizes = [span["size"] for span in spans]
+            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
+
+            font_colors = [span["color"] for span in spans]
+            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
+
+            # Weight every font by the total width of the spans set in it.
+            font_type_lengths = {}
+            for span in spans:
+                bbox_width = span["bbox"][2] - span["bbox"][0]
+                font_type_lengths[span["font"]] = font_type_lengths.get(span["font"], 0) + bbox_width
+
+            # get the font type with the longest bbox width
+            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
+
+            return {
+                "para_bbox": calculate_para_bbox(lines),
+                "para_text": " ".join(line["text"] for line in lines),
+                "para_font_type": most_common_font_type,
+                "para_font_size": avg_font_size,
+                "para_font_color": most_common_font_color,
+                "is_para_title": is_block_title,
+                "para_title_level": para_title_level,
+            }
+
+        block_lines = block["lines"]
+
+        X0 = safe_get(block, "X0", 0)
+        X1 = safe_get(block, "X1", 0)
+        avg_char_width = safe_get(block, "avg_char_width", 0)
+        avg_font_size = safe_get(block, "avg_font_size", 0)
+
+        is_block_title = safe_get(block, "is_block_title", False)
+        para_title_level = safe_get(block, "block_title_level", 0)
+
+        # Segment into paragraphs: inclusive (first line index, last line index) pairs, in order.
+        para_ranges = []
+        start_idx_of_para = None  # index of the first line of the open paragraph, if any
+        next_uncovered_idx = 0  # first line index not yet assigned to a paragraph
+
+        for line_index, curr_line in enumerate(block_lines):
+            prev_line = block_lines[line_index - 1] if line_index > 0 else None
+            next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
+
+            # Check if the line is the start of a paragraph
+            is_start_of_para, _, _ = self._is_possible_start_of_para(
+                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
+            )
+            if start_idx_of_para is None and is_start_of_para:
+                if next_uncovered_idx < line_index:
+                    # Lines skipped since the previous paragraph: keep them as one paragraph
+                    # instead of losing their text.
+                    para_ranges.append((next_uncovered_idx, line_index - 1))
+                start_idx_of_para = line_index
+
+            # Check if the line is the end of a paragraph
+            is_end_of_para, _, _ = self._is_possible_end_of_para(curr_line, next_line, X0, X1, avg_char_width)
+            if start_idx_of_para is not None and (is_end_of_para or not next_line):
+                para_ranges.append((start_idx_of_para, line_index))
+                next_uncovered_idx = line_index + 1
+                start_idx_of_para = None
+
+        # Deal with the remaining lines
+        if next_uncovered_idx < len(block_lines):
+            para_ranges.append((next_uncovered_idx, len(block_lines) - 1))
+
+        # Create the processed paragraphs
+        processed_paras = {}
+        for start_idx, end_idx in para_ranges:
+            matched_lines = block_lines[start_idx : end_idx + 1]
+            processed_paras[f"para_{len(processed_paras)}"] = _construct_para(
+                matched_lines, is_block_title, para_title_level
+            )
+
+        block["paras"] = processed_paras
+
+        return block
+
+    def batch_process_blocks(self, pdf_dict):
+        """
+        Parses the blocks of all pages.
+
+        Every block listed under "para_blocks" of an entry whose key starts with "page_" is
+        passed through _cut_paras_per_block. A page without "para_blocks" gets an empty list.
+        The total number of paragraphs is stored in pdf_dict["statistics"]["num_paras"]; the
+        "statistics" entry is created when it does not exist yet.
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary; it is updated in place.
+
+        Returns
+        -------
+        result_dict : dict
+            The same dictionary, with paragraphs added to every block.
+
+        """
+
+        num_paras = 0
+
+        for page_id, page in pdf_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            para_blocks = [self._cut_paras_per_block(input_block) for input_block in page.get("para_blocks", [])]
+            num_paras += sum(len(new_block["paras"]) for new_block in para_blocks)
+            page["para_blocks"] = para_blocks
+
+        # setdefault: a dictionary that carries no statistics yet must not fail with KeyError
+        # after all of its pages have already been rewritten.
+        pdf_dict.setdefault("statistics", {})["num_paras"] = num_paras
+        return pdf_dict
--- a/magic_pdf/para/commons.py
+++ b/magic_pdf/para/commons.py
+import sys
+
+from magic_pdf.libs.commons import fitz
+from termcolor import cprint
+
+
+# Switch stdout to UTF-8 where the stream supports it. A replaced stdout (for example in a
+# notebook or GUI front end) or a missing one has no reconfigure() method, so the call is
+# skipped there instead of failing the import with AttributeError.
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+def open_pdf(pdf_path):
+    """
+    Opens a PDF with fitz and returns the document.
+
+    If opening fails, a message naming the path and the reason is printed and the same
+    exception is raised again.
+    """
+    try:
+        return fitz.open(pdf_path)  # type: ignore
+    except Exception as err:
+        print(f"无法打开PDF文件：{pdf_path}。原因是：{err}")
+        raise err
+
+
+def print_green_on_red(text):
+    """Prints text in bold green on a red background, followed by an empty line."""
+    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
+
+
+def print_green(text):
+    """Prints an empty line, then text in bold green, followed by an empty line."""
+    print()
+    cprint(text, color="green", attrs=["bold"], end="\n\n")
+
+
+def print_red(text):
+    """Prints an empty line, then text in bold red, followed by an empty line."""
+    print()
+    cprint(text, color="red", attrs=["bold"], end="\n\n")
+
+
+def print_yellow(text):
+    """Prints an empty line, then text in bold yellow, followed by an empty line."""
+    print()
+    cprint(text, color="yellow", attrs=["bold"], end="\n\n")
+
+
+def safe_get(dict_obj, key, default):
+    """
+    Returns dict_obj[key], falling back to default when the key is missing or maps to None.
+
+    Other falsy values (0, "", False, empty containers) are returned unchanged.
+    """
+    value = dict_obj.get(key)
+    return default if value is None else value
+
+
+def is_bbox_overlap(bbox1, bbox2):
+    """
+    Checks whether two boxes share at least one point.
+
+    Parameters
+    ----------
+    bbox1 : list
+        first box as [x0, y0, x1, y1]
+    bbox2 : list
+        second box as [x0, y0, x1, y1]
+
+    Returns
+    -------
+    bool
+        True if the boxes overlap or merely touch along an edge or corner, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+
+    # The boxes are disjoint exactly when one lies entirely to the side of, or above/below,
+    # the other.
+    separated_horizontally = x0_1 > x1_2 or x0_2 > x1_1
+    return not (separated_horizontally or y0_1 > y1_2 or y0_2 > y1_1)
+
+
+def is_in_bbox(bbox1, bbox2):
+    """
+    Checks whether bbox1 lies inside bbox2.
+
+    Parameters
+    ----------
+    bbox1 : list
+        inner candidate as [x0, y0, x1, y1]
+    bbox2 : list
+        outer candidate as [x0, y0, x1, y1]
+
+    Returns
+    -------
+    bool
+        True if no edge of bbox1 extends beyond bbox2 (shared edges are allowed), else False
+    """
+    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
+    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2
+
+    starts_inside = inner_x0 >= outer_x0 and inner_y0 >= outer_y0
+    return bool(starts_inside and inner_x1 <= outer_x1 and inner_y1 <= outer_y1)
+
+
+def calculate_para_bbox(lines):
+    """
+    Computes the smallest box that encloses the boxes of all given lines.
+
+    Parameters
+    ----------
+    lines : list
+        lines, each a dict with a "bbox" entry [x0, y0, x1, y1]; must not be empty
+
+    Returns
+    -------
+    para_bbox : list
+        [min x0, min y0, max x1, max y1] over all lines
+    """
+    left = min(line["bbox"][0] for line in lines)
+    top = min(line["bbox"][1] for line in lines)
+    right = max(line["bbox"][2] for line in lines)
+    bottom = max(line["bbox"][3] for line in lines)
+    return [left, top, right, bottom]
+
+
+def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    Checks whether the right edge of a line matches the right edge of its neighbours.
+
+    Two right edges match when they differ by less than half of avg_char_width.
+
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line, may be None
+    next_line_bbox : list
+        bbox of the next line, may be None
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 to compare with the previous line, 1 with the next line, 2 with both
+
+    Returns
+    -------
+    bool
+        True if the line matches every requested neighbour, False otherwise (also for any
+        other direction value).
+    """
+    tolerance = 0.5 * avg_char_width
+
+    # NOTE(review): a missing neighbour is replaced by a box at the origin, so its right edge
+    # counts as 0 in the comparison -- confirm that this is the intended fallback.
+    _, _, curr_right, _ = curr_line_bbox
+    _, _, prev_right, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    _, _, next_right, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+
+    if direction not in (0, 1, 2):
+        return False
+
+    matches_prev = abs(curr_right - prev_right) < tolerance
+    matches_next = abs(curr_right - next_right) < tolerance
+
+    if direction == 0:
+        return matches_prev
+    if direction == 1:
+        return matches_next
+    return matches_prev and matches_next
+
+
+def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    Checks whether the left edge of a line matches the left edge of its neighbours.
+
+    Two left edges match when they differ by less than half of avg_char_width.
+
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line, may be None
+    next_line_bbox : list
+        bbox of the next line, may be None
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 to compare with the previous line, 1 with the next line, 2 with both
+
+    Returns
+    -------
+    bool
+        True if the line matches every requested neighbour, False otherwise (also for any
+        other direction value).
+    """
+    tolerance = 0.5 * avg_char_width
+
+    # NOTE(review): a missing neighbour is replaced by a box at the origin, so its left edge
+    # counts as 0 in the comparison -- confirm that this is the intended fallback.
+    curr_left, _, _, _ = curr_line_bbox
+    prev_left, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    next_left, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+
+    if direction not in (0, 1, 2):
+        return False
+
+    matches_prev = abs(curr_left - prev_left) < tolerance
+    matches_next = abs(curr_left - next_left) < tolerance
+
+    if direction == 0:
+        return matches_prev
+    if direction == 1:
+        return matches_next
+    return matches_prev and matches_next
+
+
+def end_with_punctuation(line_text):
+    """
+    Checks whether the last non-whitespace character of the text is a sentence-ending mark.
+
+    Recognised marks are the full stop, question mark and exclamation mark in their English
+    and Chinese forms. Text that is empty or consists of whitespace only yields False.
+    """
+    end_puncs = (".", "?", "!", "。", "？", "！")
+
+    # Walk backwards until something other than whitespace shows up.
+    last_non_space_char = next((ch for ch in reversed(line_text) if not ch.isspace()), None)
+    if last_non_space_char is None:
+        return False
+
+    return last_non_space_char in end_puncs
+
+
+def is_nested_list(lst):
+    """Returns True when lst is a list that has at least one list among its items."""
+    return isinstance(lst, list) and any(isinstance(item, list) for item in lst)
--- a/magic_pdf/para/denoise.py
+++ b/magic_pdf/para/denoise.py
+import math
+
+from collections import defaultdict
+from magic_pdf.para.commons import *
+
+# Switch stdout to UTF-8 where the stream supports it. A replaced stdout (for example in a
+# notebook or GUI front end) or a missing one has no reconfigure() method, so the call is
+# skipped there instead of failing the import with AttributeError.
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class HeaderFooterProcessor:
+    def __init__(self) -> None:
+        """Creates the processor; no instance state is initialised."""
+
+    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
+        """
+        Returns the boxes that occur most often in the top or bottom band of the page.
+
+        Parameters
+        ----------
+        bboxes : list
+            boxes as [x0, y0, x1, y1]
+        page_height : float
+            height of the page
+        position : str, optional
+            "top" keeps boxes whose y0 is above page_height * threshold; any other value keeps
+            boxes whose y1 is below page_height * (1 - threshold); by default "top"
+        threshold : float, optional
+            fraction of the page height that forms the band, by default 0.25
+        num_bboxes : int, optional
+            maximum number of boxes to return, by default 3
+        min_frequency : int, optional
+            minimum number of occurrences a box needs, by default 2
+
+        Returns
+        -------
+        common_bboxes : list
+            boxes as tuples, most frequent first; boxes with equal counts keep the order in
+            which they first appeared
+        """
+
+        def _in_band(bbox):
+            if position == "top":
+                return bbox[1] < page_height * threshold
+            return bbox[3] > page_height * (1 - threshold)
+
+        # Count identical boxes; a plain dict keeps them in first-seen order.
+        occurrences = {}
+        for bbox in bboxes:
+            if _in_band(bbox):
+                key = tuple(bbox)
+                occurrences[key] = occurrences.get(key, 0) + 1
+
+        # The sort is stable, so ties stay in first-seen order.
+        ranked = sorted(occurrences, key=lambda key: -occurrences[key])
+        frequent = [key for key in ranked if occurrences[key] >= min_frequency]
+        return frequent[:num_bboxes]
+
+    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
+        """
+        This function detects the header and footer of the document.
+
+        Parameters
+        ----------
+        result_dict : dict
+            result dictionary
+
+        Returns
+        -------
+        result_dict : dict
+            result dictionary
+        """
+
+        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
+            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
+
+        def is_single_line_block(block):
+            # Determine based on the width and height of the block
+            block_width = block["X1"] - block["X0"]
+            block_height = block["bbox"][3] - block["bbox"][1]
+
+            # If the height of the block is close to the average character height and the width is large, it is considered a single line
+            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
+
+        # Traverse all blocks in the document
+        single_preproc_blocks = 0
+        total_blocks = 0
+        single_preproc_blocks = 0
+
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_key, block in blocks.items():
+                    if block_key.startswith("block_"):
+                        total_blocks += 1
+                        if is_single_line_block(block):
+                            single_preproc_blocks += 1
+
+        # If there are no blocks, skip the header and footer detection
+        if total_blocks == 0:
+            print("No blocks found. Skipping header/footer detection.")
+            return result_dict
+
+        # If most of the blocks are single-line, skip the header and footer detection
+        if single_preproc_blocks / total_blocks > 0.5:  # 50% of the blocks are single-line
+            return result_dict
+
+        # Collect the bounding boxes of all blocks
+        all_bboxes = []
+        all_texts = []
+
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_key, block in blocks.items():
+                    if block_key.startswith("block_"):
+                        all_bboxes.append(block["bbox"])
+
+        # Get the height of the page
+        page_height = max(bbox[3] for bbox in all_bboxes)
+
+        # Get the most common bbox lists for headers and footers
+        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
+        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
+
+        # Detect and mark headers and footers
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_key, block in blocks.items():
+                    if block_key.startswith("block_"):
+                        bbox = block["bbox"]
+                        text = block["text"]
+
+                        is_header = compare_bbox_with_list(bbox, common_header_bboxes)
+                        is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
+
+                        block["is_header"] = int(is_header)
+                        block["is_footer"] = int(is_footer)
+
+        return result_dict
+
+
+class NonHorizontalTextProcessor:
+    def __init__(self) -> None:
+        pass
+
+    @staticmethod
+    def _block_signature(block_data):
+        """
+        Build the hashable (bbox, text) key used to recognise the same block on several pages.
+        """
+        # bbox is converted to a tuple because a list inside the key would make it unhashable
+        return (tuple(block_data["bbox"]), block_data["text"])
+
+    def detect_non_horizontal_texts(self, result_dict):
+        """
+        This function detects watermarks and vertical margin notes in the document.
+
+        Blocks are grouped by identical coordinates and identical text. Only blocks that carry a
+        "dir" vector are counted, and the absolute angle of that vector decides the group:
+
+        - strictly between 5 and 85 degrees: watermark candidate
+        - strictly between 85 and 105 degrees: vertical margin note candidate
+
+        A candidate is confirmed when it occurs more often than half the number of pages
+        (integer division). Every block then receives the integer flags "is_watermark" and
+        "is_vertical_margin_note" (0 or 1), written in place.
+
+        Parameters
+        ----------
+        result_dict : dict
+            The result dictionary; blocks are read from "page_*" -> "block_*" entries and need
+            the keys "bbox" and "text" ("dir" is optional).
+
+        Returns
+        -------
+        result_dict : dict
+            The updated result dictionary (same object as the input).
+        """
+        potential_watermarks = {}
+        potential_margin_notes = {}
+        page_count = 0
+
+        for page_id, page_content in result_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+            page_count += 1
+
+            for block_id, block_data in page_content.items():
+                if not block_id.startswith("block_") or "dir" not in block_data:
+                    continue
+
+                signature = self._block_signature(block_data)
+                direction = block_data["dir"]
+                angle = abs(math.degrees(math.atan2(direction[1], direction[0])))
+
+                if 5 < angle < 85:  # slanted text: watermark candidate
+                    potential_watermarks[signature] = potential_watermarks.get(signature, 0) + 1
+
+                if 85 < angle < 105:  # (near) vertical text: margin note candidate
+                    potential_margin_notes[signature] = potential_margin_notes.get(signature, 0) + 1
+
+        # Thresholds are based on the number of pages only, so bookkeeping keys such as
+        # document-level statistics do not raise the bar
+        watermark_threshold = page_count // 2
+        watermarks = {k for k, v in potential_watermarks.items() if v > watermark_threshold}
+
+        margin_note_threshold = page_count // 2
+        margin_notes = {k for k, v in potential_margin_notes.items() if v > margin_note_threshold}
+
+        # Add watermark / margin note information to every block
+        for page_id, blocks in result_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+            for block_id, block_data in blocks.items():
+                if not block_id.startswith("block_"):
+                    continue
+                signature = self._block_signature(block_data)
+                block_data["is_watermark"] = 1 if signature in watermarks else 0
+                block_data["is_vertical_margin_note"] = 1 if signature in margin_notes else 0
+
+        return result_dict
+
+
+class NoiseRemover:
+    def __init__(self) -> None:
+        pass
+
+    def skip_data_noises(self, result_dict):
+        """
+        Build a new dictionary that keeps only the blocks not flagged as noise.
+
+        A block is dropped when any of the flags is_overlap, is_header, is_footer, is_watermark,
+        is_vertical_margin_note or is_block_title is truthy. Only "page_*" entries and, inside
+        them, "block_*" entries are considered; pages left without any block are omitted.
+        The input dictionary is not modified; kept block dicts are shared with the input.
+        """
+        noise_flags = (
+            "is_overlap",
+            "is_header",
+            "is_footer",
+            "is_watermark",
+            "is_vertical_margin_note",
+            "is_block_title",
+        )
+
+        cleaned_pages = {}
+        for page_id, blocks in result_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            kept_blocks = {
+                block_id: block
+                for block_id, block in blocks.items()
+                if block_id.startswith("block_") and not any(block.get(flag, 0) for flag in noise_flags)
+            }
+            if kept_blocks:
+                cleaned_pages[page_id] = kept_blocks
+
+        return cleaned_pages
--- a/magic_pdf/para/draw.py
+++ b/magic_pdf/para/draw.py
+from magic_pdf.libs.commons import fitz
+
+from magic_pdf.para.commons import *
+
+
+# Switch stdout to UTF-8. The hasattr guard keeps the import from failing when stdout has been
+# replaced by an object without ``reconfigure`` (capture wrappers, ``None``, Python < 3.7).
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class DrawAnnos:
+    """
+    This class draws annotations on the pdf file
+
+    ----------------------------------------
+                Color Code
+    ----------------------------------------
+        Red: (1, 0, 0)
+        Green: (0, 1, 0)
+        Blue: (0, 0, 1)
+        Yellow: (1, 1, 0) - mix of red and green
+        Cyan: (0, 1, 1) - mix of green and blue
+        Magenta: (1, 0, 1) - mix of red and blue
+        White: (1, 1, 1) - red, green and blue full intensity
+        Black: (0, 0, 0) - no color component whatsoever
+        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
+        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __is_nested_list(self, lst):
+        """
+        This function returns True if the given value is a list that contains at least one list.
+        """
+        # A deeper level of nesting always implies a list at the first level, so one pass is enough
+        return isinstance(lst, list) and any(isinstance(i, list) for i in lst)
+
+    def __valid_rect(self, bbox):
+        """
+        This function returns True if bbox is a flat (x0, y0, x1, y1) with positive width and height.
+        """
+        try:
+            if len(bbox) != 4 or any(isinstance(value, list) for value in bbox):
+                return False  # Empty, wrong length or nested, hence it can't be a valid rect
+        except TypeError:
+            return False  # Not a sized sequence at all (e.g. None)
+        return bbox[0] < bbox[2] and bbox[1] < bbox[3]
+
+    def __add_rect(self, page, bbox, color, width):
+        """
+        This function adds one rectangle annotation with the given stroke color and border width.
+        """
+        rect_anno = page.add_rect_annot(fitz.Rect(bbox))
+        rect_anno.set_colors(stroke=color)
+        rect_anno.set_border(width=width)
+        rect_anno.update()
+
+    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
+        """
+        This function draws the nested boxes
+
+        Parameters
+        ----------
+        page : fitz.Page
+            page
+        nested_bbox : list
+            nested bbox
+        color : tuple
+            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
+        """
+        if self.__is_nested_list(nested_bbox):  # If it's a nested list
+            for bbox in nested_bbox:
+                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
+        elif self.__valid_rect(nested_bbox):  # If valid rectangle
+            self.__add_rect(page, nested_bbox, color, 1)
+
+    def __draw_para_bbox(self, page, para_bbox, combined_color, single_color):
+        """
+        This function draws a paragraph bbox: every member box of a combined (nested, more than
+        one entry) bbox with border width 1, or a single flat bbox with border width 0.5.
+        """
+        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
+            self.__draw_nested_boxes(page, para_bbox, combined_color)
+        elif self.__valid_rect(para_bbox):
+            self.__add_rect(page, para_bbox, single_color, 0.5)
+
+    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
+        """
+        This function draws the paragraph boxes stored in pdf_dic onto the pdf and saves the result.
+
+        Parameters
+        ----------
+        input_pdf_path : str
+            path to the input pdf file
+        pdf_dic : dict or None
+            pdf dictionary; paragraphs are read from page_<n> -> "para_blocks" -> "paras".
+            None, or a page without an entry, simply produces no annotation for that page.
+        output_pdf_path : str or None
+            path to the output pdf file; when None, "_anno" is inserted before the ".pdf" suffix
+            of input_pdf_path
+        """
+        pdf_doc = open_pdf(input_pdf_path)
+
+        try:
+            if pdf_dic is None:
+                pdf_dic = {}
+
+            if output_pdf_path is None:
+                # Only touch the final suffix, so a ".pdf" inside a directory name is left alone
+                stem, dot, ext = input_pdf_path.rpartition(".")
+                if dot and ext.lower() == "pdf":
+                    output_pdf_path = f"{stem}_anno.{ext}"
+                else:
+                    output_pdf_path = f"{input_pdf_path}_anno.pdf"
+
+            for page_id, page in enumerate(pdf_doc):  # type: ignore
+                page_data = pdf_dic.get(f"page_{page_id}") or {}
+                for para_block in page_data.get("para_blocks") or []:
+                    if "paras" not in para_block:
+                        continue
+                    for para_content in para_block["paras"].values():
+                        para_bbox = para_content["para_bbox"]
+
+                        # cyan for combined paragraphs, green for normal paragraphs
+                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))
+
+                        # titles are drawn again in blue on top of the paragraph box
+                        if para_content.get("is_para_title"):
+                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))
+
+            pdf_doc.save(output_pdf_path)
+        finally:
+            # Release the document even when drawing or saving fails
+            pdf_doc.close()
--- a/magic_pdf/para/exceptions.py
+++ b/magic_pdf/para/exceptions.py
+class DenseSingleLineBlockException(Exception):
+    """
+    Exception type for the dense single line-block condition.
+
+    The text passed at construction is stored on `message`; both str() and repr()
+    of an instance return exactly that text.
+    """
+
+    def __init__(self, message="DenseSingleLineBlockException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return format(self.message)
+
+    # repr() deliberately shows the bare message rather than the ClassName(...) form
+    __repr__ = __str__
+
+
+class TitleDetectionException(Exception):
+    """
+    Exception type for title detection.
+
+    The text passed at construction is stored on `message`; both str() and repr()
+    of an instance return exactly that text.
+    """
+
+    def __init__(self, message="TitleDetectionException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return format(self.message)
+
+    # repr() deliberately shows the bare message rather than the ClassName(...) form
+    __repr__ = __str__
+
+
+class TitleLevelException(Exception):
+    """
+    Exception type for title level.
+
+    The text passed at construction is stored on `message`; both str() and repr()
+    of an instance return exactly that text.
+    """
+
+    def __init__(self, message="TitleLevelException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return format(self.message)
+
+    # repr() deliberately shows the bare message rather than the ClassName(...) form
+    __repr__ = __str__
+
+
+class ParaSplitException(Exception):
+    """
+    Exception type for paragraph splitting.
+
+    The text passed at construction is stored on `message`; both str() and repr()
+    of an instance return exactly that text.
+    """
+
+    def __init__(self, message="ParaSplitException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return format(self.message)
+
+    # repr() deliberately shows the bare message rather than the ClassName(...) form
+    __repr__ = __str__
+
+
+class ParaMergeException(Exception):
+    """
+    Exception type for paragraph merging.
+
+    The text passed at construction is stored on `message`; both str() and repr()
+    of an instance return exactly that text.
+    """
+
+    def __init__(self, message="ParaMergeException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return format(self.message)
+
+    # repr() deliberately shows the bare message rather than the ClassName(...) form
+    __repr__ = __str__
+
+
+class DiscardByException:
+    """
+    Decides whether a parsed pdf should be discarded, one check per exception type.
+
+    Each check returns the message of the given exception when the pdf is to be discarded,
+    and None otherwise.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
+        """
+        Discard the pdf when too many pages consist almost only of single-line blocks.
+
+        A page counts as dense when more than 90% of its "preproc_blocks" hold exactly one
+        line. The pdf is discarded when more than 10% of all "page_*" entries are dense.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary
+        exception : DenseSingleLineBlockException
+            exception whose `message` is returned on discard
+
+        Returns
+        -------
+        error_message : str or None
+            `exception.message` on discard; None otherwise, or when there is no page at all
+        """
+        dense_page_total = 0
+        page_total = 0
+
+        for page_id, page in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+            page_total += 1
+
+            if "preproc_blocks" not in page.keys():
+                continue
+
+            blocks = page["preproc_blocks"]
+            single_line_total = sum(1 for block in blocks if len(block["lines"]) == 1)
+            if len(blocks) > 0 and single_line_total / len(blocks) > 0.9:
+                dense_page_total += 1
+
+        if page_total == 0:
+            return None
+
+        # The 10% bar is low on purpose: a few dense pages are enough to discard the pdf
+        if dense_page_total / page_total > 0.1:
+            return exception.message
+        return None
+
+    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
+        """
+        Title detection check. Currently disabled: nothing is inspected and None is always returned.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : TitleDetectionException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
+
+    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
+        """
+        Title level check. Currently disabled: nothing is inspected and None is always returned.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : TitleLevelException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
+
+    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
+        """
+        Paragraph split check. Currently disabled: nothing is inspected and None is always returned.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : ParaSplitException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
+
+    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
+        """
+        Paragraph merge check. Currently disabled: nothing is inspected and None is always returned.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : ParaMergeException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
--- a/magic_pdf/para/layout_match_processor.py
+++ b/magic_pdf/para/layout_match_processor.py
+import math
+from magic_pdf.para.commons import *
+
+
+# Switch stdout to UTF-8. The hasattr guard keeps the import from failing when stdout has been
+# replaced by an object without ``reconfigure`` (capture wrappers, ``None``, Python < 3.7).
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class LayoutFilterProcessor:
+    def __init__(self) -> None:
+        pass
+
+    def batch_process_blocks(self, pdf_dict):
+        """
+        This function marks every paragraph block with whether it lies inside a layout bbox.
+
+        For each "page_*" entry that has both "layout_bboxes" and "para_blocks" (neither None),
+        each paragraph block gets "in_layout" = 1 when its bbox is inside at least one layout
+        bbox (as judged by is_in_bbox), else 0. Blocks are updated in place.
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            pdf dictionary
+
+        Returns
+        -------
+        pdf_dict : dict
+            the same dictionary object that was passed in
+        """
+        for page_id, blocks in pdf_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
+                continue
+
+            layout_bbox_objs = blocks["layout_bboxes"]
+            para_blocks = blocks["para_blocks"]
+            if layout_bbox_objs is None or para_blocks is None:
+                continue
+
+            # Round every coordinate of each layout bbox up to an integer.
+            # NOTE(review): ceil grows x1/y1 but moves x0/y0 inward - confirm that is intended.
+            layout_bboxes = [
+                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
+                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
+            ]
+
+            # Decide per paragraph over ALL layout bboxes, so that a later layout bbox cannot
+            # reset a match found in an earlier one.
+            for para_block in para_blocks:
+                para_bbox = para_block["bbox"]
+                para_block["in_layout"] = int(any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes))
+
+        return pdf_dict
--- a/magic_pdf/para/para_pipeline.py
+++ b/magic_pdf/para/para_pipeline.py
+import os
+import json
+
+from magic_pdf.para.commons import *
+
+from magic_pdf.para.raw_processor import RawBlockProcessor
+from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
+from magic_pdf.para.stats import BlockStatisticsCalculator
+from magic_pdf.para.stats import DocStatisticsCalculator
+from magic_pdf.para.title_processor import TitleProcessor
+from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
+from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
+from magic_pdf.para.draw import DrawAnnos
+from magic_pdf.para.exceptions import (
+    DenseSingleLineBlockException,
+    TitleDetectionException,
+    TitleLevelException,
+    ParaSplitException,
+    ParaMergeException,
+    DiscardByException,
+)
+
+
+# Switch stdout to UTF-8. The hasattr guard keeps the import from failing when stdout has been
+# replaced by an object without ``reconfigure`` (capture wrappers, ``None``, Python < 3.7).
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class ParaProcessPipeline:
+    def __init__(self) -> None:
+        pass
+
+    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
+        """
+        This function processes the paragraphs, including:
+        1. Read raw input json file into pdf_dic
+        2. Detect and replace equations
+        3. Combine spans into a natural line
+        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        5. Compute statistics for each block
+        6. Detect titles in the document
+        7. Detect paragraphs inside each block
+        8. Divide the level of the titles
+        9. Detect and combine paragraphs from different blocks into one paragraph
+        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
+        11. Draw annotations on the pdf file
+
+        Parameters
+        ----------
+        pdf_dic_json_fpath : str
+            path to the pdf dictionary json file.
+            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
+        input_pdf_doc : str
+            path to the input pdf file
+        output_pdf_path : str
+            path to the output pdf file
+
+        Returns
+        -------
+        pdf_dict : dict
+            result dictionary
+        """
+
+        error_info = None
+
+        output_json_file = ""
+        output_dir = ""
+
+        if input_pdf_path is not None:
+            input_pdf_path = os.path.abspath(input_pdf_path)
+
+            # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
+
+        if output_pdf_path is not None:
+            output_dir = os.path.dirname(output_pdf_path)
+            output_json_file = f"{output_dir}/pdf_dic.json"
+
+        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
+            """
+            Save the pdf_dic to a json file
+            """
+            output_pdf_file_name = os.path.basename(output_pdf_path)
+            # output_dir = os.path.dirname(output_pdf_path)
+            output_dir = "\\tmp\\pdf_parse"
+            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
+            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
+
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+
+            if para_debug_mode == "full":
+                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
+                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
+
+            # Validate the output already exists
+            if not os.path.exists(pdf_dic_json_fpath):
+                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
+                return None
+            else:
+                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
+
+            return pdf_dic_json_fpath
+
+        """
+        Preprocess the lines of block
+        """
+        # Find and replace the interline and inline equations, should be better done before the paragraph processing
+        # Create "para_blocks" for each page.
+        # equationProcessor = EquationsProcessor()
+        # pdf_dic = equationProcessor.batch_process_blocks(pdf_info_dict)
+
+        # Combine spans into a natural line
+        rawBlockProcessor = RawBlockProcessor()
+        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
+        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
+
+        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        layoutFilter = LayoutFilterProcessor()
+        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
+
+        # Compute statistics for each block
+        blockStatisticsCalculator = BlockStatisticsCalculator()
+        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
+        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
+
+        # Compute statistics for all blocks(namely this pdf document)
+        docStatisticsCalculator = DocStatisticsCalculator()
+        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
+        # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
+
+        # Dump the first three stages of pdf_dic to a json file
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
+
+        """
+        Detect titles in the document
+        """
+        doc_statistics = pdf_dic["statistics"]
+        titleProcessor = TitleProcessor(doc_statistics)
+        pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic)
+
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
+
+        """
+        Detect and divide the level of the titles
+        """
+        titleProcessor = TitleProcessor()
+
+        pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic)
+
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
+
+        """
+        Detect and split paragraphs inside each block
+        """
+        blockInnerParasProcessor = BlockTerminationProcessor()
+
+        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
+
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
+
+        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
+        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
+
+        """
+        Detect and combine paragraphs from different blocks into one paragraph
+        """
+        blockContinuationProcessor = BlockContinuationProcessor()
+
+        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
+        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
+
+        if para_debug_mode == "full":
+            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
+
+        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
+        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
+
+        """
+        Discard pdf files by checking exceptions and return the error info to the caller
+        """
+        discardByException = DiscardByException()
+
+        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
+            pdf_dic, exception=DenseSingleLineBlockException()
+        )
+        is_discard_by_title_detection = discardByException.discard_by_title_detection(
+            pdf_dic, exception=TitleDetectionException()
+        )
+        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
+        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
+        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
+
+        """
+        if any(
+            info is not None
+            for info in [
+                is_discard_by_single_line_block,
+                is_discard_by_title_detection,
+                is_discard_by_title_level,
+                is_discard_by_split_para,
+                is_discard_by_merge_para,
+            ]
+        ):
+            error_info = next(
+                (
+                    info
+                    for info in [
+                        is_discard_by_single_line_block,
+                        is_discard_by_title_detection,
+                        is_discard_by_title_level,
+                        is_discard_by_split_para,
+                        is_discard_by_merge_para,
+                    ]
+                    if info is not None
+                ),
+                None,
+            )
+            return pdf_dic, error_info
+
+        if any(
+            info is not None
+            for info in [
+                is_discard_by_single_line_block,
+                is_discard_by_title_detection,
+                is_discard_by_title_level,
+                is_discard_by_split_para,
+                is_discard_by_merge_para,
+            ]
+        ):
+            error_info = next(
+                (
+                    info
+                    for info in [
+                        is_discard_by_single_line_block,
+                        is_discard_by_title_detection,
+                        is_discard_by_title_level,
+                        is_discard_by_split_para,
+                        is_discard_by_merge_para,
+                    ]
+                    if info is not None
+                ),
+                None,
+            )
+            return pdf_dic, error_info
+        """
+
+        """
+        Dump the final pdf_dic to a json file
+        """
+        if para_debug_mode is not None:
+            with open(output_json_file, "w", encoding="utf-8") as f:
+                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+
+        """
+        Draw the annotations
+        """
+
+        if is_discard_by_single_line_block is not None:
+            error_info = is_discard_by_single_line_block
+        elif is_discard_by_title_detection is not None:
+            error_info = is_discard_by_title_detection
+        elif is_discard_by_title_level is not None:
+            error_info = is_discard_by_title_level
+        elif is_discard_by_split_para is not None:
+            error_info = is_discard_by_split_para
+        elif is_discard_by_merge_para is not None:
+            error_info = is_discard_by_merge_para
+
+        if error_info is not None:
+            return pdf_dic, error_info
+
+        """
+        Dump the final pdf_dic to a json file
+        """
+        if para_debug_mode is not None:
+            with open(output_json_file, "w", encoding="utf-8") as f:
+                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
+
+        """
+        Draw the annotations
+        """
+        if para_debug_mode is not None:
+            drawAnnos = DrawAnnos()
+            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
+
+        """
+        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
+        """
+        if para_debug_mode is not None:
+            for fpath in os.listdir(output_dir):
+                if fpath.endswith(".json") and "stage" in fpath:
+                    os.remove(os.path.join(output_dir, fpath))
+
+        return pdf_dic, error_info
--- a/magic_pdf/para/para_split.py
+++ b/magic_pdf/para/para_split.py
+from sklearn.cluster import DBSCAN
+import numpy as np
+from loguru import logger
+
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
+from magic_pdf.libs.ocr_content_type import ContentType
+
+
+# Characters that count as an end-of-sentence mark. The paragraph-connection
+# helpers in this module refuse to join two lines when the first one ends with
+# any of these.
+LINE_STOP_FLAG = ['.', '!', '?', '。', '！', '？',"：", ":", ")", "）", ";"]
+# Short aliases for the span content types compared against span['type'] below.
+INLINE_EQUATION = ContentType.InlineEquation
+INTERLINE_EQUATION = ContentType.InterlineEquation
+TEXT = ContentType.Text
+
+
+def __get_span_text(span):
+    """
+    Return the textual payload of a span.
+
+    The span's ``content`` is used when it is non-empty; otherwise the value
+    of ``image_path`` is returned (an empty string when that is missing too).
+    """
+    text = span.get('content', '')
+    return text if len(text) != 0 else span.get('image_path', '')
+    
+
+def __detect_list_lines(lines, new_layout_bboxes, lang):
+    """
+    Detect list-like runs of lines and partition ``lines`` into segments.
+
+    A list is recognised by its shape: a flush-left line that starts with an
+    upper-case letter or a digit, followed by indented lines, with that
+    pattern occurring at least twice in a row.
+
+    Args:
+        lines: line dicts of one layout box; each provides ``bbox`` and ``spans``.
+        new_layout_bboxes: layout boxes used to look up the left edge of the
+            layout that contains each line.
+        lang: language code; list detection only runs for ``'en'``.
+
+    Returns:
+        A tuple ``(segments, list_start_idx)``. ``segments`` is a list of
+        ``(kind, start, end)`` triples with ``kind`` being ``'text'`` or
+        ``'list'`` and inclusive line indices. ``list_start_idx`` holds, for
+        each detected list, the indices of its flush-left lines; it is
+        ``None`` when detection is skipped because ``lang`` is not ``'en'``.
+    """
+    def find_repeating_patterns(lst):
+        """Return the (start, end) ranges of list runs and, per run, the indices coded 1."""
+        indices = []
+        ones_indices = []
+        i = 0
+        while i < len(lst) - 1:  # at least two elements must remain
+            if lst[i] == 1 and lst[i+1] in [2, 3]:  # a 1 directly followed by an indented line
+                start = i
+                ones_in_this_interval = [i]
+                i += 1
+                while i < len(lst) and lst[i] in [2, 3]:
+                    i += 1
+                # The same "1 then 2/3" shape has to repeat before it counts as a list.
+                if i < len(lst) - 1 and lst[i] == 1 and lst[i+1] in [2, 3] and lst[i-1] in [2, 3]:
+                    while i < len(lst) and lst[i] in [1, 2, 3]:
+                        if lst[i] == 1:
+                            ones_in_this_interval.append(i)
+                        i += 1
+                    indices.append((start, i - 1))
+                    ones_indices.append(ones_in_this_interval)
+                else:
+                    i += 1
+            else:
+                i += 1
+        return indices, ones_indices
+
+    def split_indices(slen, index_array):
+        """Tile [0, slen) with 'list' ranges from index_array and 'text' ranges in the gaps."""
+        result = []
+        last_end = 0
+
+        for start, end in sorted(index_array):
+            if start > last_end:
+                # Gap between the previous range and this one is plain text.
+                result.append(('text', last_end, start - 1))
+            result.append(('list', start, end))
+            last_end = end + 1
+
+        if last_end < slen:
+            # Whatever follows the last list range is plain text.
+            result.append(('text', last_end, slen - 1))
+
+        return result
+
+    if lang != 'en':
+        # Detection is English-only. Still hand back (kind, start, end) triples,
+        # because the caller unpacks every element of the first return value.
+        return split_indices(len(lines), []), None
+
+    total_lines = len(lines)
+    line_fea_encode = []
+    # Encode every line by position and first character:
+    #   1 - flush left, starts with an upper-case letter or a digit
+    #   4 - flush left, any other first character
+    #   2 - indented, starts with an upper-case letter
+    #   3 - indented, any other first character
+    #   0 - cannot be classified (no text or no enclosing layout)
+    for l in lines:
+        spans = l['spans']
+        span_text = __get_span_text(spans[0]) if spans else ''
+        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+        if not span_text or not layout:
+            # 0 matches none of the pattern codes, so such a line simply
+            # interrupts a potential list instead of raising.
+            line_fea_encode.append(0)
+            continue
+        first_char = span_text[0]
+        if l['bbox'][0] == layout[0]:
+            line_fea_encode.append(1 if first_char.isupper() or first_char.isdigit() else 4)
+        else:
+            line_fea_encode.append(2 if first_char.isupper() else 3)
+
+    # Runs in which the codes 1, 2, 3 repeat at least twice are treated as lists.
+    list_indice, list_start_idx = find_repeating_patterns(line_fea_encode)
+    if len(list_indice) > 0:
+        logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
+
+    # TODO: verify that the indented lines of a list share the same left edge.
+    # A detected range only ever contains the codes 1, 2 and 3, so every range
+    # is reported as a list.
+    for start, end in list_indice:
+        logger.info(f"列表行的第{start}到第{end}行是列表")
+
+    return split_indices(total_lines, list_indice), list_start_idx
+        
+            
+
+def __valign_lines(blocks, layout_bboxes):
+    """
+    Snap nearly-aligned line edges inside each layout box to a common value.
+
+    Within one layout box the left edges (x0) and the right edges (x1) of all
+    lines are clustered separately with DBSCAN. Every clustered x0 is replaced
+    by the smallest x0 of its cluster and every clustered x1 by the largest x1
+    of its cluster (both cast to ``int``); edges DBSCAN labels as noise are
+    left untouched. Block bounding boxes are then recomputed from their lines.
+
+    The clustering distance of 3 is an empirical value (TODO: derive it, e.g.
+    1.5 body-text characters).
+
+    Returns:
+        One ``[x0, y0, x1, y1]`` box per layout that contains at least one
+        block, computed from the updated block boxes. Layouts without blocks
+        are skipped, so the result can be shorter than ``layout_bboxes``.
+    """
+    eps = 3
+    min_pts = 2
+
+    def build_snap_map(coords, pick):
+        # Map each clustered coordinate to pick(<all coordinates of its cluster>).
+        points = np.array([[value, 0] for value in coords])
+        labels = DBSCAN(eps=eps, min_samples=min_pts).fit(points).labels_
+        snap = {}
+        for label in np.unique(labels):
+            if label == -1:  # DBSCAN noise: not part of any cluster
+                continue
+            members = points[np.where(labels == label)][:, 0]
+            target = pick(members)
+            snap.update({raw: target for raw in members})
+        return snap
+
+    new_layout_bboxes = []
+    for layout_box in layout_bboxes:
+        inside = [b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])]
+        if len(inside) == 0:
+            continue
+
+        left_snap = build_snap_map(
+            [line['bbox'][0] for block in inside for line in block['lines']], np.min)
+        right_snap = build_snap_map(
+            [line['bbox'][2] for block in inside for line in block['lines']], np.max)
+
+        for block in inside:
+            for line in block['lines']:
+                bbox = line['bbox']
+                left, right = bbox[0], bbox[2]
+                if left in left_snap:
+                    bbox[0] = int(left_snap[left])
+                if right in right_snap:
+                    bbox[2] = int(right_snap[right])
+                # edges that belong to no cluster stay as they are
+
+        # Line extents changed, so every block box has to be rebuilt.
+        for block in inside:
+            block['bbox'] = [
+                min(line['bbox'][0] for line in block['lines']),
+                min(line['bbox'][1] for line in block['lines']),
+                max(line['bbox'][2] for line in block['lines']),
+                max(line['bbox'][3] for line in block['lines']),
+            ]
+
+        # The layout box is the union of the rebuilt block boxes.
+        new_layout_bboxes.append([
+            min(block['bbox'][0] for block in inside),
+            min(block['bbox'][1] for block in inside),
+            max(block['bbox'][2] for block in inside),
+            max(block['bbox'][3] for block in inside),
+        ])
+
+    return new_layout_bboxes
+
+
+def __align_text_in_layout(blocks, layout_bboxes):
+    """
+    Clip every line horizontally to the layout box that contains its block.
+
+    OCR lines sometimes carry blank margins that stick out of the layout; the
+    overhanging part is cut off at the layout's left and right edges. Line
+    boxes are modified in place and nothing is returned.
+    """
+    for layout in layout_bboxes:
+        bounds = layout['layout_bbox']
+        for block in [b for b in blocks if is_in_layout(b['bbox'], bounds)]:
+            for line in block['lines']:
+                bbox = line['bbox']
+                left, right = bbox[0], bbox[2]
+                if left < bounds[0]:
+                    bbox[0] = bounds[0]
+                if right > bounds[2]:
+                    bbox[2] = bounds[2]
+    
+ 
+def __common_pre_proc(blocks, layout_bboxes):
+    """
+    Language-independent pre-processing of the text blocks.
+
+    Clips the lines to their layout boxes, then aligns line edges inside each
+    layout and returns the layout boxes produced by that alignment step.
+    """
+    # The __add_line_period(blocks, layout_bboxes) step is intentionally left disabled.
+    __align_text_in_layout(blocks, layout_bboxes)
+    return __valign_lines(blocks, layout_bboxes)
+
+def __pre_proc_zh_blocks(blocks, layout_bboxes):
+    """
+    Placeholder for Chinese-specific pre-processing before paragraph splitting.
+
+    Not implemented: both arguments are ignored and ``None`` is returned.
+    """
+    return None
+
+
+def __pre_proc_en_blocks(blocks, layout_bboxes):
+    """
+    Placeholder for English-specific pre-processing before paragraph splitting.
+
+    Not implemented: both arguments are ignored and ``None`` is returned.
+    """
+    return None
+
+
+def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
+    """
+    Collect, for every layout box, the lines of all blocks located in it.
+
+    Returns a list with one entry per element of ``layout_bboxes`` (in the
+    same order); each entry is the flat list of lines of the matching blocks.
+    ``lang`` is accepted but not used.
+    """
+    lines_group = []
+    for layout in layout_bboxes:
+        collected = []
+        for block in blocks:
+            if is_in_layout(block['bbox'], layout['layout_bbox']):
+                collected.extend(block['lines'])
+        lines_group.append(collected)
+    return lines_group
+    
+
+def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_len=10):
+    """
+    Split the lines of every layout box into paragraphs.
+
+    ``lines_group`` holds one list of lines per layout box. Each group is
+    first partitioned into list and plain-text segments:
+
+    * In a list segment, a flush-left line opens a new paragraph and the
+      indented lines that follow belong to it.
+    * In a text segment, a text / inline-equation line closes the paragraph
+      unless it reaches the right edge of the layout (within
+      ``1.5 * char_avg_len``) and the next line of the layout starts flush
+      left. Any other line type (the original notes mention images, tables
+      and interline equations) forms a paragraph of its own.
+
+    Args:
+        lines_group: list of line lists, one per layout box.
+        new_layout_bbox: layout boxes used to look up the left and right edges.
+        lang: language code forwarded to the list detection.
+        char_avg_len: assumed average character width, used for the
+            right-edge tolerance.
+
+    Returns:
+        ``(layout_paras, list_info)``. ``layout_paras`` has one entry per
+        non-empty group, each a list of paragraphs (a paragraph is a list of
+        lines). ``list_info`` has one ``[starts_with_list, ends_with_list]``
+        pair per such entry.
+    """
+    list_info = []
+    layout_paras = []
+    right_tail_distance = 1.5 * char_avg_len
+
+    for lines in lines_group:
+        total_lines = len(lines)
+        if total_lines == 0:
+            continue  # nothing to split
+        if total_lines == 1:  # a single line cannot be split any further
+            layout_paras.append([lines])
+            list_info.append([False, False])
+            continue
+
+        text_segments, _ = __detect_list_lines(lines, new_layout_bbox, lang)
+
+        layout_bbox = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)
+        layout_left, layout_right = layout_bbox[0], layout_bbox[2]
+
+        paras = []
+        para = []  # lines of the paragraph currently being assembled
+        layout_list_info = [False, False]  # [layout starts with a list, layout ends with a list]
+        for content_type, start, end in text_segments:
+            if content_type == 'list':
+                for line in lines[start:end+1]:
+                    # A flush-left line starts a new list item.
+                    if line['bbox'][0] == layout_left and len(para) > 0:
+                        paras.append(para)
+                        para = []
+                    para.append(line)
+                if len(para) > 0:
+                    paras.append(para)
+                    para = []
+                if start == 0:
+                    layout_list_info[0] = True
+                if end == total_lines - 1:
+                    layout_list_info[1] = True
+            else:  # plain text
+                for line_idx, line in enumerate(lines[start:end+1], start):
+                    cur_line_type = line['spans'][-1]['type']
+                    # line_idx is the absolute position in ``lines``, so the
+                    # look-ahead really is the line that follows this one even
+                    # when the segment does not start at index 0.
+                    next_line = lines[line_idx+1] if line_idx < total_lines - 1 else None
+
+                    if cur_line_type in [TEXT, INLINE_EQUATION]:
+                        para.append(line)
+                        fills_row = line['bbox'][2] >= layout_right - right_tail_distance
+                        continues = fills_row and next_line and next_line['bbox'][0] == layout_left
+                        if not continues:
+                            # Short line, last line, or the next line is indented.
+                            paras.append(para)
+                            para = []
+                    else:
+                        # Non-text line: flush the pending paragraph, then
+                        # emit this line as a paragraph of its own.
+                        if len(para) > 0:
+                            paras.append(para)
+                        paras.append([line])
+                        para = []
+
+                if len(para) > 0:
+                    paras.append(para)
+                    para = []
+
+        list_info.append(layout_list_info)
+        layout_paras.append(paras)
+
+    return layout_paras, list_info
+
+def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang):
+    """
+    Continue a list that runs from one layout box into the next one.
+
+    When a layout ends with a list and the following layout does not start
+    with one, the leading single-line paragraphs of the following layout that
+    are indented relative to their layout's left edge are collected. If they
+    all share the same left edge they are appended to the last paragraph of
+    the previous layout and removed from the following one.
+
+    Args:
+        layout_paras: per layout, a list of paragraphs (lists of lines);
+            modified in place.
+        new_layout_bbox: layout boxes used to look up each line's layout.
+        layout_list_info: per layout, ``[starts_with_list, ends_with_list]``.
+        page_num: page number, used for logging only.
+        lang: unused.
+
+    Returns:
+        ``(layout_paras, [page_starts_with_list, page_ends_with_list])``;
+        the flags are ``[False, False]`` when there is nothing to process.
+    """
+    if len(layout_paras) == 0 or len(layout_list_info) == 0:  # the final return would fail on empty input
+        return layout_paras, [False, False]
+
+    for i in range(1, len(layout_paras)):
+        pre_paras = layout_paras[i-1]
+        next_paras = layout_paras[i]
+        if len(pre_paras) == 0 or len(next_paras) == 0:
+            # A layout can be emptied by an earlier iteration of this loop;
+            # there is then nothing to attach to or nothing to move.
+            continue
+
+        pre_layout_list_info = layout_list_info[i-1]
+        next_layout_list_info = layout_list_info[i]
+        if not (pre_layout_list_info[1] and not next_layout_list_info[0]):
+            continue
+
+        # Previous layout ends with a list, this one starts with plain text:
+        # look for leading lines that share one indentation.
+        logger.info(f"连接page {page_num} 内的list")
+        may_list_lines = []
+        for para in next_paras:
+            if len(para) != 1:  # only single-line paragraphs are considered for now
+                break
+            line = para[0]
+            layout = __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)
+            if layout is None or not line['bbox'][0] > layout[0]:
+                break
+            may_list_lines.append(line)
+
+        # Equal indentation: move the lines to the end of the previous layout's last paragraph.
+        if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
+            pre_paras[-1].extend(may_list_lines)
+            layout_paras[i] = next_paras[len(may_list_lines):]
+
+    # Also report whether the page as a whole starts / ends with a list.
+    return layout_paras, [layout_list_info[0][0], layout_list_info[-1][1]]
+
+
+def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox,  pre_page_list_info, next_page_list_info, page_num, lang):
+    """
+    Continue a list that runs from the end of one page onto the next page.
+
+    Applies when the previous page ends with a list and the next page does
+    not start with one. The leading single-line paragraphs of the next page's
+    first layout that are indented relative to their layout's left edge are
+    collected; if they all share one left edge they are moved into the last
+    layout of the previous page and removed from the next page.
+
+    Returns ``True`` when lines were moved, ``False`` otherwise.
+
+    NOTE(review): the collected lines are appended to the previous page's last
+    layout as one new paragraph, whereas __connect_list_inter_layout extends
+    the last existing paragraph - confirm which behaviour is intended.
+    """
+    if len(pre_page_paras) == 0 or len(next_page_paras) == 0:
+        return False
+    if not pre_page_list_info[1] or next_page_list_info[0]:
+        return False
+
+    logger.info(f"连接page {page_num} 内的list")
+    candidates = []
+    for para in next_page_paras[0]:
+        if len(para) != 1:  # multi-line paragraphs are not analysed yet
+            break
+        only_line = para[0]
+        if only_line['bbox'][0] > __find_layout_bbox_by_line(only_line['bbox'], next_page_layout_bbox)[0]:
+            candidates.append(only_line)
+            continue
+        break
+
+    if len(candidates) == 0:
+        return False
+    if len(set([c['bbox'][0] for c in candidates])) != 1:
+        return False
+
+    pre_page_paras[-1].append(candidates)
+    next_page_paras[0] = next_page_paras[0][len(candidates):]
+    return True
+
+
+def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
+    """
+    Return the first layout box that ``is_in_layout`` accepts for ``line_bbox``.
+
+    ``None`` is returned when no layout box matches.
+    """
+    return next((candidate for candidate in layout_bboxes if is_in_layout(line_bbox, candidate)), None)
+
+
+def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
+    """
+    Join paragraphs that continue from one layout box into the next.
+
+    The last line of the previously emitted layout and the first line of the
+    current layout are joined when all of the following hold:
+
+    1. both lines are text / inline-equation lines;
+    2. the previous line ends exactly at the right edge of its layout and its
+       text does not end with one of ``LINE_STOP_FLAG``;
+    3. the current line starts exactly at the left edge of its layout.
+
+    On a join, the first paragraph of the current layout is appended to the
+    last paragraph of the previous layout and removed from the current one;
+    a layout left without paragraphs is not emitted.
+
+    Args:
+        layout_paras: per layout, a list of paragraphs (lists of lines). The
+            inner lists are modified in place; the outer list is not resized.
+        new_layout_bbox: layout boxes used to look up the edges.
+        lang: unused.
+
+    Returns:
+        The list of layouts after joining. Empty layouts other than the first
+        are skipped.
+    """
+    connected_layout_paras = []
+    if len(layout_paras) == 0:
+        return connected_layout_paras
+
+    connected_layout_paras.append(layout_paras[0])
+    for i in range(1, len(layout_paras)):
+        cur_paras = layout_paras[i]
+        if len(cur_paras) == 0:
+            continue  # nothing to emit for this layout
+
+        # Compare against the layout emitted last, so layouts that were empty
+        # or fully merged away cannot shift indices or drop the current one.
+        pre_paras = connected_layout_paras[-1]
+        if len(pre_paras) == 0 or len(pre_paras[-1]) == 0 or len(cur_paras[0]) == 0:
+            connected_layout_paras.append(cur_paras)
+            continue
+
+        pre_last_line = pre_paras[-1][-1]
+        next_first_line = cur_paras[0][0]
+        pre_last_line_type = pre_last_line['spans'][-1]['type']
+        next_first_line_type = next_first_line['spans'][0]['type']
+        if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
+            connected_layout_paras.append(cur_paras)
+            continue
+
+        pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]).strip()
+        pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
+        next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)
+
+        can_connect = (
+            pre_layout is not None
+            and next_layout is not None
+            and len(pre_last_line_text) > 0
+            and pre_last_line['bbox'][2] == pre_layout[2]  # previous line fills its row
+            and pre_last_line_text[-1] not in LINE_STOP_FLAG  # and does not end a sentence
+            and next_first_line['bbox'][0] == next_layout[0]  # current line is not indented
+        )
+        if can_connect:
+            pre_paras[-1].extend(cur_paras[0])
+            cur_paras.pop(0)  # already merged into the previous layout's last paragraph
+            if len(cur_paras) > 0:
+                connected_layout_paras.append(cur_paras)
+        else:
+            connected_layout_paras.append(cur_paras)
+
+    return connected_layout_paras
+
+
+def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang):
+    """
+    Join the last paragraph of one page with the first paragraph of the next.
+
+    The paragraphs are joined when all of the following hold:
+
+    1. the last line of the previous page and the first line of the next page
+       are text / inline-equation lines;
+    2. the previous line ends exactly at the right edge of its layout and its
+       text does not end with one of ``LINE_STOP_FLAG``;
+    3. the next line starts exactly at the left edge of its layout.
+
+    On a join the next page's first paragraph is appended to the previous
+    page's last paragraph and removed from the next page (both in place).
+
+    Args:
+        pre_page_paras / next_page_paras: per layout, a list of paragraphs
+            (lists of lines) of the two pages.
+        pre_page_layout_bbox / next_page_layout_bbox: layout boxes of the two
+            pages, used to look up the edges.
+        page_num, lang: unused.
+
+    Returns:
+        ``True`` when the paragraphs were joined, ``False`` otherwise.
+    """
+    # Some pages contain no text at all.
+    if len(pre_page_paras) == 0 or len(next_page_paras) == 0:
+        return False
+    # Guard exactly the elements indexed below: the LAST layout of the
+    # previous page and the FIRST layout of the next page.
+    pre_last_layout = pre_page_paras[-1]
+    next_first_layout = next_page_paras[0]
+    if len(pre_last_layout) == 0 or len(next_first_layout) == 0:
+        return False
+    pre_last_para = pre_last_layout[-1]
+    next_first_para = next_first_layout[0]
+    if len(pre_last_para) == 0 or len(next_first_para) == 0:
+        return False
+
+    pre_last_line = pre_last_para[-1]
+    next_first_line = next_first_para[0]
+    pre_last_line_type = pre_last_line['spans'][-1]['type']
+    next_first_line_type = next_first_line['spans'][0]['type']
+    # TODO: joining across tables, images and interline equations is not handled.
+    if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
+        return False
+
+    pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]).strip()
+    if len(pre_last_line_text) == 0:
+        return False  # no last character to test against LINE_STOP_FLAG
+
+    pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)
+    next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)
+    if pre_layout is None or next_layout is None:
+        return False  # edges unknown, so the alignment tests cannot be made
+
+    fills_row = pre_last_line['bbox'][2] == pre_layout[2]
+    sentence_open = pre_last_line_text[-1] not in LINE_STOP_FLAG
+    starts_flush = next_first_line['bbox'][0] == next_layout[0]
+    if fills_row and sentence_open and starts_flush:
+        pre_last_para.extend(next_first_para)
+        next_first_layout.pop(0)  # already merged into the previous page's last paragraph
+        return True
+    return False
+
+def find_consecutive_true_regions(input_array):
+    """
+    Locate runs of at least two consecutive truthy elements.
+
+    Returns a list of ``(start, end)`` index pairs (both inclusive), in order
+    of appearance. Runs consisting of a single element are not reported.
+    """
+    regions = []
+    run_start = None  # index where the current truthy run began, if any
+
+    for idx, flag in enumerate(input_array):
+        if flag:
+            if run_start is None:
+                run_start = idx
+            continue
+        if run_start is not None:
+            # A falsy element closes the run; keep it only if it is longer than one.
+            if idx - run_start > 1:
+                regions.append((run_start, idx - 1))
+            run_start = None
+
+    # A run that extends to the very end has not been closed by the loop.
+    total = len(input_array)
+    if run_start is not None and total - run_start > 1:
+        regions.append((run_start, total - 1))
+
+    return regions
+
+
+def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode):
+    """
+    Merge runs of consecutive, centred single-line text paragraphs into one paragraph.
+
+    For every layout of the page, runs of two or more consecutive paragraphs
+    that consist of exactly one line whose first span is of type TEXT are
+    examined. A run is merged when:
+
+    1. the line heights of the run have a standard deviation below 2;
+    2. the layout's horizontal centre lies strictly between x0 and x1 of
+       every line of the run;
+    3. not all lines start at the layout's left edge, and not all lines end
+       at the layout's right edge.
+
+    ``page_paras`` is modified in place and nothing is returned. ``page_num``
+    and ``lang`` are not used.
+
+    NOTE(review): layouts are paired with ``new_layout_bbox`` by position, so
+    this assumes both sequences are index-aligned - confirm against the caller.
+    """
+    
+    for layout_i, layout_para in enumerate(page_paras):
+        layout_box = new_layout_bbox[layout_i]
+        # One flag per paragraph: True for a single-line paragraph whose first span is TEXT.
+        single_line_paras_tag = []
+        for i in range(len(layout_para)):
+            single_line_paras_tag.append(len(layout_para[i])==1 and layout_para[i][0]['spans'][0]['type']==TEXT)
+            
+        # Find runs of consecutive single-line text paragraphs (length >= 2).
+        """找出来连续的单行文本，如果连续行高度相同，那么合并为一个段落。"""
+        consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
+        if len(consecutive_single_line_indices)>0:
+            # Every merge shrinks layout_para, so the indices computed above
+            # have to be shifted by the number of paragraphs removed so far.
+            index_offset = 0
+            # Check whether the lines of each run have the same height and are centred.
+            """检查这些行是否是高度相同的，居中的"""
+            for start, end in consecutive_single_line_indices:
+                start += index_offset
+                end += index_offset
+                line_hi = np.array([line[0]['bbox'][3]-line[0]['bbox'][1] for line in layout_para[start:end+1]])
+                first_line_text = ''.join([__get_span_text(span) for span in layout_para[start][0]['spans']])
+                # No-op branch: captions starting with "Table"/"Figure" are detected but not treated specially.
+                if "Table" in first_line_text or "Figure" in first_line_text:
+                    pass
+                if debug_mode:
+                    logger.debug(line_hi.std())
+                
+                if line_hi.std()<2:
+                    # Heights are (nearly) equal; now test whether the run is centred.
+                    """行高度相同，那么判断是否居中"""
+                    all_left_x0 = [line[0]['bbox'][0] for line in layout_para[start:end+1]]
+                    all_right_x1 = [line[0]['bbox'][2] for line in layout_para[start:end+1]]
+                    layout_center = (layout_box[0] + layout_box[2]) / 2
+                    if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \
+                    and not all([x0==layout_box[0] for x0 in all_left_x0]) \
+                    and not all([x1==layout_box[2] for x1 in all_right_x1]):
+                        merge_para = [l[0] for l in layout_para[start:end+1]]
+                        para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
+                        if debug_mode:
+                            logger.debug(para_text)
+                        # Replace the whole run by a single paragraph holding all of its lines.
+                        layout_para[start:end+1] = [merge_para]
+                        index_offset -= end-start
+                        
+    return
+            
+
+def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
+    """
+    Placeholder: merge a flush-left line with the indented, aligned single-line
+    paragraphs that follow it into one paragraph.
+
+    Not implemented: all arguments are ignored and ``None`` is returned.
+    """
+    return None
+
+
+def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
+    """
+    Split one page into paragraphs based on its lines and layout boxes.
+
+    This is a simple approach driven by line-ending features:
+
+    1. scan the lines of each layout for lines that end some distance before
+       the layout's right edge;
+    2. among those, look for lines that end with a sentence terminator;
+    3. split the paragraphs according to these line-ending features;
+    4. images and tables occupy a line of their own and are not split.
+
+    Returns ``(paragraphs_per_layout, page_list_info)`` where
+    ``page_list_info`` tells whether the page starts / ends with a list.
+    """
+    # Lines grouped per layout box.
+    grouped_lines = __group_line_by_layout(blocks, layout_bboxes, lang)
+    # Paragraph split inside each layout box.
+    split_paras, per_layout_list_info = __split_para_in_layoutbox(grouped_lines, new_layout_bbox, lang)
+    # Lists continuing across layout boxes.
+    list_joined_paras, page_list_info = __connect_list_inter_layout(
+        split_paras, new_layout_bbox, per_layout_list_info, page_num, lang)
+    # Paragraphs continuing across layout boxes.
+    joined_paras = __connect_para_inter_layoutbox(list_joined_paras, new_layout_bbox, lang)
+
+    return joined_paras, page_list_info
+       
+    
+def para_split(pdf_info_dict, debug_mode, lang="en"):
+    """
+    Split every page of ``pdf_info_dict`` into paragraphs, in place.
+
+    Each page's ``preproc_blocks`` and ``layout_bboxes`` are read and the
+    result is stored under the page's ``para_blocks`` key. Paragraphs and
+    lists that continue across a page break are then joined, and finally
+    runs of centred single-line paragraphs are merged. Nothing is returned.
+
+    Args:
+        pdf_info_dict: mapping of page key to page dict.
+        debug_mode: when truthy, successful cross-page joins are logged.
+        lang: language code forwarded to the splitting helpers.
+    """
+    layouts_per_page = []  # one list of aligned layout boxes per page
+    list_info_per_page = []  # per page: [starts with a list, ends with a list]
+    for page_key, page in pdf_info_dict.items():
+        page_blocks = page['preproc_blocks']
+        page_layouts = page['layout_bboxes']
+        aligned_layouts = __common_pre_proc(page_blocks, page_layouts)
+        layouts_per_page.append(aligned_layouts)
+        page_paras, page_list_info = __do_split_page(page_blocks, page_layouts, aligned_layouts, page_key, lang)
+        list_info_per_page.append(page_list_info)
+        page['para_blocks'] = page_paras
+
+    # Join paragraphs and lists that continue from one page onto the next.
+    pdf_infos = list(pdf_info_dict.values())
+    for page_num in range(1, len(pdf_infos)):
+        pre_page_paras = pdf_infos[page_num-1]['para_blocks']
+        next_page_paras = pdf_infos[page_num]['para_blocks']
+        pre_page_layout_bbox = layouts_per_page[page_num-1]
+        next_page_layout_bbox = layouts_per_page[page_num]
+
+        is_conn = __connect_para_inter_page(
+            pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
+        if debug_mode and is_conn:
+            logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
+
+        is_list_conn = __connect_list_inter_page(
+            pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox,
+            list_info_per_page[page_num-1], list_info_per_page[page_num], page_num, lang)
+        if debug_mode and is_list_conn:
+            logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
+
+    # Pick up special cases the steps above may have missed:
+    # 1. a flush-left line followed by several indented lines inside body text;
+    # 2. consecutive centred single lines of equal height, likely one paragraph.
+    for page_num, page in enumerate(pdf_infos):
+        __connect_middle_align_text(
+            page['para_blocks'], layouts_per_page[page_num], page_num, lang, debug_mode=debug_mode)
+        __merge_signle_list_text(page['para_blocks'], layouts_per_page[page_num], page_num, lang)
--- a/magic_pdf/para/para_split_v2.py
+++ b/magic_pdf/para/para_split_v2.py
+import copy
+
+from sklearn.cluster import DBSCAN
+import numpy as np
+from loguru import logger
+import re
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.model.magic_model import MagicModel
+from magic_pdf.libs.Constants import *
+
+# Characters that count as an end-of-sentence mark at the end of a line.
+LINE_STOP_FLAG = ['.', '!', '?', '。', '！', '？', "：", ":", ")", "）", ";"]
+# Short aliases for the span content types used in this module.
+INLINE_EQUATION = ContentType.InlineEquation
+INTERLINE_EQUATION = ContentType.InterlineEquation
+TEXT = ContentType.Text
+# Module-level flag; __detect_list_lines declares it ``global``.
+# NOTE(review): presumably enables debug output - confirm where it is set and read.
+debug_able = False
+
+
+def __get_span_text(span):
+    """
+    Return the span's ``content`` when it is non-empty, else its ``image_path``.
+
+    An empty string is returned when neither key yields a value.
+    """
+    content = span.get('content', '')
+    if len(content) > 0:
+        return content
+    return span.get('image_path', '')
+
+
+def __detect_list_lines(lines, new_layout_bboxes, lang):
+    """
+    Detect list-like runs among ``lines`` and partition the line indices into text/list segments.
+
+    The pattern looked for is a flush-left "head" line (starting with a bullet-like
+    character or a ``<digits>.`` marker) followed by indented continuation lines.
+
+    Every line is first encoded as one integer:
+      0 - the line has no spans, its first span has no text, or it lies in none
+          of ``new_layout_bboxes``
+      1 - flush-left, and the first character is not alphanumeric or the text
+          matches ``if_match_reference_list``
+      4 - flush-left, anything else
+      2 - not flush-left, first character is upper case
+      3 - not flush-left, anything else
+    "Flush-left" means the rounded x0 of the line maps to the smallest cluster
+    value returned by ``cluster_line_x``.
+
+    Args:
+        lines: line dicts; ``line['spans']`` and ``line['bbox']`` are read.
+        new_layout_bboxes: layout boxes handed to ``__find_layout_bbox_by_line``.
+        lang: list detection only runs for ``'en'``.
+
+    Returns:
+        ``(lines, None)`` unchanged when ``lang != 'en'``; otherwise
+        ``(segments, list_start_idx)`` where ``segments`` is a list of
+        ``('text' | 'list', start, end)`` tuples with inclusive line indices and
+        ``list_start_idx`` holds, per list run, the indices of its code-1 lines.
+    """
+    global debug_able
+
+    def find_repeating_patterns2(lst):
+        """Return inclusive ``(start, end)`` ranges of list runs and, per run, the indices holding code 1."""
+        indices = []
+        ones_indices = []
+        i = 0
+        while i < len(lst):
+            if lst[i] != 1:
+                i += 1
+                continue
+            # A code 1 may open a run; swallow every following 1, 2 or 3.
+            start = i
+            ones_in_this_interval = [i]
+            i += 1
+            while i < len(lst) and lst[i] in [1, 2, 3]:
+                if lst[i] == 1:
+                    ones_in_this_interval.append(i)
+                i += 1
+            # Keep the run when it has several heads, or a single head that is
+            # directly followed by an indented line.
+            if len(ones_in_this_interval) > 1 or (start < len(lst) - 1 and lst[start + 1] in [2, 3]):
+                indices.append((start, i - 1))
+                ones_indices.append(ones_in_this_interval)
+        return indices, ones_indices
+
+    def split_indices(slen, index_array):
+        """Cover ``0 .. slen - 1`` with 'list' segments from ``index_array`` and 'text' segments for the gaps."""
+        result = []
+        last_end = 0
+
+        for start, end in sorted(index_array):
+            if start > last_end:
+                # Everything between the previous run and this one is plain text.
+                result.append(('text', last_end, start - 1))
+            result.append(('list', start, end))
+            last_end = end + 1
+
+        if last_end < slen:
+            # Trailing lines after the last run are plain text.
+            result.append(('text', last_end, slen - 1))
+
+        return result
+
+    if lang != 'en':
+        return lines, None
+
+    total_lines = len(lines)
+    line_fea_encode = []
+    if total_lines > 0:
+        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
+    for line in lines:
+        spans = line['spans']
+        # A line without spans cannot be classified; encode it as 0 instead of raising IndexError.
+        span_text = __get_span_text(spans[0]) if spans else ''
+        if not span_text:
+            line_fea_encode.append(0)
+            continue
+        first_char = span_text[0]
+        layout = __find_layout_bbox_by_line(line['bbox'], new_layout_bboxes)
+        if not layout:
+            line_fea_encode.append(0)
+        elif x_map_tag_dict[round(line['bbox'][0])] == min_x_tag:
+            # Flush-left line: list head (1) or ordinary line (4).
+            if not first_char.isalnum() or if_match_reference_list(span_text):
+                line_fea_encode.append(1)
+            else:
+                line_fea_encode.append(4)
+        elif first_char.isupper():
+            line_fea_encode.append(2)
+        else:
+            line_fea_encode.append(3)
+
+    # Runs made of codes 1/2/3 that satisfy find_repeating_patterns2 are treated as lists.
+    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
+    if len(list_indice) > 0:
+        if debug_able:
+            logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
+
+    # Diagnostic pass only: it logs and never changes the result.
+    # TODO: check whether the indented lines of a list are left-aligned with each other.
+    for start, end in list_indice:
+        for i in range(start, end + 1):
+            if i > 0:
+                if line_fea_encode[i] == 4:
+                    if debug_able:
+                        logger.info(f"列表行的第{i}行不是顶格的")
+                    break
+        else:
+            if debug_able:
+                logger.info(f"列表行的第{start}到第{end}行是列表")
+
+    return split_indices(total_lines, list_indice), list_start_idx
+
+
+def cluster_line_x(lines: list) -> tuple:
+    """
+    Cluster the rounded left edges (x0) of all lines of one block with DBSCAN.
+
+    Args:
+        lines: line dicts; only ``line['bbox'][0]`` is read.
+
+    Returns:
+        ``(x0_2_new_val, min_x0)``. ``x0_2_new_val`` maps every rounded x0 to the
+        smallest x0 of its cluster; ``min_x0`` is the smallest of those cluster
+        values (never above the rounded x0 of the first line). For empty
+        ``lines`` the result is ``({}, None)``.
+    """
+    if not lines:
+        # Nothing to cluster; DBSCAN and lines[0] would both fail on empty input.
+        return {}, None
+
+    min_distance = 5
+    min_sample = 1
+    # DBSCAN needs 2-D samples, so each x0 is paired with a constant 0.
+    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
+    x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
+    x0_uniq_label = np.unique(x0_clusters.labels_)
+    x0_2_new_val = {}  # rounded raw x0 -> representative x0 of its cluster
+    min_x0 = round(lines[0]["bbox"][0])
+    for label in x0_uniq_label:
+        if label == -1:
+            # -1 is DBSCAN's noise label; such points get no mapping.
+            continue
+        x0_index_of_label = np.where(x0_clusters.labels_ == label)
+        x0_raw_val = x0_lst[x0_index_of_label][:, 0]
+        x0_new_val = np.min(x0_raw_val)
+        x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val})
+        if x0_new_val < min_x0:
+            min_x0 = x0_new_val
+    return x0_2_new_val, min_x0
+
+
+def if_match_reference_list(text: str) -> bool:
+    """Return True when *text* begins with one or more digits immediately followed by a dot."""
+    return re.match(r'^\d+\..*', text) is not None
+
+
+def __valign_lines(blocks, layout_bboxes):
+    """
+    Snap the left and right edges of text lines inside each layout box onto shared values.
+
+    For every layout box, the x0 values (and separately the x1 values) of all
+    lines of its text blocks are clustered with DBSCAN. Each clustered x0 is
+    replaced by the minimum of its cluster and each clustered x1 by the maximum
+    of its cluster; unclustered edges stay as they are. Afterwards every block
+    gets a ``bbox_fs`` rebuilt from its lines, and the layout box is rebuilt
+    from those ``bbox_fs`` values.
+
+    ``min_distance = 3`` is an empirical value. TODO: derive it instead, e.g. as
+    the width of about 1.5 body-text characters.
+
+    Returns a list with one bbox per entry of ``layout_bboxes``: the original
+    ``layout_bbox`` when the box holds no text block (or its first text block
+    has no lines), otherwise the recomputed ``[x0, y0, x1, y1]``.
+    """
+
+    min_distance = 3
+    min_sample = 2
+    new_layout_bboxes = []
+    # add bbox_fs for para split calculation
+    for block in blocks:
+        block["bbox_fs"] = copy.deepcopy(block["bbox"])
+    for layout_box in layout_bboxes:
+        blocks_in_layoutbox = [b for b in blocks if
+                               b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
+        # NOTE(review): only the first block's lines are checked for emptiness here.
+        if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
+            new_layout_bboxes.append(layout_box['layout_bbox'])
+            continue
+
+        # DBSCAN needs 2-D samples, so each edge coordinate is paired with a constant 0.
+        x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
+        x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']])
+        x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
+        x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
+        x0_uniq_label = np.unique(x0_clusters.labels_)
+        x1_uniq_label = np.unique(x1_clusters.labels_)
+
+        x0_2_new_val = {}  # maps an old edge value to its new (snapped) value
+        x1_2_new_val = {}
+        for label in x0_uniq_label:
+            if label == -1:
+                # -1 is the DBSCAN noise label: leave those edges alone
+                continue
+            x0_index_of_label = np.where(x0_clusters.labels_ == label)
+            x0_raw_val = x0_lst[x0_index_of_label][:, 0]
+            x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
+            x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val})
+        for label in x1_uniq_label:
+            if label == -1:
+                continue
+            x1_index_of_label = np.where(x1_clusters.labels_ == label)
+            x1_raw_val = x1_lst[x1_index_of_label][:, 0]
+            x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0])
+            x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val})
+
+        for block in blocks_in_layoutbox:
+            for line in block['lines']:
+                x0, x1 = line['bbox'][0], line['bbox'][2]
+                if x0 in x0_2_new_val:
+                    line['bbox'][0] = int(x0_2_new_val[x0])
+
+                if x1 in x1_2_new_val:
+                    line['bbox'][2] = int(x1_2_new_val[x1])
+            # edges that belong to no cluster are left untouched
+
+        # line extents were modified above, so recompute each block's bbox from its lines
+        for block in blocks_in_layoutbox:
+            if len(block["lines"]) > 0:
+                block['bbox_fs'] = [min([line['bbox'][0] for line in block['lines']]),
+                                    min([line['bbox'][1] for line in block['lines']]),
+                                    max([line['bbox'][2] for line in block['lines']]),
+                                    max([line['bbox'][3] for line in block['lines']])]
+        # Recompute the layout bbox because the block bboxes changed.
+        """新计算layout的bbox，因为block的bbox变了。"""
+        layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
+        layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
+        layout_x1 = max([block['bbox_fs'][2] for block in blocks_in_layoutbox])
+        layout_y1 = max([block['bbox_fs'][3] for block in blocks_in_layoutbox])
+        new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
+
+    return new_layout_bboxes
+
+
+def __align_text_in_layout(blocks, layout_bboxes):
+    """
+    Clip text lines horizontally to the layout box that contains their block.
+
+    OCR lines sometimes carry blank margins before or after the text; any part
+    of a line sticking out past the left or right edge of the layout is cut off.
+    Line bboxes are modified in place.
+    """
+    for layout in layout_bboxes:
+        layout_rect = layout['layout_bbox']
+        text_blocks_here = [
+            candidate for candidate in blocks
+            if candidate["type"] == BlockType.Text and is_in_layout(candidate['bbox'], layout_rect)
+        ]
+        for text_block in text_blocks_here:
+            for text_line in text_block.get("lines", []):
+                line_rect = text_line['bbox']
+                # max/min return the line's own value unless it crosses the layout edge.
+                line_rect[0] = max(line_rect[0], layout_rect[0])
+                line_rect[2] = min(line_rect[2], layout_rect[2])
+
+
+def __common_pre_proc(blocks, layout_bboxes):
+    """
+    Language-independent pre-processing of the text blocks.
+
+    Clips lines to their layout box, then snaps line edges and returns the
+    layout bboxes produced by that alignment step.
+    """
+    __align_text_in_layout(blocks, layout_bboxes)
+    return __valign_lines(blocks, layout_bboxes)
+
+
+def __pre_proc_zh_blocks(blocks, layout_bboxes):
+    """
+    Paragraph-splitting pre-processing for Chinese text (placeholder, does nothing).
+    """
+    return None
+
+
+def __pre_proc_en_blocks(blocks, layout_bboxes):
+    """
+    Paragraph-splitting pre-processing for English text (placeholder, does nothing).
+    """
+    return None
+
+
+def __group_line_by_layout(blocks, layout_bboxes):
+    """
+    Group blocks by layout box.
+
+    Returns one list per entry of ``layout_bboxes`` (same order) holding the
+    blocks whose ``bbox_fs`` is inside that layout's ``layout_bbox``.
+    Currently one block is treated as one paragraph.
+    """
+    return [
+        [blk for blk in blocks if is_in_layout(blk.get('bbox_fs', None), layout['layout_bbox'])]
+        for layout in layout_bboxes
+    ]
+
+
+def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
+    """
+    Detect lists inside each layout box and mark paragraph breaks before list items.
+
+    For every group (the blocks of one layout box) the lines of its text blocks
+    are passed to ``__detect_list_lines``. For each detected list run with more
+    than one list head, the line preceding every head gets ``'\\n\\n'`` appended
+    to the content of its last span (equation spans are skipped).
+
+    Args:
+        blocks_group: list of block lists, one per layout box.
+        new_layout_bbox: layout bboxes forwarded to ``__detect_list_lines``.
+        lang: language code; list detection is only performed for ``'en'``.
+
+    Returns:
+        One ``[starts_with_list, ends_with_list]`` pair per group. A flag is only
+        True when the list touches the first/last line and the first/last block
+        of the group is a text block.
+    """
+    list_info = []  # per layout: [begins with a list, ends with a list]
+    for blocks in blocks_group:
+        if len(blocks) == 0:
+            list_info.append([False, False])
+            continue
+        first_is_text = blocks[0]["type"] == BlockType.Text
+        last_is_text = blocks[-1]["type"] == BlockType.Text
+        if not first_is_text and not last_is_text:
+            list_info.append([False, False])
+            continue
+
+        lines = [line for block in blocks if
+                 block["type"] == BlockType.Text for line in
+                 block['lines']]
+        total_lines = len(lines)
+        if total_lines <= 1:
+            list_info.append([False, False])
+            continue
+
+        text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang)
+        if list_start_line is None:
+            # __detect_list_lines returns (lines, None) for languages it does not
+            # handle; there is no list information, so do not iterate the result.
+            list_info.append([False, False])
+            continue
+
+        # Insert a paragraph break in front of every list head of a multi-item run.
+        for list_start in list_start_line:
+            if len(list_start) <= 1:
+                continue
+            for head_index in list_start:
+                index = head_index - 1
+                if index < 0:
+                    continue
+                last_span = lines[index]["spans"][-1]
+                if "content" in last_span and last_span.get('type', '') not in [
+                        ContentType.InlineEquation, ContentType.InterlineEquation]:
+                    last_span["content"] += '\n\n'
+
+        layout_list_info = [False, False]
+        for content_type, start, end in text_segments:
+            if content_type == 'list':
+                if start == 0 and first_is_text:
+                    layout_list_info[0] = True
+                if end == total_lines - 1 and last_is_text:
+                    layout_list_info[1] = True
+
+        list_info.append(layout_list_info)
+    return list_info
+
+
+def __split_para_lines(lines: list, text_blocks: list) -> list:
+    """
+    Group lines into paragraphs and return them ordered top to bottom.
+
+    A line containing a table, image or interline-equation span becomes a
+    paragraph of its own. The remaining lines are grouped per text block: every
+    line whose bbox is inside the block's bbox joins that block's paragraph.
+
+    Args:
+        lines: line dicts with ``"spans"`` and ``"bbox"``.
+        text_blocks: block dicts; only ``block["bbox"]`` is read.
+
+    Returns:
+        List of paragraphs (each a non-empty list of lines) sorted by the top
+        y coordinate of their first line.
+    """
+    text_paras = []
+    other_paras = []
+    text_lines = []
+    for line in lines:
+        # The span types live under line["spans"]; iterating the line dict itself yields its keys.
+        spans_types = [span["type"] for span in line["spans"]]
+        if (ContentType.Table in spans_types
+                or ContentType.Image in spans_types
+                or ContentType.InterlineEquation in spans_types):
+            other_paras.append([line])
+        else:
+            text_lines.append(line)
+
+    for block in text_blocks:
+        block_bbox = block["bbox"]
+        para = [line for line in text_lines if is_in_layout(line["bbox"], block_bbox)]
+        if len(para) > 0:
+            text_paras.append(para)
+
+    # list.extend returns None, so build the combined list by concatenation.
+    paras = other_paras + text_paras
+    return sorted(paras, key=lambda x: x[0]["bbox"][1])
+
+
+def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, page_num, lang):
+    """
+    Continue a list across two consecutive layout boxes of one page.
+
+    When the previous layout ends with a list and the next layout does not start
+    with one, the leading lines of the next layout's first text block that are
+    indented relative to their layout box are inspected. If they all share the
+    same x0, they are moved to the end of the previous layout's last block.
+
+    Args:
+        blocks_group: list of block lists, one per layout box (modified in place).
+        new_layout_bbox: layout bboxes used to find the layout of a line.
+        layout_list_info: per layout ``[starts_with_list, ends_with_list]``.
+        page_num: only used in the debug log message.
+        lang: unused here.
+
+    Returns:
+        ``(blocks_group, [page_starts_with_list, page_ends_with_list])``.
+    """
+    global debug_able
+    if len(blocks_group) == 0:  # the final return would fail on an empty group list
+        return blocks_group, [False, False]
+
+    for i in range(1, len(blocks_group)):
+        if len(blocks_group[i]) == 0 or len(blocks_group[i - 1]) == 0:
+            continue
+        pre_layout_list_info = layout_list_info[i - 1]
+        next_layout_list_info = layout_list_info[i]
+        pre_last_block = blocks_group[i - 1][-1]
+        next_first_para = blocks_group[i][0]
+
+        # Only act when the previous layout ends with a list and the next one starts with plain text.
+        if not (pre_layout_list_info[1] and not next_layout_list_info[0]
+                and next_first_para["type"] == BlockType.Text):
+            continue
+        if debug_able:
+            logger.info(f"连接page {page_num} 内的list")
+        if "lines" not in pre_last_block:
+            # Nowhere to attach the lines; moving them would silently drop them.
+            continue
+
+        # Collect the leading lines that are indented relative to their layout box.
+        may_list_lines = []
+        for line in next_first_para.get("lines", []):
+            layout = __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)
+            # A line that lies in no layout cannot be judged as indented: stop there.
+            if layout is not None and line['bbox'][0] > layout[0]:
+                may_list_lines.append(line)
+            else:
+                break
+        # Equal indentation => they continue the last list item of the previous layout.
+        if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
+            pre_last_block["lines"].extend(may_list_lines)
+            next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
+
+    # Also report whether the page as a whole starts / ends with a list.
+    return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]]
+
+
+def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox,
+                              pre_page_list_info, next_page_list_info, page_num, lang):
+    """
+    Continue a list across a page break.
+
+    When the previous page ends with a list and the next page does not start
+    with one, the leading lines of the next page's first text block that are
+    indented relative to their layout box are inspected. If they all share the
+    same x0, their spans are tagged with ``CROSS_PAGE`` and the lines are moved
+    to the end of the previous page's last block.
+
+    Args:
+        pre_page_paras / next_page_paras: per-layout block lists of both pages.
+        pre_page_layout_bbox: unused here.
+        next_page_layout_bbox: layout bboxes of the next page.
+        pre_page_list_info / next_page_list_info: ``[starts_with_list, ends_with_list]``.
+        page_num: only used in the debug log message.
+        lang: unused here.
+
+    Returns:
+        True when lines were moved to the previous page, otherwise False.
+    """
+    if len(pre_page_paras) == 0 or len(next_page_paras) == 0:
+        return False
+    if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
+        return False
+    pre_page_last_para = pre_page_paras[-1][-1]
+    next_page_first_para = next_page_paras[0][0]
+    if pre_page_last_para["type"] != BlockType.Text or next_page_first_para["type"] != BlockType.Text:
+        return False
+    # Only act when the previous page ends with a list and the next one starts with plain text.
+    if not (pre_page_list_info[1] and not next_page_list_info[0]):
+        return False
+
+    if debug_able:
+        logger.info(f"连接page {page_num} 内的list")
+    # Collect the leading lines that are indented relative to their layout box.
+    may_list_lines = []
+    for line in next_page_first_para["lines"]:
+        layout = __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)
+        # A line that lies in no layout cannot be judged as indented: stop there.
+        if layout is not None and line['bbox'][0] > layout[0]:
+            may_list_lines.append(line)
+        else:
+            break
+    # Equal indentation => they continue the last list item of the previous page.
+    if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
+        # Mark everything merged into the previous page as crossing the page break.
+        for line in may_list_lines:
+            for span in line["spans"]:
+                span[CROSS_PAGE] = True
+        pre_page_last_para["lines"].extend(may_list_lines)
+        next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
+        return True
+
+    return False
+
+
+def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
+    """
+    Return the first layout bbox that contains ``line_bbox``, or None when there is none.
+    """
+    return next((candidate for candidate in layout_bboxes if is_in_layout(line_bbox, candidate)), None)
+
+
+def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
+    """
+    Join paragraphs that run across two consecutive layout boxes.
+
+    The last line of the previous layout and the first line of the next layout
+    are joined when all of these hold:
+    1. the previous line reaches the right edge of its layout and does not end
+       with a character from ``LINE_STOP_FLAG``;
+    2. the next line starts at the left edge of its layout.
+    On a join the lines of the next layout's first block are moved to the
+    previous layout's last block and the emptied block is flagged with
+    ``LINES_DELETED``.
+
+    Returns the list of layout groups; empty groups after the first are dropped.
+    """
+    merged_groups = []
+    if len(blocks_group) == 0:
+        return merged_groups
+
+    merged_groups.append(blocks_group[0])
+    for i in range(1, len(blocks_group)):
+        prev_group = blocks_group[i - 1]
+        cur_group = blocks_group[i]
+        try:
+            if len(cur_group) == 0:
+                continue
+            if len(prev_group) == 0:  # TODO: think about joining in this case
+                merged_groups.append(cur_group)
+                continue
+            tail_block = prev_group[-1]
+            head_block = cur_group[0]
+            # Only text blocks are candidates for a cross-layout join.
+            if tail_block["type"] != BlockType.Text or head_block["type"] != BlockType.Text:
+                merged_groups.append(cur_group)
+                continue
+            if len(tail_block["lines"]) == 0 or len(head_block["lines"]) == 0:
+                merged_groups.append(cur_group)
+                continue
+            pre_last_line = tail_block["lines"][-1]
+            next_first_line = head_block["lines"][0]
+        except Exception:
+            logger.error(f"page layout {i} has no line")
+            continue
+
+        pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
+        pre_last_line_type = pre_last_line['spans'][-1]['type']
+        next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
+        next_first_line_type = next_first_line['spans'][0]['type']
+        joinable_types = [TEXT, INLINE_EQUATION]
+        if pre_last_line_type not in joinable_types or next_first_line_type not in joinable_types:
+            merged_groups.append(cur_group)
+            continue
+
+        pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
+        next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)
+        # -1 stands in for "no layout found", which can never equal a real edge match below.
+        pre_x2_max = pre_layout[2] if pre_layout else -1
+        next_x0_min = next_layout[0] if next_layout else -1
+
+        pre_last_line_text = pre_last_line_text.strip()
+        next_first_line_text = next_first_line_text.strip()
+        fills_line = pre_last_line['bbox'][2] == pre_x2_max
+        if fills_line and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG \
+                and next_first_line['bbox'][0] == next_x0_min:
+            # Join: move the lines of the next layout's first block onto the
+            # previous layout's last block and mark the emptied block.
+            merged_groups[-1][-1]["lines"].extend(head_block["lines"])
+            head_block["lines"] = []
+            head_block[LINES_DELETED] = True
+        # Joined or not, the current layout group stays in the result.
+        merged_groups.append(cur_group)
+    return merged_groups
+
+
+def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num,
+                              lang):
+    """
+    Join the last paragraph of one page with the first paragraph of the next page.
+
+    The paragraphs are joined when:
+    1. the last line of the previous page reaches the right edge of its layout
+       and does not end with a character from ``LINE_STOP_FLAG``;
+    2. the first line of the next page starts at the left edge of its layout.
+    On a join the spans of the moved lines are tagged with ``CROSS_PAGE``, the
+    lines are appended to the previous page's last block, and the emptied block
+    of the next page is flagged with ``LINES_DELETED``.
+
+    Args:
+        pre_page_paras / next_page_paras: per-layout block lists of both pages.
+        pre_page_layout_bbox / next_page_layout_bbox: layout bboxes of both pages.
+        page_num, lang: unused here.
+
+    Returns:
+        True when the paragraphs were joined, otherwise False.
+    """
+    # Some pages have no text at all. The blocks used below are the LAST block of
+    # the previous page and the FIRST block of the next page, so those are the
+    # groups that must be non-empty.
+    if len(pre_page_paras) == 0 or len(next_page_paras) == 0:
+        return False
+    if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
+        return False
+    pre_last_block = pre_page_paras[-1][-1]
+    next_first_block = next_page_paras[0][0]
+    if pre_last_block["type"] != BlockType.Text or next_first_block["type"] != BlockType.Text:
+        return False
+    if len(pre_last_block["lines"]) == 0 or len(next_first_block["lines"]) == 0:
+        return False
+    pre_last_para = pre_last_block["lines"]
+    next_first_para = next_first_block["lines"]
+    pre_last_line = pre_last_para[-1]
+    next_first_line = next_first_para[0]
+    pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
+    pre_last_line_type = pre_last_line['spans'][-1]['type']
+    next_first_line_type = next_first_line['spans'][0]['type']
+
+    # TODO: a complete solution must also handle paragraphs interrupted by tables, images or interline equations.
+    if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
+        # Not text: do not join.
+        return False
+
+    pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)
+    if not pre_x2_max_bbox:
+        return False
+    next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)
+    if not next_x0_min_bbox:
+        return False
+
+    pre_x2_max = pre_x2_max_bbox[2]
+    next_x0_min = next_x0_min_bbox[0]
+
+    pre_last_line_text = pre_last_line_text.strip()
+    # An empty last line has no final character to test; treat it as not joinable,
+    # matching the guard used when joining across layout boxes.
+    if not pre_last_line_text:
+        return False
+    if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
+            next_first_line['bbox'][0] == next_x0_min:
+        # Join: tag everything that moves to the previous page as crossing the page break.
+        for line in next_first_para:
+            for span in line["spans"]:
+                span[CROSS_PAGE] = True
+        pre_last_para.extend(next_first_para)
+
+        # The first block of the next page was merged away; keep the block but empty it.
+        next_first_block["lines"] = []
+        next_first_block[LINES_DELETED] = True
+        return True
+    return False
+
+
+def find_consecutive_true_regions(input_array):
+    """
+    Find runs of at least two consecutive truthy values.
+
+    Returns a list of inclusive ``(start, end)`` index pairs, in order of appearance.
+    """
+    regions = []
+    run_start = None  # index where the current truthy run began, None when outside a run
+
+    for idx, flag in enumerate(input_array):
+        if flag:
+            if run_start is None:
+                run_start = idx
+        elif run_start is not None:
+            # The run just ended at idx - 1; keep it only if it spans more than one element.
+            if idx - run_start > 1:
+                regions.append((run_start, idx - 1))
+            run_start = None
+
+    # A run that reaches the end of the input is closed here.
+    total = len(input_array)
+    if run_start is not None and total - run_start > 1:
+        regions.append((run_start, total - 1))
+
+    return regions
+
+
+def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
+    """
+    Merge consecutive centered single-line text blocks into one paragraph.
+
+    Within each layout, runs of two or more consecutive text blocks that consist
+    of exactly one line are examined. A run is merged when
+    1. the standard deviation of the line heights is below 2,
+    2. every line spans the horizontal center of the layout box, and
+    3. not all lines start at the layout's left edge and not all end at its
+       right edge (so there is blank space on both sides).
+    On a merge the first block of the run receives all lines; the other blocks
+    get an empty line list and the ``LINES_DELETED`` flag.
+
+    Args:
+        page_paras: per-layout block lists of one page (modified in place).
+        new_layout_bbox: layout bboxes, indexed in parallel with ``page_paras``.
+            NOTE(review): this assumes both lists are index-aligned - confirm
+            against the producer of ``page_paras``.
+        page_num, lang: unused here.
+
+    Returns:
+        None.
+    """
+    global debug_able
+
+    for layout_i, layout_para in enumerate(page_paras):
+        layout_box = new_layout_bbox[layout_i]
+        single_line_paras_tag = [
+            blk['type'] == BlockType.Text and len(blk["lines"]) == 1 for blk in layout_para
+        ]
+        # Runs of consecutive single-line text blocks are the merge candidates.
+        for start, end in find_consecutive_true_regions(single_line_paras_tag):
+            run_lines = [blk["lines"][0] for blk in layout_para[start:end + 1]]
+            line_hi = np.array([ln['bbox'][3] - ln['bbox'][1] for ln in run_lines])
+            if debug_able:
+                logger.info(line_hi.std())
+            if not line_hi.std() < 2:
+                # Line heights differ too much to be one paragraph.
+                continue
+
+            all_left_x0 = [ln['bbox'][0] for ln in run_lines]
+            all_right_x1 = [ln['bbox'][2] for ln in run_lines]
+            layout_center = (layout_box[0] + layout_box[2]) / 2
+            centered = all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \
+                and not all([x0 == layout_box[0] for x0 in all_left_x0]) \
+                and not all([x1 == layout_box[2] for x1 in all_right_x1])
+            if not centered:
+                continue
+
+            if debug_able:
+                para_text = ''.join([__get_span_text(span) for line in run_lines for span in line['spans']])
+                logger.info(para_text)
+            layout_para[start]["lines"] = run_lines
+            for i_para in range(start + 1, end + 1):
+                layout_para[i_para]["lines"] = []
+                layout_para[i_para][LINES_DELETED] = True
+
+    return
+
+
+def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
+    """
+    Placeholder (does nothing yet). Intended to find consecutive single-line
+    paragraphs where the first line is flush-left and the following ones share
+    an indentation, and merge them into one paragraph.
+    """
+    return None
+
+
+def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
+    """
+    Split one page into paragraphs based on its lines and layout boxes.
+
+    Steps:
+    1. group the blocks by layout box;
+    2. detect lists inside each layout box;
+    3. continue lists across neighbouring layout boxes;
+    4. join paragraphs across neighbouring layout boxes.
+    Figures and tables currently occupy a line of their own and are not split.
+
+    Returns ``(connected_layout_blocks, page_list_info)``.
+    """
+    grouped = __group_line_by_layout(blocks, layout_bboxes)
+    per_layout_list_info = __split_para_in_layoutbox(grouped, new_layout_bbox, lang)
+    grouped, page_list_info = __connect_list_inter_layout(
+        grouped, new_layout_bbox, per_layout_list_info, page_num, lang)
+    return __connect_para_inter_layoutbox(grouped, new_layout_bbox), page_list_info
+
+
+def para_split(pdf_info_dict, debug_mode, lang="en"):
+    """
+    Split every page of ``pdf_info_dict`` into paragraphs, in place.
+
+    For each page a deep copy of ``page['preproc_blocks']`` is aligned and split
+    per layout; paragraphs and lists are then joined across page breaks, centered
+    single-line runs are merged, and finally ``page['para_blocks']`` is flattened
+    from per-layout lists into one list of blocks.
+
+    Args:
+        pdf_info_dict: mapping of page key to page dict.
+        debug_mode: stored in the module-level ``debug_able`` flag.
+        lang: language code forwarded to the helpers.
+    """
+    global debug_able
+    debug_able = debug_mode
+    layouts_per_page = []  # one list of aligned layout bboxes per page
+    list_info_per_page = []  # per page: [starts with a list, ends with a list]
+    for page_key, page in pdf_info_dict.items():
+        working_blocks = copy.deepcopy(page['preproc_blocks'])
+        raw_layouts = page['layout_bboxes']
+        aligned_layouts = __common_pre_proc(working_blocks, raw_layouts)
+        layouts_per_page.append(aligned_layouts)
+        grouped_blocks, page_list_info = __do_split_page(working_blocks, raw_layouts, aligned_layouts, page_key, lang)
+        list_info_per_page.append(page_list_info)
+        page['para_blocks'] = grouped_blocks
+
+    # Join paragraphs and lists that continue from one page onto the next.
+    pdf_infos = list(pdf_info_dict.values())
+    for page_num in range(1, len(pdf_infos)):
+        prev_paras = pdf_infos[page_num - 1]['para_blocks']
+        cur_paras = pdf_infos[page_num]['para_blocks']
+        prev_layouts = layouts_per_page[page_num - 1]
+        cur_layouts = layouts_per_page[page_num]
+
+        is_conn = __connect_para_inter_page(prev_paras, cur_paras, prev_layouts, cur_layouts, page_num, lang)
+        if debug_able and is_conn:
+            logger.info(f"连接了第{page_num - 1}页和第{page_num}页的段落")
+
+        is_list_conn = __connect_list_inter_page(prev_paras, cur_paras, prev_layouts, cur_layouts,
+                                                 list_info_per_page[page_num - 1],
+                                                 list_info_per_page[page_num], page_num, lang)
+        if debug_able and is_list_conn:
+            logger.info(f"连接了第{page_num - 1}页和第{page_num}页的列表段落")
+
+    # Pick up merges the steps above can miss:
+    # 1. a flush-left line followed by several indented lines in body text;
+    # 2. consecutive centered single lines of equal height that form one paragraph.
+    for page_num, page in enumerate(pdf_infos):
+        __connect_middle_align_text(page['para_blocks'], layouts_per_page[page_num], page_num, lang)
+        __merge_signle_list_text(page['para_blocks'], layouts_per_page[page_num], page_num, lang)
+
+    # Flatten the per-layout lists into one list of blocks per page.
+    for page in pdf_infos:
+        page["para_blocks"] = [blk for layout_blocks in page['para_blocks'] for blk in layout_blocks]
--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
+import copy
+
+from loguru import logger
+
+from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+
+# Sentence-ending and closing punctuation, in both ASCII and full-width forms.
+# NOTE(review): usage is outside this chunk - presumably a line ending with one
+# of these characters is treated as the end of a paragraph; confirm.
+LINE_STOP_FLAG = ('.', '!', '?', '。', '！', '？', ')', '）', '"', '”', ':', '：', ';', '；')
+# Period and semicolon, in both ASCII and full-width forms.
+# NOTE(review): presumably the characters that may terminate a list item; confirm against the callers.
+LIST_END_FLAG = ('.', '。', ';', '；')
+
+
+class ListLineTag:
+    """String constants naming the list-start and list-end line markers.
+
+    NOTE(review): where these keys are written and read is outside this chunk -
+    presumably they are set on line dicts during list detection; confirm.
+    """
+    IS_LIST_START_LINE = "is_list_start_line"
+    IS_LIST_END_LINE = "is_list_end_line"
+
+
+def __process_blocks(blocks):
+    """
+    Pre-process all blocks of a page.
+
+    1. Text blocks are collected into groups; a group is closed whenever the
+       block that follows is a ``title`` or ``interline_equation`` (the group
+       is closed even when it is empty at that point).
+    2. Every text block gets a ``bbox_fs``: a copy of its ``bbox``, replaced by
+       the bounding box of its lines when it has any.
+
+    Returns the list of groups; non-text blocks are not part of any group.
+    """
+    boundary_types = ('title', 'interline_equation')
+    groups = []
+    pending = []
+    last_index = len(blocks) - 1
+
+    for idx, blk in enumerate(blocks):
+        if blk['type'] == 'text':
+            blk["bbox_fs"] = copy.deepcopy(blk["bbox"])
+            if len(blk.get('lines', ())) > 0:
+                # Tighten the box to the extent actually covered by the lines.
+                line_boxes = [ln['bbox'] for ln in blk['lines']]
+                blk['bbox_fs'] = [min(box[0] for box in line_boxes),
+                                  min(box[1] for box in line_boxes),
+                                  max(box[2] for box in line_boxes),
+                                  max(box[3] for box in line_boxes)]
+            pending.append(blk)
+
+        # A following title / interline equation closes the current group.
+        if idx < last_index and blocks[idx + 1]['type'] in boundary_types:
+            groups.append(pending)
+            pending = []
+
+    # Whatever is still pending forms the final group.
+    if pending:
+        groups.append(pending)
+
+    return groups
+
+
+def __is_list_or_index_block(block):
+    """Classify a text block as list, index or plain text and tag its lines.
+
+    A block is treated as a list block when it has at least two lines, at
+    least two of them start flush with the left edge of the block, and one of
+    the following holds:
+      * at least two lines stop well short of the right edge (ragged right),
+      * at least 80% of the lines end with a character from LIST_END_FLAG,
+      * at least two lines are indented from the left edge.
+
+    An index block is a special list block: at least 80% of the lines are
+    flush with the left edge or with the right edge, and at least 80% of the
+    lines start with a digit or end with a digit.
+
+    For list and index blocks the line dicts are tagged in place with
+    ListLineTag.IS_LIST_START_LINE / ListLineTag.IS_LIST_END_LINE.
+
+    Parameters
+    ----------
+    block : dict
+        Block with ``lines`` (each having ``bbox`` and ``spans``) and ``bbox_fs``.
+
+    Returns
+    -------
+    BlockType.Index, BlockType.List or BlockType.Text
+    """
+    lines = block['lines']
+    line_count = len(lines)
+    if line_count < 2:
+        return BlockType.Text
+
+    def ends_with_list_flag(text):
+        # A line without text spans yields '' and can never be an item end;
+        # indexing it directly would raise IndexError.
+        return len(text) > 0 and text[-1] in LIST_END_FLAG
+
+    def starts_with_digit(text):
+        return len(text) > 0 and text[0].isdigit()
+
+    first_line = lines[0]
+    last_line = lines[-1]
+    line_height = first_line['bbox'][3] - first_line['bbox'][1]
+    block_left = block['bbox_fs'][0]
+    block_right = block['bbox_fs'][2]
+    block_weight = block_right - block_left
+
+    # First line indented on the left while the last line is flush left and
+    # stops short on the right: this looks like ordinary paragraphs, not a list.
+    # (The first line is allowed to stop short on the right as well.)
+    multiple_para_flag = (
+        first_line['bbox'][0] - block_left > line_height / 2
+        and abs(last_line['bbox'][0] - block_left) < line_height / 2
+        and block_right - last_line['bbox'][2] > line_height
+    )
+
+    left_close_num = 0
+    left_not_close_num = 0
+    right_not_close_num = 0
+    right_close_num = 0
+    lines_text_list = []
+
+    for line in lines:
+        line_text = "".join(
+            span['content'].strip() for span in line['spans'] if span['type'] == ContentType.Text
+        )
+        lines_text_list.append(line_text)
+
+        # Left side: flush when within half a line height of the block edge,
+        # indented when more than one line height away from it.
+        left_gap = line['bbox'][0] - block_left
+        if abs(left_gap) < line_height / 2:
+            left_close_num += 1
+        elif left_gap > line_height:
+            left_not_close_num += 1
+
+        # Right side: flush when within one line height of the block edge.
+        right_gap = block_right - line['bbox'][2]
+        if abs(right_gap) < line_height:
+            right_close_num += 1
+        elif right_gap > 0.3 * block_weight:
+            # Not flush and clearly short; 0.3 of the block width is an
+            # empirical threshold.
+            right_not_close_num += 1
+
+    flag_end_count = sum(1 for text in lines_text_list if ends_with_list_flag(text))
+    num_start_count = sum(1 for text in lines_text_list if starts_with_digit(text))
+    num_end_count = sum(1 for text in lines_text_list if len(text) > 0 and text[-1].isdigit())
+
+    # At least 80% of the lines end with a list end flag.
+    line_end_flag = flag_end_count / line_count >= 0.8
+    # At least 80% of the lines start with a digit, or at least 80% end with one.
+    line_num_flag = num_start_count / line_count >= 0.8 or num_end_count / line_count >= 0.8
+
+    # Some tables of contents are not flush on the right, so an index only
+    # needs one fully flush side plus the digit rule.
+    if (left_close_num / line_count >= 0.8 or right_close_num / line_count >= 0.8) and line_num_flag:
+        for line in lines:
+            line[ListLineTag.IS_LIST_START_LINE] = True
+        return BlockType.Index
+
+    is_list = (
+        left_close_num >= 2
+        and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2)
+        and not multiple_para_flag
+    )
+    if not is_list:
+        return BlockType.Text
+
+    if left_close_num / line_count > 0.9:
+        # Special list without indentation: (almost) every line is flush left,
+        # so item boundaries have to come from the right side or the end flags.
+        if flag_end_count == 0 and right_close_num / line_count < 0.5:
+            # Short items, one per line, all flush left.
+            for line in lines:
+                if abs(block_left - line['bbox'][0]) < line_height / 2:
+                    line[ListLineTag.IS_LIST_START_LINE] = True
+        elif line_end_flag:
+            # Most items carry an end flag: split the items on those flags.
+            for i, line in enumerate(lines):
+                if ends_with_list_flag(lines_text_list[i]):
+                    line[ListLineTag.IS_LIST_END_LINE] = True
+                    if i + 1 < line_count:
+                        lines[i + 1][ListLineTag.IS_LIST_START_LINE] = True
+        else:
+            # Neither end flags nor indentation: a line that stops short on the
+            # right ends an item and the next line starts a new one. Both checks
+            # run for every line, so a one-line item is tagged as start AND end
+            # instead of swallowing the item that follows it.
+            line_start_flag = False
+            for line in lines:
+                if line_start_flag:
+                    line[ListLineTag.IS_LIST_START_LINE] = True
+                    line_start_flag = False
+                if abs(block_right - line['bbox'][2]) > line_height:
+                    line[ListLineTag.IS_LIST_END_LINE] = True
+                    line_start_flag = True
+    elif num_start_count >= 2 and num_start_count == flag_end_count:
+        # Indented ordered list: start lines begin with a digit, end lines end
+        # with a list end flag, and both occur equally often. (Whether the
+        # start line is flush left is deliberately not checked, to keep it simple.)
+        for i, line in enumerate(lines):
+            if starts_with_digit(lines_text_list[i]):
+                line[ListLineTag.IS_LIST_START_LINE] = True
+            if ends_with_list_flag(lines_text_list[i]):
+                line[ListLineTag.IS_LIST_END_LINE] = True
+    else:
+        # Regular indented list: flush-left lines start an item, lines that
+        # stop short on the right end one.
+        for line in lines:
+            if abs(block_left - line['bbox'][0]) < line_height / 2:
+                line[ListLineTag.IS_LIST_START_LINE] = True
+            if abs(block_right - line['bbox'][2]) > line_height:
+                line[ListLineTag.IS_LIST_END_LINE] = True
+
+    return BlockType.List
+
+
+def __merge_2_text_blocks(block1, block2):
+    """Append the lines of block1 to block2 when block1 continues block2's paragraph.
+
+    block1 is the later block, block2 the one before it. The merge happens only if
+      * both blocks have lines,
+      * the first line of block1 starts flush with block1's left edge,
+      * the last line of block2 has spans and reaches block2's right edge,
+      * the last span of block2 does not end with a LINE_STOP_FLAG character,
+      * the wider block is less than twice as wide as the narrower one.
+
+    On merge, block1's lines are moved into block2, block1 is left with an
+    empty ``lines`` list and ``LINES_DELETED`` set; when the two blocks are on
+    different pages every moved span is flagged with ``CROSS_PAGE``.
+
+    Returns
+    -------
+    (block1, block2) : the same two dicts, modified in place.
+    """
+    # Nothing to move, or no last line in block2 to continue from
+    # (indexing block2['lines'][-1] on an empty list would raise IndexError).
+    if len(block1['lines']) == 0 or len(block2['lines']) == 0:
+        return block1, block2
+
+    first_line = block1['lines'][0]
+    line_height = first_line['bbox'][3] - first_line['bbox'][1]
+    block1_weight = block1['bbox'][2] - block1['bbox'][0]
+    block2_weight = block2['bbox'][2] - block2['bbox'][0]
+    min_block_weight = min(block1_weight, block2_weight)
+
+    # An indented first line means block1 opens a new paragraph.
+    if not abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height / 2:
+        return block1, block2
+
+    last_line = block2['lines'][-1]
+    if len(last_line['spans']) == 0:
+        return block1, block2
+
+    last_span = last_line['spans'][-1]
+    line_height = last_line['bbox'][3] - last_line['bbox'][1]
+    if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height and
+            not last_span['content'].endswith(LINE_STOP_FLAG) and
+            # Do not merge when one block is at least twice as wide as the other.
+            abs(block1_weight - block2_weight) < min_block_weight
+    ):
+        if block1['page_num'] != block2['page_num']:
+            for line in block1['lines']:
+                for span in line['spans']:
+                    span[CROSS_PAGE] = True
+        block2['lines'].extend(block1['lines'])
+        block1['lines'] = []
+        block1[LINES_DELETED] = True
+
+    return block1, block2
+
+
+def __merge_2_list_blocks(block1, block2):
+    """Unconditionally move all lines of block1 to the end of block2.
+
+    When the blocks sit on different pages, every span being moved is flagged
+    with ``CROSS_PAGE``. Afterwards block1 has an empty ``lines`` list and
+    ``LINES_DELETED`` set. Both dicts are modified in place and returned as
+    ``(block1, block2)``.
+    """
+    crosses_page = block1['page_num'] != block2['page_num']
+    moved_lines = block1['lines']
+
+    if crosses_page:
+        for moved in moved_lines:
+            for span in moved['spans']:
+                span[CROSS_PAGE] = True
+
+    block2['lines'].extend(moved_lines)
+    block1['lines'] = []
+    block1[LINES_DELETED] = True
+
+    return block1, block2
+
+
+def __is_list_group(text_blocks_group):
+    """Return True when no block in the group has more than three lines.
+
+    A second rule (left edges of all blocks being close to each other) is
+    intentionally not applied, to keep the logic simple.
+    """
+    return all(len(block['lines']) <= 3 for block in text_blocks_group)
+
+
+def __para_merge_page(blocks):
+    """Classify the text blocks and merge neighbouring blocks inside each group.
+
+    Blocks are grouped with __process_blocks. Every block of a group gets its
+    ``type`` replaced by the result of __is_list_or_index_block. Within a group
+    of two or more blocks, each block is then paired with its predecessor,
+    walking from the last block to the first:
+      * two text blocks are merged with __merge_2_text_blocks, unless the whole
+        group is a list group;
+      * two list blocks, or two index blocks, are merged with __merge_2_list_blocks.
+
+    All changes are made in place; nothing is returned.
+    """
+    for group in __process_blocks(blocks):
+        # The classification has to be done before anything is merged.
+        for member in group:
+            member['type'] = __is_list_or_index_block(member)
+
+        if len(group) <= 1:
+            continue
+
+        # Decided once, before merging changes the line counts.
+        is_list_group = __is_list_group(group)
+
+        # Reverse order; index 0 has no predecessor and is therefore skipped.
+        for idx in range(len(group) - 1, 0, -1):
+            current_block = group[idx]
+            prev_block = group[idx - 1]
+
+            if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group:
+                __merge_2_text_blocks(current_block, prev_block)
+            elif (
+                    (current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or
+                    (current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index)
+            ):
+                __merge_2_list_blocks(current_block, prev_block)
+
+
+def para_split(pdf_info_dict, debug_mode=False):
+    """Build ``para_blocks`` for every page of the document.
+
+    The ``preproc_blocks`` of all pages are deep-copied, tagged with their
+    ``page_num`` and processed together by __para_merge_page, so paragraphs
+    and lists can be merged across page boundaries. Each block is then handed
+    back to the page it came from.
+
+    Parameters
+    ----------
+    pdf_info_dict : dict
+        Maps a page key to a page dict containing ``preproc_blocks``. Each page
+        dict receives a new ``para_blocks`` list; ``preproc_blocks`` is left untouched.
+    debug_mode : bool
+        Accepted for interface compatibility; not used by this function.
+    """
+    all_blocks = []
+    for page_num, page in pdf_info_dict.items():
+        page_blocks = copy.deepcopy(page['preproc_blocks'])
+        for block in page_blocks:
+            block['page_num'] = page_num
+        all_blocks.extend(page_blocks)
+
+    __para_merge_page(all_blocks)
+
+    # Distribute the blocks in one pass instead of rescanning all blocks for
+    # every page; the order of the blocks inside a page is preserved.
+    blocks_by_page = {page_num: [] for page_num in pdf_info_dict}
+    for block in all_blocks:
+        blocks_by_page[block['page_num']].append(block)
+
+    for page_num, page in pdf_info_dict.items():
+        page['para_blocks'] = blocks_by_page[page_num]
+
+
+if __name__ == '__main__':
+    # Manual smoke run: group an (empty) list of blocks and print every resulting group.
+    input_blocks = []
+    # Call the grouping function
+    groups = __process_blocks(input_blocks)
+    for group_index, group in enumerate(groups):
+        print(f"Group {group_index}: {group}")
--- a/magic_pdf/para/raw_processor.py
+++ b/magic_pdf/para/raw_processor.py
+class RawBlockProcessor:
+    """Turn the raw ``preproc_blocks`` of every page into ``para_blocks``.
+
+    Raw lines that share (within a tolerance) the same top and bottom
+    coordinates are combined into one line, and every span is annotated with
+    a readable decomposition of its font flags.
+    """
+
+    def __init__(self) -> None:
+        # Largest difference of the top and of the bottom coordinate for which
+        # two consecutive raw lines still count as the same line.
+        self.y_tolerance = 2
+        self.pdf_dic = {}
+
+    def __span_flags_decomposer(self, span_flags):
+        """
+        Make font flags human readable.
+
+        Parameters
+        ----------
+        span_flags : int
+            span flags (bit field)
+
+        Returns
+        -------
+        flags : dict
+            decomposed flags, one boolean per style
+        """
+        is_serifed = bool(span_flags & 2**2)
+        is_monospaced = bool(span_flags & 2**3)
+
+        return {
+            "is_superscript": bool(span_flags & 2**0),  # bit 0: superscript
+            "is_italic": bool(span_flags & 2**1),  # bit 1: italic
+            "is_serifed": is_serifed,  # bit 2: serif font
+            "is_sans_serifed": not is_serifed,  # bit 2 cleared: sans-serif font
+            "is_monospaced": is_monospaced,  # bit 3: monospaced font
+            "is_proportional": not is_monospaced,  # bit 3 cleared: proportional font
+            "is_bold": bool(span_flags & 2**4),  # bit 4: bold
+        }
+
+    def __make_new_lines(self, raw_lines):
+        """
+        Combine raw lines that lie on the same row into new lines.
+
+        Every span of every raw line gets a ``decomposed_flags`` entry (the span
+        dicts are modified in place). Consecutive raw lines whose top and bottom
+        differ by at most ``self.y_tolerance`` from the line being built are
+        merged into it: the bbox is widened, texts are joined with a space, the
+        spans are concatenated and the directions are added component-wise.
+
+        Parameters
+        ----------
+        raw_lines : list
+            raw lines, each with ``bbox``, ``spans`` and optionally ``dir``
+
+        Returns
+        -------
+        new_lines : list
+            new lines, each with ``bbox``, ``text``, ``dir`` and ``spans``
+        """
+        new_lines = []
+        new_line = None
+
+        for raw_line in raw_lines:
+            raw_line_bbox = raw_line["bbox"]
+            raw_line_spans = raw_line["spans"]
+            raw_line_text = "".join(span["text"] for span in raw_line_spans)
+            # A missing direction counts as (0, 0) for new AND for merged lines;
+            # adding the components of None would raise TypeError.
+            raw_line_dir = raw_line.get("dir") or (0, 0)
+
+            for span in raw_line_spans:
+                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])
+
+            on_same_row = (
+                new_line is not None
+                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
+                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
+            )
+
+            if on_same_row:
+                new_line["bbox"] = (
+                    min(new_line["bbox"][0], raw_line_bbox[0]),  # left
+                    new_line["bbox"][1],  # top
+                    max(new_line["bbox"][2], raw_line_bbox[2]),  # right
+                    raw_line_bbox[3],  # bottom
+                )
+                new_line["text"] += " " + raw_line_text
+                new_line["spans"].extend(raw_line_spans)
+                new_line["dir"] = (
+                    new_line["dir"][0] + raw_line_dir[0],
+                    new_line["dir"][1] + raw_line_dir[1],
+                )
+            else:
+                if new_line is not None:
+                    new_lines.append(new_line)
+                new_line = {
+                    "bbox": raw_line_bbox,
+                    "text": raw_line_text,
+                    "dir": raw_line_dir,
+                    # Own list, so that merging further lines into this one
+                    # does not grow the span list of the raw input line.
+                    "spans": list(raw_line_spans),
+                }
+
+        if new_line is not None:
+            new_lines.append(new_line)
+
+        return new_lines
+
+    def __make_new_block(self, raw_block):
+        """
+        This function makes a new block.
+
+        Parameters
+        ----------
+        raw_block : dict
+            a raw block with ``number``, ``bbox`` and ``lines``
+
+        Returns
+        -------
+        new_block : dict
+
+        Schema of new_block:
+        {
+            "block_id": "block_1",
+            "bbox": [0, 0, 100, 100],
+            "text": "This is a block.",
+            "lines": [
+                {
+                    "bbox": [0, 0, 100, 100],
+                    "text": "This is a line.",
+                    "spans": [
+                        {
+                            "text": "This is a span.",
+                            "font": "Times New Roman",
+                            "size": 12,
+                            "color": "#000000",
+                        }
+                    ],
+                }
+            ],
+        }
+        """
+        raw_lines = raw_block["lines"]
+        # The block text joins ALL span texts with a space, independent of
+        # how the lines get merged afterwards.
+        block_text = " ".join(span["text"] for line in raw_lines for span in line["spans"])
+
+        return {
+            "block_id": raw_block["number"],
+            "bbox": raw_block["bbox"],
+            "text": block_text,
+            "lines": self.__make_new_lines(raw_lines),
+        }
+
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            Maps keys to page dicts. Only keys starting with "page_" are
+            processed; the raw blocks are read from the page's "preproc_blocks".
+
+        Returns
+        -------
+        pdf_dic : dict
+            The same dictionary; every processed page has a "para_blocks" list
+            (empty when the page has no "preproc_blocks").
+        """
+        for page_id, blocks in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            para_blocks = []
+            if "preproc_blocks" in blocks.keys():
+                for raw_block in blocks["preproc_blocks"]:
+                    para_blocks.append(self.__make_new_block(raw_block))
+
+            blocks["para_blocks"] = para_blocks
+
+        return pdf_dic
+
--- a/magic_pdf/para/stats.py
+++ b/magic_pdf/para/stats.py
+from collections import Counter
+import numpy as np
+
+from magic_pdf.para.commons import *
+
+
+# Force UTF-8 output. TextIOWrapper.reconfigure only exists on Python 3.7+ and
+# a replaced sys.stdout may not provide it at all, so the call is skipped when
+# unavailable instead of failing the import with AttributeError.
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class BlockStatisticsCalculator:
+    """Attach layout and font statistics to every block in ``para_blocks``."""
+
+    def __init__(self) -> None:
+        pass
+
+    def __calc_stats_of_new_lines(self, new_lines):
+        """
+        This function calculates the paragraph metrics
+
+        Parameters
+        ----------
+        new_lines : list
+            lines of one block, each with ``bbox``, ``text``, ``spans`` and
+            optionally ``dir``; spans need ``size``, ``bbox`` and ``font``
+
+        Returns
+        -------
+        X0 : float
+            Median of the left coordinates of the lines (0 without lines)
+        X1 : float
+            Median of the right coordinates of the lines (0 without lines)
+        avg_char_width : float
+            Mean over the lines of line width / number of non-space characters
+        avg_char_height : float
+            Mean span size over ALL lines of the block (0 without spans)
+        max_freq_font_type : str or None
+            Font of the widest span of the block
+        avg_font_size : float or None
+            Mean span size (None without spans)
+        direction : tuple
+            Component-wise mean of the line directions
+        median_font_size : float or None
+            Median span size (None without spans)
+        """
+        x0_values = []
+        x1_values = []
+        char_widths = []
+        char_heights = []
+        block_font_sizes = []
+        block_directions = []
+
+        max_span_length = 0
+        max_span_font_type = None
+
+        for line in new_lines:
+            line_bbox = line["bbox"]
+            line_spans = line["spans"]
+
+            x0_values.append(line_bbox[0])
+            x1_values.append(line_bbox[2])
+
+            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
+            if num_chars > 0:
+                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)
+
+            for span in line_spans:
+                block_font_sizes.append(span["size"])
+                # Accumulate over all lines; assigning a fresh list per line
+                # would leave only the last line in the average.
+                char_heights.append(span["size"])
+
+                # The font of the widest span represents the block.
+                span_length = span["bbox"][2] - span["bbox"][0]
+                if span_length > max_span_length:
+                    max_span_length = span_length
+                    max_span_font_type = span["font"]
+
+            if "dir" in line:
+                block_directions.append(line["dir"])
+
+        X0 = np.median(x0_values) if x0_values else 0
+        X1 = np.median(x1_values) if x1_values else 0
+        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
+        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
+
+        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
+        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
+
+        if block_directions:
+            avg_dir_horizontal = sum(d[0] for d in block_directions) / len(block_directions)
+            avg_dir_vertical = sum(d[1] for d in block_directions) / len(block_directions)
+        else:
+            avg_dir_horizontal = 0
+            avg_dir_vertical = 0
+
+        return (
+            X0,
+            X1,
+            avg_char_width,
+            avg_char_height,
+            max_span_font_type,
+            avg_font_size,
+            (avg_dir_horizontal, avg_dir_vertical),
+            median_font_size,
+        )
+
+    def __make_new_block(self, input_block):
+        """Return a copy of the block's core fields extended with its statistics."""
+        raw_lines = input_block["lines"]
+        (
+            avg_left_boundary,
+            avg_right_boundary,
+            avg_char_width,
+            avg_char_height,
+            font_type,
+            font_size,
+            direction,
+            median_font_size,
+        ) = self.__calc_stats_of_new_lines(raw_lines)
+
+        return {
+            "block_id": input_block["block_id"],
+            "bbox": input_block["bbox"],
+            "text": input_block["text"],
+            "dir": direction,
+            "X0": avg_left_boundary,
+            "X1": avg_right_boundary,
+            "avg_char_width": avg_char_width,
+            "avg_char_height": avg_char_height,
+            "block_font_type": font_type,
+            "block_font_size": font_size,
+            "lines": raw_lines,
+            "median_font_size": median_font_size,
+        }
+
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            Maps keys to page dicts. Only keys starting with "page_" are
+            processed; the blocks are read from the page's "para_blocks".
+
+        Returns
+        -------
+        pdf_dic : dict
+            The same dictionary; "para_blocks" of every processed page is
+            replaced by the list of blocks with statistics (empty when the page
+            had no "para_blocks").
+        """
+        for page_id, blocks in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            para_blocks = []
+            if "para_blocks" in blocks.keys():
+                for input_block in blocks["para_blocks"]:
+                    para_blocks.append(self.__make_new_block(input_block))
+
+            blocks["para_blocks"] = para_blocks
+
+        return pdf_dic
+
+
+class DocStatisticsCalculator:
+    """Compute document-level statistics from the ``para_blocks`` of all pages."""
+
+    def __init__(self) -> None:
+        pass
+
+    def calc_stats_of_doc(self, pdf_dict):
+        """
+        This function computes the statistics of the document
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            Maps keys to page dicts. Only keys starting with "page_" count as
+            pages; a page without "para_blocks" is treated as having no blocks.
+
+        Returns
+        -------
+        pdf_dict : dict
+            The same dictionary with the computed values stored under "statistics".
+        """
+        # Block lists of all pages, gathered once so that every pass below
+        # treats a page without "para_blocks" the same way (as empty) instead of
+        # the counting pass failing with KeyError.
+        blocks_per_page = [
+            page["para_blocks"] if "para_blocks" in page.keys() else []
+            for page_id, page in pdf_dict.items()
+            if page_id.startswith("page_")
+        ]
+
+        total_text_length = 0
+        total_num_blocks = 0
+        for para_blocks in blocks_per_page:
+            for para_block in para_blocks:
+                total_text_length += len(para_block["text"])
+                total_num_blocks += 1
+
+        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
+
+        # Fonts are only collected from blocks with at least half the average
+        # text length, so short blocks do not influence the result.
+        font_list = []
+        for para_blocks in blocks_per_page:
+            for input_block in para_blocks:
+                block_text_length = len(input_block.get("text", ""))
+                if block_text_length < avg_text_length * 0.5:
+                    continue
+                block_font_type = safe_get(input_block, "block_font_type", "")
+                block_font_size = safe_get(input_block, "block_font_size", 0)
+                font_list.append((block_font_type, block_font_size))
+
+        font_counter = Counter(font_list)
+        # Entries have the form ((font_type, font_size), count).
+        most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
+        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
+
+        statistics = {
+            "num_pages": 0,
+            "num_blocks": 0,
+            "num_paras": 0,
+            "num_titles": 0,
+            "num_header_blocks": 0,
+            "num_footer_blocks": 0,
+            "num_watermark_blocks": 0,
+            "num_vertical_margin_note_blocks": 0,
+            "most_common_font_type": most_common_font[0][0],
+            "most_common_font_size": most_common_font[0][1],
+            "number_of_most_common_font": most_common_font[1],
+            "second_most_common_font_type": second_most_common_font[0][0],
+            "second_most_common_font_size": second_most_common_font[0][1],
+            "number_of_second_most_common_font": second_most_common_font[1],
+            "avg_text_length": avg_text_length,
+        }
+
+        for para_blocks in blocks_per_page:
+            statistics["num_pages"] += 1
+            for block_data in para_blocks:
+                statistics["num_blocks"] += 1
+
+                if "paras" in block_data.keys():
+                    statistics["num_paras"] += len(block_data["paras"])
+
+                for line in block_data["lines"]:
+                    if line.get("is_title", 0):
+                        statistics["num_titles"] += 1
+
+                if block_data.get("is_header", 0):
+                    statistics["num_header_blocks"] += 1
+                if block_data.get("is_footer", 0):
+                    statistics["num_footer_blocks"] += 1
+                if block_data.get("is_watermark", 0):
+                    statistics["num_watermark_blocks"] += 1
+                if block_data.get("is_vertical_margin_note", 0):
+                    statistics["num_vertical_margin_note_blocks"] += 1
+
+        pdf_dict["statistics"] = statistics
+
+        return pdf_dict
+
+
--- a/magic_pdf/para/title_processor.py
+++ b/magic_pdf/para/title_processor.py
+import os
+import re
+import numpy as np
+
+from magic_pdf.libs.nlp_utils import NLPModels
+
+from magic_pdf.para.commons import *
+
+# Force UTF-8 output. TextIOWrapper.reconfigure only exists on Python 3.7+ and
+# a replaced sys.stdout may not provide it at all, so the call is skipped when
+# unavailable instead of failing the import with AttributeError.
+if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class TitleProcessor:
+    def __init__(self, *doc_statistics) -> None:
+        """
+        Set up the title processor.
+
+        Parameters
+        ----------
+        *doc_statistics : tuple
+            Optional positional arguments; only the first one is used and it is
+            stored as ``self.doc_statistics``.
+        """
+        # NOTE(review): without arguments ``self.doc_statistics`` is never
+        # assigned — confirm that nothing reads it in that case.
+        if len(doc_statistics) > 0:
+            self.doc_statistics = doc_statistics[0]
+
+        self.nlp_model = NLPModels()
+        self.MAX_TITLE_LEVEL = 3
+        # Pattern for the numbering prefix of a title, followed by the rest of
+        # the title. It contains layout whitespace and "#" comments, so it
+        # presumably has to be compiled with re.VERBOSE where it is used — TODO confirm.
+        self.numbered_title_pattern = r"""
+            ^                                 # 行首
+            (                                 # 开始捕获组
+                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
+                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
+                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
+                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
+                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
+                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
+                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
+                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
+                |[\（|\(][一二三四五六七八九十百千]+[\）|\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
+                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
+                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
+                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
+                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
+                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
+                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
+                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
+                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
+            )                                 # 结束捕获组
+            .+                                # 标题的其余部分
+        """
+
+    def _is_potential_title(
+        self,
+        curr_line,
+        prev_line,
+        prev_line_is_title,
+        next_line,
+        avg_char_width,
+        avg_char_height,
+        median_font_size,
+    ):
+        """
+        This function checks if the line is a potential title.
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        avg_char_width : float
+            average of char widths
+        avg_char_height : float
+            average of line heights
+
+        Returns
+        -------
+        bool
+            True if the line is a potential title, False otherwise.
+        """
+
+        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
+            """
+            This function checks if the line is centered on the page
+
+            Parameters
+            ----------
+            line_bbox : list
+                bbox of the line
+            page_bbox : list
+                bbox of the page
+            avg_char_width : float
+                average of char widths
+
+            Returns
+            -------
+            bool
+                True if the line is centered on the page, False otherwise.
+            """
+            horizontal_ratio = 0.5
+            horizontal_thres = horizontal_ratio * avg_char_width
+
+            x0, _, x1, _ = line_bbox
+            _, _, page_x1, _ = page_bbox
+
+            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
+
+        def __is_bold_font_line(line):
+            """
+            Check if a line contains any bold font style.
+            """
+
+            def _is_bold_span(span):
+                # if span text is empty or only contains space, return False
+                if not span["text"].strip():
+                    return False
+
+                return bool(span["flags"] & 2**4)  # Check if the font is bold
+
+            for span in line["spans"]:
+                if not _is_bold_span(span):
+                    return False
+
+            return True
+
+        def __is_italic_font_line(line):
+            """
+            Check if a line contains any italic font style.
+            """
+
+            def __is_italic_span(span):
+                return bool(span["flags"] & 2**1)  # Check if the font is italic
+
+            for span in line["spans"]:
+                if not __is_italic_span(span):
+                    return False
+
+            return True
+
+        def __is_punctuation_heavy(line_text):
+            """
+            Check whether punctuation makes up a large share of the line, which suggests it is not a title.
+
+            Parameters
+            ----------
+            line_text : str
+                text of the line
+
+            Returns
+            -------
+            bool
+                True if at least 10% of the characters are neither word characters nor whitespace.
+                Lines that start with an "X.Y." style numbering and empty lines always yield False.
+            """
+            # Numbered headings such as "X.Y. Title" are exempt from the punctuation check.
+            if re.match(r"\b\d+\.\d+\..*\b", line_text.strip()):
+                return False
+
+            total_chars = len(line_text)
+            if total_chars == 0:
+                return False
+
+            # Everything that is neither a word character nor whitespace counts as punctuation.
+            punctuation_count = len(re.findall(r"[^\w\s]", line_text))
+
+            return punctuation_count / total_chars >= 0.1
+
+        def __has_mixed_font_styles(spans, strict_mode=False):
+            """
+            Check whether the spans of a line use more than one font.
+
+            Parameters
+            ----------
+            spans : list
+                spans of the line, each providing a "font" name
+            strict_mode : bool
+                True: the lower-cased font names must all be identical, otherwise the line is mixed.
+                False: the line is mixed only if there are several spans whose lower-cased font names
+                share no common prefix at all.
+
+            Returns
+            -------
+            bool
+                True if the line has mixed font styles, False otherwise.
+            """
+            font_names = [span["font"].lower() for span in spans]
+
+            if strict_mode:
+                return len(set(font_names)) > 1
+
+            # Non-strict mode: any shared leading characters are enough to treat the fonts as one family.
+            return len(font_names) > 1 and len(os.path.commonprefix(font_names)) == 0
+
+        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
+            """
+            Check whether the current line's font type differs from both neighbouring lines.
+
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line (compared as given, not lower-cased here)
+            prev_line_font_type : str or None
+                font type of the previous line, None if there is no previous line
+            next_line_font_type : str or None
+                font type of the next line, None if there is no next line
+
+            Returns
+            -------
+            bool
+                False as soon as one existing neighbour has the same (lower-cased) font type, True otherwise.
+                Missing neighbours are ignored, so the result is True when both are None.
+            """
+            for neighbor_font_type in (prev_line_font_type, next_line_font_type):
+                if neighbor_font_type is None:
+                    continue
+                if curr_line_font_type == neighbor_font_type.lower():
+                    return False
+
+            return True
+
+        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
+            """
+            Check whether the current line's font is clearly larger than that of both neighbouring lines.
+
+            Parameters
+            ----------
+            curr_line_font_size : float
+                font size of the current line
+            prev_line_font_size : float or None
+                font size of the previous line, None if there is no previous line
+            next_line_font_size : float or None
+                font size of the next line, None if there is no next line
+
+            Returns
+            -------
+            bool
+                True if the current size exceeds 1.2 times the size of every existing neighbour.
+                Missing neighbours are ignored, so the result is True when both are None.
+            """
+            for neighbor_font_size in (prev_line_font_size, next_line_font_size):
+                if neighbor_font_size is None:
+                    continue
+                if not curr_line_font_size > neighbor_font_size * 1.2:
+                    return False
+
+            return True
+
+        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
+            """
+            Check whether the current line uses the same font type and font size as the previous line.
+
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line
+            prev_line_font_type : str or None
+                font type of the previous line
+            curr_line_font_size : float
+                font size of the current line
+            prev_line_font_size : float or None
+                font size of the previous line
+
+            Returns
+            -------
+            bool
+                True if both the font type and the font size are equal, False otherwise.
+            """
+            same_font_type = curr_line_font_type == prev_line_font_type
+
+            return bool(same_font_type and curr_line_font_size == prev_line_font_size)
+
+        def __is_same_font_type_of_docAvg(curr_line_font_type):
+            """
+            Check whether the current line uses one of the document's two most common font types.
+
+            The document font types are read from self.doc_statistics through safe_get under the keys
+            "most_common_font_type" and "second_most_common_font_type" (with "" passed as the fallback
+            argument). All names are compared lower-cased.
+
+            Parameters
+            ----------
+            curr_line_font_type : str
+                font type of the current line
+
+            Returns
+            -------
+            bool
+                True if the line's font type equals either of the two document font types, False otherwise.
+            """
+            doc_font_types = tuple(
+                safe_get(self.doc_statistics, stat_key, "").lower()
+                for stat_key in ("most_common_font_type", "second_most_common_font_type")
+            )
+
+            return curr_line_font_type.lower() in doc_font_types
+
+        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
+            """
+            Check whether the current line's font size reaches a multiple of the document reference size.
+
+            The reference size is the smaller of the values stored in self.doc_statistics under
+            "most_common_font_size" and "second_most_common_font_size" (read through safe_get with 0
+            passed as the fallback argument).
+
+            Parameters
+            ----------
+            curr_line_font_size : float
+                font size of the current line
+            ratio : float
+                factor applied to the reference size before the comparison
+
+            Returns
+            -------
+            bool
+                True if curr_line_font_size >= reference size * ratio, False otherwise.
+            """
+            common_font_sizes = (
+                safe_get(self.doc_statistics, "most_common_font_size", 0),
+                safe_get(self.doc_statistics, "second_most_common_font_size", 0),
+            )
+            reference_font_size = min(common_font_sizes)
+
+            return curr_line_font_size >= reference_font_size * ratio
+
+        def __is_sufficient_spacing_above_and_below(
+            curr_line_bbox,
+            prev_line_bbox,
+            next_line_bbox,
+            avg_char_height,
+            median_font_size,
+        ):
+            """
+            Check whether the current line is vertically well separated from its neighbouring lines.
+
+            The gap to a neighbour is the smaller of the top-to-top and bottom-to-bottom distances of the
+            two bboxes; it is sufficient when it exceeds 1.25 * median_font_size.
+
+            Parameters
+            ----------
+            curr_line_bbox : list
+                bbox of the current line; indices 1 and 3 are used as the vertical coordinates
+            prev_line_bbox : list or None
+                bbox of the previous line, falsy if there is no previous line
+            next_line_bbox : list or None
+                bbox of the next line, falsy if there is no next line
+            avg_char_height : float
+                accepted for interface compatibility; not used by the computation
+            median_font_size : float
+                font size the spacing threshold is derived from
+
+            Returns
+            -------
+            tuple of (bool, bool)
+                (sufficient spacing above, sufficient spacing below); a missing neighbour counts as sufficient.
+            """
+            spacing_threshold = 1.25 * median_font_size
+
+            _, curr_top, _, curr_bottom = curr_line_bbox
+
+            enough_above = True  # no previous line means nothing crowds the line from above
+            if prev_line_bbox:
+                gap_above = min(curr_top - prev_line_bbox[1], curr_bottom - prev_line_bbox[3])
+                enough_above = gap_above > spacing_threshold
+
+            enough_below = True  # no next line means nothing crowds the line from below
+            if next_line_bbox:
+                gap_below = min(next_line_bbox[1] - curr_top, next_line_bbox[3] - curr_bottom)
+                enough_below = gap_below > spacing_threshold
+
+            return (enough_above, enough_below)
+
+        def __is_word_list_line_by_rules(curr_line_text):
+            """
+            Check by regular expression whether the line contains a name-like word-list entry.
+
+            An entry is either two capitalised Latin words separated by one whitespace character or a run
+            of 2 to 6 CJK characters (not preceded by another CJK character), followed by a comma,
+            semicolon, whitespace or the end of the text.
+
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+
+            Returns
+            -------
+            bool
+                True if at least one such entry is found anywhere in the line, False otherwise.
+            """
+            # Earlier, looser pattern kept for reference:
+            # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[，,;；\s]|$)"
+            name_list_regex = re.compile(
+                r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
+            )
+
+            return name_list_regex.search(curr_line_text) is not None
+
+        # """
+        def __get_text_catgr_by_nlp(curr_line_text):
+            """
+            Ask the NLP model attached to this instance for the entity category of the line.
+
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+
+            Returns
+            -------
+            object
+                Whatever self.nlp_model.detect_entity_catgr_using_nlp returns for the text, passed through
+                unchanged. NOTE(review): presumably an entity label string; confirm against the NLP helper.
+            """
+            return self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
+
+        # """
+
+        def __is_numbered_title(curr_line_text):
+            """
+            Check whether the line matches the numbered-title pattern configured on this instance.
+
+            Parameters
+            ----------
+            curr_line_text : str
+                text of the current line
+
+            Returns
+            -------
+            bool
+                True if self.numbered_title_pattern (compiled in verbose mode) matches anywhere in the
+                line, False otherwise.
+            """
+            numbered_title_regex = re.compile(self.numbered_title_pattern, re.VERBOSE)
+
+            return numbered_title_regex.search(curr_line_text) is not None
+
+        def __is_end_with_ending_puncs(line_text):
+            """
+            Check whether the line ends with a sentence-ending punctuation mark.
+
+            Trailing whitespace is ignored. Empty or whitespace-only text yields False instead of
+            raising IndexError.
+
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+
+            Returns
+            -------
+            bool
+                True if the last non-whitespace character is one of . ? ! 。 ？ ！ …, False otherwise.
+            """
+            end_puncs = (".", "?", "!", "。", "？", "！", "…")
+
+            # str.endswith accepts a tuple of suffixes and is safe on an empty string.
+            return line_text.rstrip().endswith(end_puncs)
+
+        def __contains_only_no_meaning_symbols(line_text):
+            """
+            Check whether the line consists almost entirely of symbols, in which case it is not a title.
+
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+
+            Returns
+            -------
+            bool
+                True if at least 90% of the characters are neither word characters nor whitespace.
+                Empty text yields False.
+            """
+            total_chars = len(line_text)
+            if total_chars == 0:
+                return False
+
+            # Everything that is neither a word character nor whitespace counts as a symbol.
+            symbol_count = len(re.findall(r"[^\w\s]", line_text))
+
+            return symbol_count / total_chars >= 0.9
+
+        def __is_equation(line_text):
+            """
+            Check whether the line contains an interline equation.
+
+            Parameters
+            ----------
+            line_text : str
+                text of the current line
+
+            Returns
+            -------
+            bool
+                True if the text contains a "$ ... $" span with "\\overline" inside it, False otherwise.
+            """
+            # Only "$"-delimited spans that contain \overline are treated as interline equations.
+            interline_equation_regex = r"\$.*?\\overline.*?\$"
+
+            return re.search(interline_equation_regex, line_text) is not None
+
+        def __is_title_by_len(text, max_length=200):
+            """
+            Check whether the line is short enough to be a title.
+
+            Parameters
+            ----------
+            text : str
+                text of the current line
+            max_length : int
+                largest length, after stripping surrounding whitespace, still accepted for a title
+
+            Returns
+            -------
+            bool
+                True if the stripped text has at most max_length characters, False otherwise.
+            """
+            stripped_length = len(text.strip())
+
+            return stripped_length <= max_length
+
+        def __compute_line_font_type_and_size(curr_line):
+            """
+            Pick the font type and font size that represent the line.
+
+            The values are taken from the horizontally widest span (bbox[2] - bbox[0]) among the spans
+            whose text is not pure whitespace; on ties the earliest span wins. If no such span has a
+            positive width, the values of the first span are returned.
+
+            Parameters
+            ----------
+            curr_line : dict
+                line with a non-empty "spans" list; each span provides "text", "bbox", "size" and "font"
+
+            Returns
+            -------
+            font_type : str
+                lower-cased font name of the representative span
+            font_size : float
+                font size of the representative span
+            """
+            line_spans = curr_line["spans"]
+
+            # Start from the first span so there is always a result, even when every span is skipped.
+            first_span = line_spans[0]
+            line_font_size = first_span["size"]
+            line_font_type = first_span["font"].lower()
+
+            widest_span_width = 0
+            for candidate in line_spans:
+                if candidate["text"].isspace():
+                    continue
+
+                candidate_bbox = candidate["bbox"]
+                candidate_width = candidate_bbox[2] - candidate_bbox[0]
+                if candidate_width > widest_span_width:
+                    widest_span_width = candidate_width
+                    line_font_size = candidate["size"]
+                    line_font_type = candidate["font"].lower()
+
+            return line_font_type, line_font_size
+
+        """
+        Title detecting main Process.
+        """
+
+        """
+        Basic features about the current line.
+        """
+        curr_line_bbox = curr_line["bbox"]
+        curr_line_text = curr_line["text"]
+        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
+
+        if len(curr_line_text.strip()) == 0:  # skip empty lines
+            return False
+
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        if prev_line:
+            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
+        else:
+            prev_line_font_type, prev_line_font_size = None, None
+
+        next_line_bbox = next_line["bbox"] if next_line else None
+        if next_line:
+            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
+        else:
+            next_line_font_type, next_line_font_size = None, None
+
+        """
+        Aggregated features about the current line.
+        """
+        is_italc_font = __is_italic_font_line(curr_line)
+        is_bold_font = __is_bold_font_line(curr_line)
+
+        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
+        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
+        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
+
+        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
+
+        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
+
+        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
+        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
+
+        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
+
+        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
+        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
+
+        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
+            curr_line_font_size, prev_line_font_size, next_line_font_size
+        )
+
+        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
+            curr_line_font_type, prev_line_font_type, next_line_font_type
+        )
+
+        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
+        )
+
+        is_similar_to_pre_line = __is_similar_to_pre_line(
+            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
+        )
+
+        """
+        Further aggregated features about the current line.
+        
+        Attention:
+            Features that start with __ are for internal use.
+        """
+
+        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
+        )
+        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
+        is_a_left_inline_title = (
+            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
+        )
+
+        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
+        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
+
+        is_title_by_check_pre_and_next_line = (
+            (prev_line is not None or next_line is not None)
+            and has_sufficient_spaces_above
+            and has_sufficient_spaces_below
+            and is_potential_title_font
+        )
+
+        is_numbered_title = __is_numbered_title(curr_line_text) and (
+            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
+        )
+
+        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
+
+        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
+
+        is_equation = __is_equation(curr_line_text)
+
+        is_title_by_len = __is_title_by_len(curr_line_text)
+
+        """
+        Decide if the line is a title.
+        """
+        # is_title = False
+        # if prev_line_is_title:
+
+        is_title = (
+            is_not_end_with_ending_puncs  # not end with ending punctuation marks
+            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
+            and is_title_by_len  # is a title by length, default max length is 200
+            and not is_equation  # an interline equation should never be a title
+            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
+            and (
+                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
+                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                or (
+                    is_much_larger_font_than_doc_avg
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+                or (
+                    is_font_size_little_less_than_doc_avg
+                    and is_bold_font
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+            )  # not the same font type as the document average font type, which includes the most common font type and the second most common font type
+            and (
+                (
+                    not is_person_or_org_list_line_by_nlp
+                    and (
+                        is_much_larger_font_than_doc_avg
+                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
+                    )
+                )
+                or (
+                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
+                    and not is_a_left_inline_title
+                    and not is_punctuation_heavy
+                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+                )
+                or (
+                    is_person_or_org_list_line_by_nlp
+                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
+                )
+                or (is_numbered_title and not is_a_left_inline_title)
+            )
+        )
+        # ) or (is_similar_to_pre_line and prev_line_is_title)
+
+        is_name_or_org_list_to_be_removed = (
+            (is_person_or_org_list_line_by_nlp)
+            and is_punctuation_heavy
+            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
+        ) and not is_title
+
+        if is_name_or_org_list_to_be_removed:
+            is_author_or_org_list = True
+            # print curr_line_text to check
+            # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
+        else:
+            is_author_or_org_list = False
+        """
+        # print reason why the line is a title
+        if is_title:
+            print_green("This line is a title.")
+            print_green("↓" * 10)
+            print()
+            print("curr_line_text: ", curr_line_text)
+            print()
+
+        # print reason why the line is not a title
+        line_text = curr_line_text.strip()
+        test_text = "Career/Personal Life"
+        text_content_condition = line_text == test_text
+        
+        if not is_title and text_content_condition: # Print specific line
+        # if not is_title: # Print each line
+            print_red("This line is not a title.")
+            print_red("↓" * 10)
+
+            print()
+            print("curr_line_text: ", curr_line_text)
+            print()
+
+            if is_not_end_with_ending_puncs:
+                print_green(f"is_not_end_with_ending_puncs")
+            else:
+                print_red(f"is_end_with_ending_puncs")
+
+            if is_not_only_no_meaning_symbols:
+                print_green(f"is_not_only_no_meaning_symbols")
+            else:
+                print_red(f"is_only_no_meaning_symbols")
+
+            if is_title_by_len:
+                print_green(f"is_title_by_len: {is_title_by_len}")
+            else:
+                print_red(f"is_not_title_by_len: {is_title_by_len}")
+
+            if is_equation:
+                print_red(f"is_equation")
+            else:
+                print_green(f"is_not_equation")
+
+            if is_potential_title_font:
+                print_green(f"is_potential_title_font")
+            else:
+                print_red(f"is_not_potential_title_font")
+
+            if is_punctuation_heavy:
+                print_red("is_punctuation_heavy")
+            else:
+                print_green("is_not_punctuation_heavy")
+
+            if is_bold_font:
+                print_green(f"is_bold_font")
+            else:
+                print_red(f"is_not_bold_font")
+
+            if is_font_size_not_less_than_doc_avg:
+                print_green(f"is_larger_font_than_doc_avg")
+            else:
+                print_red(f"is_not_larger_font_than_doc_avg")
+
+            if is_much_larger_font_than_doc_avg:
+                print_green(f"is_much_larger_font_than_doc_avg")
+            else:
+                print_red(f"is_not_much_larger_font_than_doc_avg")
+
+            if is_not_same_font_type_of_docAvg:
+                print_green(f"is_not_same_font_type_of_docAvg")
+            else:
+                print_red(f"is_same_font_type_of_docAvg")
+
+            if is_word_list_line_by_rules:
+                print_red("is_word_list_line_by_rules")
+            else:
+                print_green("is_not_name_list_by_rules")
+
+            if is_person_or_org_list_line_by_nlp:
+                print_red("is_person_or_org_list_line_by_nlp")
+            else:
+                print_green("is_not_person_or_org_list_line_by_nlp")
+
+            if not is_numbered_title:
+                print_red("is_not_numbered_title")
+            else:
+                print_green("is_numbered_title")
+
+            if is_a_left_inline_title:
+                print_red("is_a_left_inline_title")
+            else:
+                print_green("is_not_a_left_inline_title")
+
+            if not is_title_by_check_prev_line:
+                print_red("is_not_title_by_check_prev_line")
+            else:
+                print_green("is_title_by_check_prev_line")
+
+            if not is_title_by_check_next_line:
+                print_red("is_not_title_by_check_next_line")
+            else:
+                print_green("is_title_by_check_next_line")
+
+            if not is_title_by_check_pre_and_next_line:
+                print_red("is_not_title_by_check_pre_and_next_line")
+            else:
+                print_green("is_title_by_check_pre_and_next_line")
+
+        # print_green("Common features:")
+        # print_green("↓" * 10)
+
+        # print(f"    curr_line_font_type: {curr_line_font_type}")
+        # print(f"    curr_line_font_size: {curr_line_font_size}")
+        # print()
+
+        """
+
+        return is_title, is_author_or_org_list
+
+    def _detect_block_title(self, input_block):
+        """
+        Run '_is_potential_title' on every line of a paragraph block and store the verdicts on the lines.
+
+        Each line receives the keys 'is_title' and 'is_author_or_org_list'. '_is_potential_title' returns
+        a bare False for lines with empty text instead of an (is_title, is_author_or_org_list) pair;
+        such a result is treated as "neither a title nor an author/organisation list" rather than
+        failing on tuple unpacking.
+
+        Parameters
+        ----------
+        input_block : dict
+            paragraph block providing "lines", "avg_char_width", "avg_char_height" and "median_font_size"
+
+        Returns
+        -------
+        input_block : dict
+            the same block object, with its lines updated in place
+        """
+        raw_lines = input_block["lines"]
+        last_index = len(raw_lines) - 1
+
+        prev_line_is_title_flag = False
+
+        for i, curr_line in enumerate(raw_lines):
+            prev_line = raw_lines[i - 1] if i > 0 else None
+            next_line = raw_lines[i + 1] if i < last_index else None
+
+            verdict = self._is_potential_title(
+                curr_line,
+                prev_line,
+                prev_line_is_title_flag,
+                next_line,
+                input_block["avg_char_width"],
+                input_block["avg_char_height"],
+                input_block["median_font_size"],
+            )
+
+            if isinstance(verdict, tuple):
+                is_title, is_author_or_org_list = verdict
+            else:
+                # Early exit of '_is_potential_title' for empty lines: a single boolean.
+                is_title, is_author_or_org_list = verdict, False
+
+            curr_line["is_title"] = is_title or False
+            # Remember the verdict so the next line's check can see it.
+            prev_line_is_title_flag = bool(is_title)
+
+            curr_line["is_author_or_org_list"] = is_author_or_org_list or False
+
+        return input_block
+
+    def batch_process_blocks_detect_titles(self, pdf_dic):
+        """
+        Detect title lines on every page and mark title blocks and author/organisation-list blocks.
+
+        For each entry whose key starts with "page_":
+        * if the page has "para_blocks" and not all of them are single-line blocks, every block is run
+          through '_detect_block_title'; pages made of single-line blocks only are left untouched;
+        * 'is_title' flags of all lines are added up into the document-wide title count;
+        * every block gets 'is_block_title' (1 if all its lines are titles and their total text length
+          is below 200, else 0) and 'is_block_an_author_or_org_list' (1 only on "page_0" when all its
+          lines are flagged as author/organisation list, else 0).
+        Pages without "para_blocks" receive an empty list under that key.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            result dictionary; must contain a "statistics" dict
+
+        Returns
+        -------
+        pdf_dic : dict
+            the same dictionary, updated in place, with pdf_dic["statistics"]["num_titles"] set
+        """
+        title_count = 0
+
+        for page_id, page in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            para_blocks = []
+            if "para_blocks" in page:
+                para_blocks = page["para_blocks"]
+
+                single_line_block_count = sum(1 for block in para_blocks if len(block["lines"]) == 1)
+                # Line-level detection is skipped when every block on the page holds exactly one line.
+                needs_detection = single_line_block_count != len(para_blocks)
+
+                processed_blocks = []
+                for block in para_blocks:
+                    if needs_detection:
+                        block = self._detect_block_title(block)
+                    processed_blocks.append(block)
+                    title_count += sum(line.get("is_title", 0) for line in block["lines"])
+                para_blocks = processed_blocks
+
+            page["para_blocks"] = para_blocks
+
+            for block in para_blocks:
+                block_lines = block["lines"]
+
+                every_line_is_title = all(safe_get(line, "is_title", False) for line in block_lines)
+                block_text_length = sum(len(line["text"]) for line in block_lines)
+                # A block with 200 or more characters in total is too long to be a title.
+                block["is_block_title"] = 1 if every_line_is_title and block_text_length < 200 else 0
+
+                every_line_is_author_or_org = all(
+                    safe_get(line, "is_author_or_org_list", False) for line in block_lines
+                )
+                # Author / organisation lists are only flagged on the first page.
+                is_author_block = every_line_is_author_or_org and page_id == "page_0"
+                block["is_block_an_author_or_org_list"] = 1 if is_author_block else 0
+
+        pdf_dic["statistics"]["num_titles"] = title_count
+
+        return pdf_dic
+
+    def __determine_size_based_level(self, title_blocks):
+        """
+        Assign a title level (1, 2 or 3; 1 is the highest) to every title block based on its font size.
+
+        With mean and std taken over the "block_font_size" of all title blocks:
+        * size >= mean + std                      -> level 1
+        * size <= mean - std                      -> level 3
+        * otherwise, size >= threshold            -> level 2, else level 3
+        where the threshold is the mean of the sizes strictly inside (mean - std, mean + std), or the
+        overall mean if no size lies inside that band.
+
+        Parameters
+        ----------
+        title_blocks : list
+            entries of the form {"page_id": ..., "block": block}; expected to be non-empty
+
+        Returns
+        -------
+        title_blocks : list
+            the same list; each block dict gets "block_title_level" written in place
+        """
+        sizes = np.array([safe_get(entry["block"], "block_font_size", 0) for entry in title_blocks])
+
+        # One standard deviation around the mean separates ordinary sizes from extreme ones.
+        size_mean = np.mean(sizes)
+        size_std = np.std(sizes)
+        lower_bound = size_mean - size_std  # type: ignore
+        upper_bound = size_mean + size_std  # type: ignore
+
+        # Threshold between level 2 and level 3, computed without the extreme values when possible.
+        inlier_sizes = sizes[(sizes > lower_bound) & (sizes < upper_bound)]
+        level_threshold = np.mean(inlier_sizes) if inlier_sizes.size > 0 else size_mean
+
+        for entry in title_blocks:
+            block = entry["block"]
+            block_size = safe_get(block, "block_font_size", 0)
+
+            if block_size >= upper_bound:
+                level = 1
+            elif block_size > lower_bound and float(block_size) >= float(level_threshold):
+                level = 2
+            else:
+                level = 3
+
+            block["block_title_level"] = level
+
+        return title_blocks
+
+    def batch_process_blocks_recog_title_level(self, pdf_dic):
+        """
+        Give every paragraph block that is flagged as a title a title level.
+
+        Only keys starting with "page_" are scanned. Blocks with a truthy
+        "is_block_title" are collected from each page's "para_blocks" and are
+        levelled by font size; the block dicts are updated in place.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+
+        Returns
+        -------
+        pdf_dic : dict
+            The same dict that was passed in.
+        """
+        collected = []
+
+        for page_id, page in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+            collected.extend(
+                {"page_id": page_id, "block": para}
+                for para in page.get("para_blocks", [])
+                if para.get("is_block_title")
+            )
+
+        # Nothing to level when the document has no title blocks.
+        if collected:
+            self.__determine_size_based_level(collected)
+
+        return pdf_dic
--- a/magic_pdf/parse/__init__.py
+++ b/magic_pdf/parse/__init__.py
--- a/magic_pdf/parse/common_parse.py
+++ b/magic_pdf/parse/common_parse.py
+# -*- coding: utf-8 -*-
+import time
+
+from loguru import logger
+import argparse
+from pdf_client import ocrPdfClient
+from excel_parse import ExcelParser
+import os
+import requests
+import configparser
+from magic_pdf.parse.ofd_parse import parse_ofd
+
+# Register "parse.log" as a log sink at import time (level INFO, rotation setting "10 MB",
+# UTF-8 encoded, enqueue=True).
+logger.add("parse.log", rotation="10 MB", level="INFO", format="{time} {level} {message}", encoding='utf-8', enqueue=True)
+# Path of the config file; assigned by main() and read by process_file() when parsing .ofd files.
+config_path = None
+
+def parse_args():
+    """Parse the command-line options of the batch parsing script.
+
+    Options:
+        --config_path: config file to read (has a hard-coded default path).
+        --path / -p: input file or directory (required).
+        --output_dir / -o: directory for the results (required).
+
+    Returns:
+        The argparse namespace with ``config_path``, ``path`` and ``output_dir``.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--config_path', default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
+    cli.add_argument('--path', '-p', required=True)
+    cli.add_argument('--output_dir', '-o', required=True)
+    return cli.parse_args()
+
+def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
+    """Parse one file with the backend that matches its extension and log the outcome.
+
+    The extension is matched case-insensitively, so "REPORT.PDF" is handled the
+    same way as "report.pdf". Files with any other extension are not parsed and
+    are reported through the "empty result" warning.
+
+    Args:
+        file_path: Path of the file to parse.
+        pdf_ocr: Client used for ".pdf" files; its ``ocr_pdf_client(path=, output_dir=)`` is called.
+        excel_ocr: Parser used for ".xls"/".xlsx" files; its ``parse(file_path, output_dir)`` is called.
+        output_dir: Directory handed to the selected backend.
+
+    Returns:
+        None. Every exception is caught and logged, so one bad file does not
+        stop a batch run.
+    """
+    # Bound before the try block so the generic error handler can always log it.
+    res = ''
+    lowered = file_path.lower()
+    try:
+        start = time.time()
+        if lowered.endswith('.pdf'):
+            res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
+        elif lowered.endswith(('.xls', '.xlsx')):
+            # NOTE(review): confirm that the Excel parser can really open legacy ".xls" files.
+            res = excel_ocr.parse(file_path, output_dir)
+        elif lowered.endswith('.ofd'):
+            res = parse_ofd(config_path, file_path, output_dir)
+        end = time.time()
+
+        if res:
+            logger.info(f"文件处理成功，输出文件路径为: '{res}', 耗时为：{round(end-start,2)}")
+        else:
+            logger.warning(f"文件处理结果为空: '{file_path}'")
+    except requests.exceptions.RequestException as req_err:
+        logger.error(f"请求错误，文件: '{file_path}'，错误信息: {req_err}")
+    except Exception as err:
+        logger.error(f"处理文件时发生未知错误: '{file_path}'，错误信息: {err},res:{res}")
+
+def normalize_path(input_path):
+    """Return *input_path* with every backslash replaced by a forward slash."""
+    segments = input_path.split('\\')
+    return '/'.join(segments)
+
+def determine_output_dir(output_dir):
+    """Return *output_dir* as is when absolute, otherwise joined onto the current working directory."""
+    if os.path.isabs(output_dir):
+        return output_dir
+    return os.path.join(os.getcwd(), output_dir)
+
+def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
+    """Parse a single file, or every file found below a directory.
+
+    A directory is walked recursively and each file in it is passed to
+    process_file(); any other path is passed to process_file() directly.
+    """
+    if not os.path.isdir(input_path):
+        logger.info(f'正在解析单个文件: {input_path}')
+        process_file(input_path, pdf_ocr, excel_ocr, output_dir)
+        return
+
+    logger.info(f'开始处理{input_path}目录下的文件')
+    for folder, _subdirs, names in os.walk(input_path):
+        for name in names:
+            candidate = os.path.join(folder, name)
+            logger.info(f'正在解析文件: {candidate}')
+            process_file(candidate, pdf_ocr, excel_ocr, output_dir)
+
+def main():
+    """Entry point: read the config, check the PDF server, then parse the input path.
+
+    The module-level ``config_path`` is set from the command line so that
+    process_file() can pass it on. When the health check of the PDF server
+    fails, nothing is parsed.
+    """
+    global config_path
+
+    args = parse_args()
+    source = normalize_path(args.path)
+    target_dir = determine_output_dir(args.output_dir)
+
+    settings = configparser.ConfigParser()
+    settings.read(args.config_path)
+    config_path = args.config_path
+
+    pdf_server = settings.get('server', 'pdf_server')
+    pdf_ocr = ocrPdfClient(pdf_server)
+    if not pdf_ocr.check_health():
+        logger.warning(f'Health check failed. The server at "{pdf_server}" is not responding as expected.')
+        logger.info('文件解析服务无法正常运行')
+        return None
+
+    process_input(source, pdf_ocr, ExcelParser(), target_dir)
+
+
+if __name__ == "__main__":
+    main()
+
+
--- a/magic_pdf/parse/excel_parse.py
+++ b/magic_pdf/parse/excel_parse.py
+# -*- coding: utf-8 -*-
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from io import BytesIO
+from openpyxl import load_workbook
+
+def find_codec(blob):
+    """Guess a text encoding that can decode *blob*.
+
+    Each candidate codec is tried on the first 1024 bytes and then on the whole
+    blob; the first codec for which either attempt succeeds is returned.
+
+    The candidates come from a module-level ``all_codecs`` when one is defined.
+    This module does not define it, so a built-in default list is used instead
+    of failing with NameError.
+
+    Args:
+        blob: Raw bytes whose encoding is unknown.
+
+    Returns:
+        The name of a codec; "utf-8" when no candidate can decode the blob.
+    """
+    candidates = globals().get("all_codecs") or (
+        "utf-8",
+        "gb2312",
+        "gbk",
+        "gb18030",
+        "big5",
+        "utf-16",
+    )
+    for codec in candidates:
+        # The 1024-byte prefix can end in the middle of a multi-byte character,
+        # so a failure on the prefix is followed by an attempt on the full blob.
+        for sample in (blob[:1024], blob):
+            try:
+                sample.decode(codec)
+            except (UnicodeError, LookupError):
+                continue
+            return codec
+
+    return "utf-8"
+
+class ExcelParser:
+    """Turn Excel workbooks into HTML table chunks or into "header：value" text lines."""
+
+    @staticmethod
+    def _open_workbook(fnm):
+        """Load a workbook from a file path (str) or from raw file content (bytes)."""
+        if isinstance(fnm, str):
+            return load_workbook(fnm)
+        return load_workbook(BytesIO(fnm))
+
+    def html(self, fnm, chunk_rows=256):
+        """Render every sheet as one or more HTML tables.
+
+        The first row of a sheet is used as the header row and is repeated in
+        every table of that sheet. Cell values are inserted as is; they are not
+        HTML-escaped.
+
+        Args:
+            fnm: Workbook path (str) or workbook content (bytes).
+            chunk_rows: Maximum number of data rows per table.
+
+        Returns:
+            A list of "<table>...</table>\\n" strings.
+        """
+        wb = self._open_workbook(fnm)
+
+        tb_chunks = []
+        for sheetname in wb.sheetnames:
+            rows = list(wb[sheetname].rows)
+            if not rows:
+                continue
+
+            tb_rows_0 = "<tr>" + "".join(f"<th>{t.value}</th>" for t in rows[0]) + "</tr>"
+
+            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
+                parts = [f"<table><caption>{sheetname}</caption>", tb_rows_0]
+                for r in rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]:
+                    parts.append("<tr>")
+                    parts.extend(
+                        "<td></td>" if c.value is None else f"<td>{c.value}</td>" for c in r
+                    )
+                    parts.append("</tr>")
+                parts.append("</table>\n")
+                tb_chunks.append("".join(parts))
+
+        return tb_chunks
+
+    def parse(self, fnm, onm):
+        """Write every data row of a workbook as one text line and return the file path.
+
+        Each line joins "header：value" pairs with "; ". Cells with a falsy value
+        (None, empty string, 0, False) are left out. Lines from a sheet whose
+        name does not contain "sheet" get " ——<sheet name>" appended.
+
+        Args:
+            fnm: Path of the workbook. The name of the output file is derived
+                from it, so a path (str) is required here.
+            onm: Output directory. The result is written to
+                ``<onm>/<workbook name without extension>.txt``.
+
+        Returns:
+            The path of the text file that was written.
+        """
+        import os  # local import: this module has no file-level import of os
+
+        wb = self._open_workbook(fnm)
+        res = []
+        for sheetname in wb.sheetnames:
+            rows = list(wb[sheetname].rows)
+            if not rows:
+                continue
+            ti = list(rows[0])
+            for r in rows[1:]:
+                fields = []
+                for i, c in enumerate(r):
+                    if not c.value:
+                        continue
+                    t = str(ti[i].value) if i < len(ti) else ""
+                    t += ("：" if t else "") + str(c.value)
+                    fields.append(t)
+                line = "; ".join(fields)
+                if sheetname.lower().find("sheet") < 0:
+                    line += " ——" + sheetname
+                res.append(line)
+
+        # Join directory and file name with a separator; plain string
+        # concatenation put the file next to the output directory instead of in it.
+        base_name = os.path.splitext(os.path.basename(fnm))[0]
+        if onm:
+            os.makedirs(onm, exist_ok=True)
+        out_path = os.path.join(onm, base_name + '.txt')
+        with open(out_path, 'w', encoding='utf-8') as f:
+            f.writelines(line + '\n' for line in res)
+        return out_path
+
+    @staticmethod
+    def row_number(fnm, binary):
+        """Count the rows of a spreadsheet or the lines of a csv/txt file.
+
+        Args:
+            fnm: File name; only its extension is inspected.
+            binary: Content of the file as bytes.
+
+        Returns:
+            The total number of rows over all sheets for an extension containing
+            "xls", the number of lines for "csv"/"txt", and None for any other
+            extension.
+        """
+        extension = fnm.split(".")[-1].lower()
+
+        if extension.find("xls") >= 0:
+            wb = load_workbook(BytesIO(binary))
+            # Sum over every sheet; returning inside the loop counted only the first one.
+            return sum(len(list(wb[sheetname].rows)) for sheetname in wb.sheetnames)
+
+        if extension in ["csv", "txt"]:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+            return len(txt.split("\n"))
+
+        return None
+
+
+if __name__ == "__main__":
+    # Manual smoke test: python excel_parse.py [workbook] [output_dir]
+    # parse() needs an output directory as well, writes the text file itself and
+    # returns the path of that file, so the path is simply printed here.
+    import os
+    import sys
+
+    workbook_path = sys.argv[1] if len(sys.argv) > 1 else 'D:\\Course_project\\DCU_MinerU\\docs\\data\\接口人.xlsx'
+    out_dir = sys.argv[2] if len(sys.argv) > 2 else ''
+    psr = ExcelParser()
+    # os.path.join(out_dir, '') adds a trailing separator to a non-empty directory.
+    print(psr.parse(workbook_path, os.path.join(out_dir, '')))
--- a/magic_pdf/parse/ofd_parse.py
+++ b/magic_pdf/parse/ofd_parse.py
+# -*- coding: utf-8 -*-
+import base64
+import os
+from magic_pdf.tools.ofd import OFD
+from loguru import logger
+from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
+# from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
+import configparser
+from magic_pdf.parse.pdf_client import ocrPdfClient
+import html
+
+def decode_html_entities(text):
+    """Return *text* with HTML entities (such as ``&amp;lt;``) replaced by their characters."""
+    unescaped = html.unescape(text)
+    return unescaped
+
+def json_to_txt(json_data):
+    """Render nested dicts and lists as indented plain text.
+
+    A dict entry becomes "key: value", or "key:" followed by its nested content
+    indented two more spaces. A list element becomes "- value", "- Item N:" for
+    a dict, or "- List N:" for a list, with N counted from 1. Anything that is
+    neither a dict nor a list is returned as ``str(json_data)``.
+    """
+
+    def dict_lines(mapping, depth):
+        pad = ' ' * depth
+        for key, value in mapping.items():
+            if isinstance(value, dict):
+                yield f"{pad}{key}:"
+                yield from dict_lines(value, depth + 2)
+            elif isinstance(value, list):
+                yield f"{pad}{key}:"
+                yield from list_lines(value, depth + 2)
+            else:
+                yield f"{pad}{key}: {value}"
+
+    def list_lines(sequence, depth):
+        pad = ' ' * depth
+        for position, element in enumerate(sequence, start=1):
+            if isinstance(element, dict):
+                yield f"{pad}- Item {position}:"
+                yield from dict_lines(element, depth + 2)
+            elif isinstance(element, list):
+                yield f"{pad}- List {position}:"
+                yield from list_lines(element, depth + 2)
+            else:
+                yield f"{pad}- {element}"
+
+    if isinstance(json_data, dict):
+        rendered = dict_lines(json_data, 0)
+    elif isinstance(json_data, list):
+        rendered = list_lines(json_data, 0)
+    else:
+        rendered = [str(json_data)]
+
+    return "\n".join(rendered)
+
+
+def ofd2pdf(file_path,output_dir,pdfbytes):
+    """
+    Write *pdfbytes* to "<output_dir>/<OFD file name without extension>.pdf".
+
+    The PDF content is supplied by the caller; this function does not convert
+    anything itself. The OFD file is still read and base64-encoded and an OFD
+    helper is still created and reset, but neither result is used.
+
+    Returns the path of the PDF file that was written.
+    """
+    stem = os.path.splitext(os.path.split(file_path)[1])[0]
+
+    with open(file_path, "rb") as src:
+        ofdb64 = str(base64.b64encode(src.read()), "utf-8")
+    converter = OFD()  # OFD helper; its read()/to_pdf() conversion is not used here
+    pdf_path = f"{os.path.join(output_dir, stem)}.pdf"
+    converter.del_data()
+
+    with open(pdf_path, "wb") as dst:
+        dst.write(pdfbytes)
+    return pdf_path
+
+def ofd2img(file_path,output_dir):
+    """
+    Convert an OFD file to JPG images, one file per image returned by the OFD helper.
+
+    The images are saved as "<output_dir>/<OFD file name without extension>_<index>.jpg"
+    with the index starting at 0.
+
+    Returns a tuple of (list of saved image paths, second value returned by
+    ``OFD.to_jpg()``, which the callers treat as PDF bytes).
+    """
+    stem = os.path.splitext(os.path.split(file_path)[1])[0]
+    output_prefix = os.path.join(output_dir, stem)
+
+    with open(file_path, "rb") as src:
+        encoded = str(base64.b64encode(src.read()), "utf-8")
+
+    converter = OFD()
+    converter.read(encoded, save_xml=False, xml_name=f"{output_prefix}_xml")
+    images, pdfbytes = converter.to_jpg()
+    converter.del_data()
+
+    saved_paths = []
+    for index, image in enumerate(images):
+        jpg_path = f"{output_prefix}_{index}.jpg"
+        image.save(jpg_path)
+        saved_paths.append(jpg_path)
+
+    return saved_paths, pdfbytes
+
+def parse_ofd(config_path,file_path,output_dir):
+    """
+    Extract the content of an OFD file and return the path of the result.
+
+    Every page image of the OFD file is sent to the OCR server named by
+    ``[server] ocr_server`` in the config. Answers are collected as text and
+    written to "<output_dir>/<OFD file name without extension>.txt". As soon as
+    one answer contains "False"/"false", the OFD is saved as PDF and handed to
+    the PDF server (``[server] pdf_server``), and no further pages are sent.
+
+    Parameters: config_path is the ini file to read, file_path the OFD file,
+    output_dir the directory for all outputs.
+
+    Returns the path of the text file when any page text was collected,
+    otherwise the result of the PDF server, or None when the OFD file produced
+    no page images.
+    """
+    config = configparser.ConfigParser()
+    config.read(config_path)
+    url = config.get('server', 'ocr_server')
+    client = PredictClient(url)
+    ofd_imgs,pdfbytes = ofd2img(file_path,output_dir)
+    text = '判断图片是否是发票，如果是发票精确提取图片中的内容，否则返回False'
+
+    # Bound up front so the function can return even when there are no page images.
+    ofd_txt = None
+    ofd_txts = ''
+    for ofd_img in ofd_imgs:
+        compress_image(ofd_img)
+        res = client.predict(ofd_img,text)
+        if 'False' in res or 'false' in res:
+            ofd_pdf = ofd2pdf(file_path,output_dir,pdfbytes)
+            logger.info(f'ofd_pdf:{ofd_pdf}')
+            pdf_server = config.get('server', 'pdf_server')
+            pdf_ocr = ocrPdfClient(pdf_server)
+            ofd_txt = pdf_ocr.ocr_pdf_client(path=ofd_pdf, output_dir=output_dir)
+            break
+
+        # NOTE(review): res is passed on as returned by the OCR client; if that is
+        # a JSON string, json_to_txt() returns it unchanged -- confirm whether it
+        # should be parsed with json.loads first.
+        res = decode_html_entities(res)
+        res = json_to_txt(res)
+        ofd_txts = ofd_txts + res + '\n'
+
+    # NOTE(review): text collected before a PDF fallback still replaces the PDF
+    # result here, as in the original control flow -- confirm that this is intended.
+    if ofd_txts != '':
+        # The name is derived from file_path; splitext keeps dots inside the name.
+        file_name = os.path.splitext(os.path.basename(file_path))[0]
+        ofd_txt = os.path.join(output_dir,file_name) + '.txt'
+        logger.info(f'ofd_txt:{ofd_txt}')
+        with open(ofd_txt, 'w', encoding='utf-8') as f:
+            f.write(str(ofd_txts))
+
+    return ofd_txt
+#
+# if __name__ == '__main__':
+#     file_path = ''
+#     out_path = ''
+#     ofd2pdf()
--- a/magic_pdf/parse/pdf_client.py
+++ b/magic_pdf/parse/pdf_client.py
+# -*- coding: utf-8 -*-
+import configparser
+import time
+
+import requests
+from loguru import logger
+import argparse
+import os
+
+
+
+class ocrPdfClient:
+    """Small HTTP client for the PDF OCR service."""
+
+    # Seconds to wait for the health endpoint; without a timeout an unreachable
+    # server could block the caller indefinitely.
+    HEALTH_CHECK_TIMEOUT = 10
+
+    def __init__(self, api_url):
+        # Base URL of the service; "/health" and "/pdf_ocr" are appended to it.
+        self.api_url = api_url
+
+    def check_health(self):
+        """Return True when GET <api_url>/health answers with status 200, else False.
+
+        Request errors, including a timeout, are logged and reported as False.
+        """
+        health_check_url = f'{self.api_url}/health'
+        try:
+            response = requests.get(health_check_url, timeout=self.HEALTH_CHECK_TIMEOUT)
+        except requests.exceptions.RequestException as e:
+            logger.error(f'Health check request failed:{e}')
+            return False
+
+        if response.status_code == 200:
+            logger.info("Server is healthy and ready to process requests.")
+            return True
+        logger.error(f'Server health check failed with status code:{response.status_code}')
+        return False
+
+    def ocr_pdf_client(self, path, output_dir):
+        """Ask the service to OCR one PDF file.
+
+        POSTs ``{"path": ..., "output_dir": ...}`` as JSON to <api_url>/pdf_ocr.
+
+        Args:
+            path: Path of the PDF file, sent as a string.
+            output_dir: Directory for the result, sent as a string.
+
+        Returns:
+            The "output_path" of the response when its "status_code" field is
+            200; None for a failed request, an HTTP error status, a body that
+            is not a JSON object, or any other "status_code".
+        """
+        payload = {
+            "path": str(path),
+            "output_dir": str(output_dir),
+        }
+        try:
+            response = requests.post(f"{self.api_url}/pdf_ocr", json=payload)
+            # Check the HTTP status before reading the body, so an error page is
+            # reported as a request failure and not as a JSON or key error.
+            response.raise_for_status()
+            body = response.json()
+        except requests.exceptions.RequestException as e:
+            logger.error(f"OCR PDF API request failed: {e}")
+            return None
+        except ValueError as e:
+            # Older requests versions raise a plain ValueError for invalid JSON.
+            logger.error(f"OCR PDF API request failed: {e}")
+            return None
+
+        if not isinstance(body, dict):
+            logger.error(f"OCR PDF API request failed: unexpected response body {body!r}")
+            return None
+
+        output_path = body.get('output_path')
+        logger.info(f'output_dir:{output_path}')
+        return output_path if body.get('status_code') == 200 else None
+
+def parse_args():
+    """Parse the command-line options of the PDF client script.
+
+    Options:
+        --path / -p: PDF file to send to the service (required).
+        --output_dir / -o: directory for the result (required).
+        --config_path: config file to read (has a hard-coded default path).
+
+    Returns:
+        The argparse namespace with ``path``, ``output_dir`` and ``config_path``.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--path', '-p', required=True)
+    cli.add_argument('--output_dir', '-o', required=True)
+    cli.add_argument('--config_path', default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
+    return cli.parse_args()
+
+
+def main():
+    """Entry point: send one PDF to the OCR PDF service and log result and duration.
+
+    The server address is read from ``[server] pdf_server`` of the config file.
+    A relative output directory is resolved against the current working directory.
+    """
+    args = parse_args()
+    config = configparser.ConfigParser()
+    config.read(args.config_path)
+    pdf_server = config.get('server', 'pdf_server')
+    client = ocrPdfClient(pdf_server)
+    doc_analyze_start = time.time()
+
+    if os.path.isabs(args.output_dir):
+        output_dir = args.output_dir
+    else:
+        output_dir = os.path.join(os.getcwd(), args.output_dir)
+    logger.info(f'output_dir:{output_dir}')
+
+    try:
+        res = client.ocr_pdf_client(path=args.path,output_dir=output_dir)
+        if res:
+            logger.info(f"output_dir: '{res}'")
+        else:
+            logger.warning("None")
+    except requests.exceptions.RequestException as e:
+        # The request goes to the OCR PDF service, so the message names that service.
+        logger.error(f"Error while making request to OCR PDF service: {e}")
+    except Exception as e:
+        logger.error(f"Unexpected error occurred: {e}")
+    doc_analyze_cost = time.time() - doc_analyze_start
+
+    logger.info(f'解析当前pdf{args.path}耗时为:{doc_analyze_cost}')
+
+if __name__ == "__main__":
+    main()
+
+
+