Commit 2df265c8 authored by zhougaofeng

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, 
magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
parent 826086d2
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
def parse_pdf_by_ocr(ocr_status, config_path, local_image_dir, pdf_bytes,
                     model_list,
                     imageWriter,
                     start_page_id=0,
                     end_page_id=None,
                     debug_mode=False,
                     ):
    dataset = PymuDocDataset(pdf_bytes)
    return pdf_parse_union(ocr_status, config_path, local_image_dir, dataset,
                           model_list,
                           imageWriter,
                           SupportedPdfParseMethod.OCR,
                           start_page_id=start_page_id,
                           end_page_id=end_page_id,
                           debug_mode=debug_mode,
                           )
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
def parse_pdf_by_txt(ocr_status, config_path, local_image_dir,
                     pdf_bytes,
                     model_list,
                     imageWriter,
                     start_page_id=0,
                     end_page_id=None,
                     debug_mode=False,
                     ):
    dataset = PymuDocDataset(pdf_bytes)
    # pdf_parse_union from pdf_parse_union_core_v2 now takes ocr_status,
    # config_path and local_image_dir first, so thread them through here too.
    return pdf_parse_union(ocr_status, config_path, local_image_dir, dataset,
                           model_list,
                           imageWriter,
                           SupportedPdfParseMethod.TXT,
                           start_page_id=start_page_id,
                           end_page_id=end_page_id,
                           debug_mode=debug_mode,
                           )
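# A minimal usage sketch (not part of the original module): the wrapper takes
# raw pdf bytes plus the ocr_status/config_path/local_image_dir values that
# the pipeline threads through to image cutting. The config path and image
# directory below are hypothetical placeholders.
def _demo_parse_txt(pdf_bytes, model_list, image_writer):
    return parse_pdf_by_txt(None, 'magic_pdf/config.ini', '/tmp/magic_pdf_images',
                            pdf_bytes, model_list, image_writer,
                            debug_mode=True)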
import time
from loguru import logger
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
combine_chars_to_pymudict
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \
remove_overlaps_low_confidence_spans
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({
"bbox": bbox[:4]
})
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks)
if is_useful_block_horz_overlap:
logger.warning(
f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}")
for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox)
return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str: str):
""" Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
Args:
text_str (str): raw text
Returns:
_type_: replaced text
"""
if text_str:
s = text_str.replace('\u0002', "'")
s = s.replace("\u0003", "'")
return s
return text_str
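# Illustrative check (not in the original module): the STX (\u0002) and
# ETX (\u0003) control characters that pymupdf emits are mapped back to
# apostrophes, and falsy input is returned unchanged.
def _demo_replace_stx_etx():
    assert __replace_STX_ETX('It\u0002s fine\u0003') == "It's fine'"
    assert __replace_STX_ETX('') == ''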
def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
"blocks"
]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock(
text_blocks, inline_equations, interline_equations
)
text_blocks = remove_citation_marker(text_blocks)
text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = []
for v in text_blocks:
for line in v["lines"]:
for span in line["spans"]:
bbox = span["bbox"]
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue
if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation):
spans.append(
{
"bbox": list(span["bbox"]),
"content": __replace_STX_ETX(span["text"]),
"type": ContentType.Text,
"score": 1.0,
}
)
return spans
def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans
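# Illustrative check (not in the original module): OCR spans typed as Text
# are dropped, all other OCR spans are kept, and the pymupdf text spans are
# appended after them.
def _demo_replace_text_span():
    ocr = [{'type': ContentType.Text, 'content': 'noisy'},
           {'type': ContentType.InlineEquation, 'content': 'x^2'}]
    pymu = [{'type': ContentType.Text, 'content': 'clean'}]
    assert replace_text_span(pymu, ocr) == [ocr[1], pymu[0]]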
def parse_page_core(ocr_status, config_path, local_image_dir, pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
    need_drop = False
    drop_reason = []
    '''Collect the block information from the magic_model object that is needed below.'''
img_blocks = magic_model.get_imgs(page_id)
# img_blocks = []
table_blocks = magic_model.get_tables(page_id)
discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
page_w, page_h = magic_model.get_page_size(page_id)
spans = magic_model.get_all_spans(page_id)
    '''Build the spans according to parse_mode.'''
    if parse_mode == "txt":
        """Replace the text-type spans from OCR with the pymupdf-extracted spans."""
pymu_spans = txt_spans_extract(
pdf_docs[page_id], inline_equations, interline_equations
)
spans = replace_text_span(pymu_spans, spans)
elif parse_mode == "ocr":
pass
else:
raise Exception("parse_mode must be txt or ocr")
    '''Among overlapping spans, drop the ones with lower confidence.'''
    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
    '''Among overlapping spans, drop the smaller ones.'''
    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
    '''Crop screenshots of the image and table regions.'''
    spans = ocr_cut_image_and_table(ocr_status, config_path, local_image_dir, spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
    '''Gather the bboxes of all blocks together.'''
    # The interline_equation_blocks parameter is not accurate enough; switch to interline_equations later.
    interline_equation_blocks = []
if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equation_blocks, page_w, page_h)
else:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
interline_equations, page_w, page_h)
if len(drop_reasons) > 0:
need_drop = True
drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
    '''First handle the discarded_blocks, which need no layout ordering.'''
    discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
    '''Skip this page if it contains no bbox.'''
if len(all_bboxes) == 0:
logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}")
return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
[], [], interline_equations, fix_discarded_blocks,
need_drop, drop_reason)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
if is_useful_block_horz_overlap:
need_drop = True
drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP)
else:
break
    '''Compute the layout from the block information.'''
    page_boundary = [0, 0, page_w, page_h]
    layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundary, page_id)
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning(
f"skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
need_drop = True
drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT)
"""以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning(
f"skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
need_drop = True
drop_reason.append(DropReason.COMPLICATED_LAYOUT)
    layout_column_cnt = get_columns_cnt_of_layout(layout_tree)
    if layout_column_cnt > 2:  # drop pdfs whose layout has more than two columns
logger.warning(
f"skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
need_drop = True
drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS)
    '''Sort all blocks that should be kept on this page according to the layout order.'''
    sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
    '''Fill the spans into the sorted blocks.'''
    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)
    '''Apply the fix operations to the blocks.'''
    img_blocks = []
    # logger.info('image blocks are not collected')
    fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
    '''Collect the lists that QA needs exposed.'''
    images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
    '''Build the pdf_info_dict.'''
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, fix_discarded_blocks,
need_drop, drop_reason)
return page_info
def pdf_parse_union(ocr_status, config_path, local_image_dir, pdf_bytes,
model_list,
imageWriter,
parse_mode,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)
    '''Initialize an empty pdf_info_dict.'''
    pdf_info_dict = {}
    '''Initialize the magic_model from model_list and the docs object.'''
    magic_model = MagicModel(model_list, pdf_docs)
    '''Parse the pdf within the requested page range.'''
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1
if end_page_id > len(pdf_docs) - 1:
logger.warning("end_page_id is out of range, use pdf_docs length")
end_page_id = len(pdf_docs) - 1
    '''Record the start time.'''
    start_time = time.time()
    for page_id, page in enumerate(pdf_docs):
        '''In debug mode, log the per-page parsing time.'''
if debug_mode:
time_now = time.time()
# logger.info(
# f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
# )
start_time = time_now
        '''Parse each page of the pdf.'''
        if start_page_id <= page_id <= end_page_id:
            page_info = parse_page_core(ocr_status, config_path, local_image_dir, pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
else:
page_w = page.rect.width
page_h = page.rect.height
page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
[], [], [], [],
True, "skip page")
pdf_info_dict[f"page_{page_id}"] = page_info
# logger.info(f'pdf_info_dict:\n{pdf_info_dict}')
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
}
return new_pdf_info_dict
if __name__ == '__main__':
pass
import copy
import os
import statistics
import time
from typing import List
import torch
from loguru import logger
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import (
combine_chars_to_pymudict, remove_chars_in_text_blocks,
replace_equations_in_textblock)
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans,
fix_discarded_block, fix_block_spans_v2)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
from magic_pdf.pre_proc.resolve_bbox_conflict import \
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({'bbox': bbox[:4]})
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
check_useful_block_horizontal_overlap(useful_blocks)
)
if is_useful_block_horz_overlap:
logger.warning(
f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
) # noqa: E501
for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox)
return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str: str):
"""Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
Args:
text_str (str): raw text
Returns:
_type_: replaced text
""" # noqa: E501
if text_str:
s = text_str.replace('\u0002', "'")
s = s.replace('\u0003', "'")
return s
return text_str
def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
'blocks'
]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock(
text_blocks, inline_equations, interline_equations
)
text_blocks = remove_citation_marker(text_blocks)
text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = []
for v in text_blocks:
for line in v['lines']:
for span in line['spans']:
bbox = span['bbox']
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue
if span.get('type') not in (
ContentType.InlineEquation,
ContentType.InterlineEquation,
):
spans.append(
{
'bbox': list(span['bbox']),
'content': __replace_STX_ETX(span['text']),
'type': ContentType.Text,
'score': 1.0,
}
)
return spans
def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
def model_init(model_name: str):
from transformers import LayoutLMv3ForTokenClassification
if torch.cuda.is_available():
device = torch.device('cuda')
if torch.cuda.is_bf16_supported():
supports_bfloat16 = True
else:
supports_bfloat16 = False
else:
device = torch.device('cpu')
supports_bfloat16 = False
if model_name == 'layoutreader':
        # Check whether the modelscope cache directory exists.
layoutreader_model_dir = get_local_layoutreader_model_dir()
if os.path.exists(layoutreader_model_dir):
model = LayoutLMv3ForTokenClassification.from_pretrained(
layoutreader_model_dir
)
else:
logger.warning(
'local layoutreader model not exists, use online model from huggingface'
)
model = LayoutLMv3ForTokenClassification.from_pretrained(
'hantian/layoutreader'
)
        # Check whether the device supports bfloat16.
        if supports_bfloat16:
            model.bfloat16()
        model.to(device).eval()
    else:
        logger.error('model name not allowed')
        exit(1)
return model
class ModelSingleton:
_instance = None
_models = {}
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def get_model(self, model_name: str):
if model_name not in self._models:
self._models[model_name] = model_init(model_name=model_name)
return self._models[model_name]
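# Sketch (not in the original module): ModelSingleton is a process-wide
# cache, so repeated lookups return the same model object and the weights
# are only loaded once.
def _demo_model_singleton():
    first = ModelSingleton().get_model('layoutreader')
    second = ModelSingleton().get_model('layoutreader')
    assert first is second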
def do_predict(boxes: List[List[int]], model) -> List[int]:
from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
prepare_inputs)
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0)
return parse_logits(logits, len(boxes))
def cal_block_index(fix_blocks, sorted_bboxes):
for block in fix_blocks:
line_index_list = []
if len(block['lines']) == 0:
block['index'] = sorted_bboxes.index(block['bbox'])
else:
for line in block['lines']:
line['index'] = sorted_bboxes.index(line['bbox'])
line_index_list.append(line['index'])
median_value = statistics.median(line_index_list)
block['index'] = median_value
        # Remove the virtual line info from image/table body blocks and backfill it with the real_lines info.
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
block['virtual_lines'] = copy.deepcopy(block['lines'])
block['lines'] = copy.deepcopy(block['real_lines'])
del block['real_lines']
return fix_blocks
def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    # block_bbox is a tuple (x0, y0, x1, y1); in pymupdf's coordinate system (x0, y0) is the top-left corner and (x1, y1) the bottom-right corner.
x0, y0, x1, y1 = block_bbox
block_height = y1 - y0
    block_width = x1 - x0
    # Only split the block if it is taller than three text lines; otherwise return its bbox unchanged.
    if line_height * 3 < block_height:
        if (
            block_height > page_h * 0.25 and page_w * 0.5 > block_width > page_w * 0.25
        ):  # Likely a two-column structure; split it more finely.
            lines = int(block_height / line_height) + 1
        else:
            # If the block is wider than 0.4 of the page, split it into 3 lines (a complex layout; the image must not be cut too finely).
            if block_width > page_w * 0.4:
                line_height = (y1 - y0) / 3
                lines = 3
            elif block_width > page_w * 0.25:  # Possibly a three-column structure; also split finely.
                lines = int(block_height / line_height) + 1
            else:  # Decide by aspect ratio.
                if block_height / block_width > 1.2:  # Tall and narrow: do not split.
                    return [[x0, y0, x1, y1]]
                else:  # Otherwise split into two lines.
                    line_height = (y1 - y0) / 2
                    lines = 2
        # Determine the y position at which to start drawing lines.
        current_y = y0
        # Stores the synthetic line bboxes: [[x0, y0, x1, y1], ...]
        lines_positions = []
for i in range(lines):
lines_positions.append([x0, current_y, x1, current_y + line_height])
current_y += line_height
return lines_positions
else:
return [[x0, y0, x1, y1]]
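# Illustrative checks (not in the original module) on a hypothetical
# 1000x1000pt page: a wide flat block is split into three synthetic lines via
# the 0.4-page-width branch, while a tall narrow block is returned whole.
def _demo_insert_lines_into_block():
    assert len(insert_lines_into_block((0, 0, 500, 100), 10, 1000, 1000)) == 3
    assert insert_lines_into_block((0, 0, 50, 200), 10, 1000, 1000) == [[0, 0, 50, 200]]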
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
for block in fix_blocks:
if block['type'] in [
BlockType.Text, BlockType.Title, BlockType.InterlineEquation,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
]:
if len(block['lines']) == 0:
bbox = block['bbox']
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
for line in lines:
block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines)
else:
for line in block['lines']:
bbox = line['bbox']
page_line_list.append(bbox)
elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
bbox = block['bbox']
block["real_lines"] = copy.deepcopy(block['lines'])
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
block['lines'] = []
for line in lines:
block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines)
    # Sort with layoutreader.
x_scale = 1000.0 / page_w
y_scale = 1000.0 / page_h
boxes = []
# logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
for left, top, right, bottom in page_line_list:
if left < 0:
logger.warning(
f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
) # noqa: E501
left = 0
if right > page_w:
logger.warning(
f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
) # noqa: E501
right = page_w
if top < 0:
logger.warning(
f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
) # noqa: E501
top = 0
if bottom > page_h:
logger.warning(
f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
) # noqa: E501
bottom = page_h
left = round(left * x_scale)
top = round(top * y_scale)
right = round(right * x_scale)
bottom = round(bottom * y_scale)
assert (
1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}' # noqa: E126, E121
boxes.append([left, top, right, bottom])
model_manager = ModelSingleton()
model = model_manager.get_model('layoutreader')
with torch.no_grad():
orders = do_predict(boxes, model)
sorted_bboxes = [page_line_list[i] for i in orders]
return sorted_bboxes
def get_line_height(blocks):
page_line_height_list = []
for block in blocks:
if block['type'] in [
BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
]:
for line in block['lines']:
bbox = line['bbox']
page_line_height_list.append(int(bbox[3] - bbox[1]))
if len(page_line_height_list) > 0:
return statistics.median(page_line_height_list)
else:
return 10
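# Illustrative check (not in the original module): the line height is the
# median over text-like blocks, with a fallback of 10 when no lines exist.
def _demo_get_line_height():
    blocks = [{'type': BlockType.Text, 'lines': [{'bbox': [0, 0, 10, 12]}, {'bbox': [0, 14, 10, 30]}]}]
    assert get_line_height(blocks) == 14
    assert get_line_height([]) == 10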
def process_groups(groups, body_key, caption_key, footnote_key):
body_blocks = []
caption_blocks = []
footnote_blocks = []
for i, group in enumerate(groups):
group[body_key]['group_id'] = i
body_blocks.append(group[body_key])
for caption_block in group[caption_key]:
caption_block['group_id'] = i
caption_blocks.append(caption_block)
for footnote_block in group[footnote_key]:
footnote_block['group_id'] = i
footnote_blocks.append(footnote_block)
return body_blocks, caption_blocks, footnote_blocks
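# Illustrative check (not in the original module): each group's body, caption
# and footnote blocks are flattened into three parallel lists and stamped
# with a shared group_id so they can be re-grouped later.
def _demo_process_groups():
    groups = [{'image_body': {'bbox': [0, 0, 1, 1]},
               'image_caption_list': [{'bbox': [0, 1, 1, 2]}],
               'image_footnote_list': []}]
    bodies, captions, footnotes = process_groups(
        groups, 'image_body', 'image_caption_list', 'image_footnote_list')
    assert bodies[0]['group_id'] == 0 and captions[0]['group_id'] == 0 and footnotes == []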
def process_block_list(blocks, body_type, block_type):
indices = [block['index'] for block in blocks]
median_index = statistics.median(indices)
body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])
return {
'type': block_type,
'bbox': body_bbox,
'blocks': blocks,
'index': median_index,
}
def revert_group_blocks(blocks):
image_groups = {}
table_groups = {}
new_blocks = []
for block in blocks:
if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]:
group_id = block['group_id']
if group_id not in image_groups:
image_groups[group_id] = []
image_groups[group_id].append(block)
elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]:
group_id = block['group_id']
if group_id not in table_groups:
table_groups[group_id] = []
table_groups[group_id].append(block)
else:
new_blocks.append(block)
for group_id, blocks in image_groups.items():
new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image))
for group_id, blocks in table_groups.items():
new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table))
return new_blocks
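# Illustrative check (not in the original module): a group's blocks collapse
# into one composite block whose bbox comes from the body block and whose
# index is the median of the member indices.
def _demo_process_block_list():
    blocks = [{'index': 1, 'type': BlockType.ImageBody, 'bbox': [0, 0, 5, 5]},
              {'index': 3, 'type': BlockType.ImageCaption, 'bbox': [0, 5, 5, 6]}]
    merged = process_block_list(blocks, BlockType.ImageBody, BlockType.Image)
    assert merged['bbox'] == [0, 0, 5, 5] and merged['index'] == 2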
def parse_page_core(ocr_status, config_path, local_image_dir,
    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
):
    need_drop = False
    drop_reason = []
    """Collect the block information from the magic_model object that is needed below."""
# img_blocks = magic_model.get_imgs(page_id)
# table_blocks = magic_model.get_tables(page_id)
img_groups = magic_model.get_imgs_v2(page_id)
table_groups = magic_model.get_tables_v2(page_id)
img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
)
table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
)
discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = (
magic_model.get_equations(page_id)
)
page_w, page_h = magic_model.get_page_size(page_id)
spans = magic_model.get_all_spans(page_id)
"""根据parse_mode,构造spans"""
if parse_mode == SupportedPdfParseMethod.TXT:
"""ocr 中文本类的 span 用 pymu spans 替换!"""
pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
spans = replace_text_span(pymu_spans, spans)
elif parse_mode == SupportedPdfParseMethod.OCR:
pass
else:
raise Exception('parse_mode must be txt or ocr')
"""删除重叠spans中置信度较低的那些"""
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
"""删除重叠spans中较小的那些"""
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
"""对image和table截图"""
spans = ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,
spans, page_doc, page_id, pdf_bytes_md5, imageWriter
)
"""将所有区块的bbox整理到一起"""
# interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks = []
if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equation_blocks,
page_w,
page_h,
)
else:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equations,
page_w,
page_h,
)
"""先处理不需要排版的discarded_blocks"""
discarded_block_with_spans, spans = fill_spans_in_blocks(
all_discarded_blocks, spans, 0.4
)
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
"""如果当前页面没有bbox则跳过"""
if len(all_bboxes) == 0:
logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
return ocr_construct_page_component_v2(
[],
[],
page_id,
page_w,
page_h,
[],
[],
[],
interline_equations,
fix_discarded_blocks,
need_drop,
drop_reason,
)
"""将span填入blocks中"""
block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
"""对block进行fix操作"""
fix_blocks = fix_block_spans_v2(block_with_spans)
"""获取所有line并计算正文line的高度"""
line_height = get_line_height(fix_blocks)
"""获取所有line并对line排序"""
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
"""根据line的中位数算block的序列关系"""
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
"""将image和table的block还原回group形式参与后续流程"""
fix_blocks = revert_group_blocks(fix_blocks)
"""重排block"""
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
"""获取QA需要外置的list"""
images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)
"""构造pdf_info_dict"""
page_info = ocr_construct_page_component_v2(
sorted_blocks,
[],
page_id,
page_w,
page_h,
[],
images,
tables,
interline_equations,
fix_discarded_blocks,
need_drop,
drop_reason,
)
return page_info
def pdf_parse_union(ocr_status, config_path, local_image_dir,
dataset: Dataset,
model_list,
imageWriter,
parse_mode,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
pdf_bytes_md5 = compute_md5(dataset.data_bits())
"""初始化空的pdf_info_dict"""
pdf_info_dict = {}
"""用model_list和docs对象初始化magic_model"""
magic_model = MagicModel(model_list, dataset)
"""根据输入的起始范围解析pdf"""
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
else len(dataset) - 1
)
if end_page_id > len(dataset) - 1:
logger.warning('end_page_id is out of range, use pdf_docs length')
end_page_id = len(dataset) - 1
"""初始化启动时间"""
start_time = time.time()
for page_id, page in enumerate(dataset):
"""debug时输出每页解析的耗时."""
if debug_mode:
time_now = time.time()
# logger.info(
# f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
# )
start_time = time_now
"""解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core(
ocr_status,config_path,local_image_dir,page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
)
else:
page_info = page.get_page_info()
page_w = page_info.w
page_h = page_info.h
page_info = ocr_construct_page_component_v2(
[], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
)
pdf_info_dict[f'page_{page_id}'] = page_info
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
'pdf_info': pdf_info_list,
}
clean_memory()
return new_pdf_info_dict
if __name__ == '__main__':
pass
from abc import ABC, abstractmethod
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
class AbsPipe(ABC):
"""
txt和ocr处理的抽象类
"""
PIP_OCR = "ocr"
PIP_TXT = "txt"
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
self.pdf_bytes = pdf_bytes
self.model_list = model_list
self.image_writer = image_writer
        self.pdf_mid_data = None  # uncompressed
self.is_debug = is_debug
self.start_page_id = start_page_id
self.end_page_id = end_page_id
self.lang = lang
self.layout_model = layout_model
self.formula_enable = formula_enable
self.table_enable = table_enable
def get_compress_pdf_mid_data(self):
return JsonCompressor.compress_json(self.pdf_mid_data)
@abstractmethod
def pipe_classify(self):
"""
有状态的分类
"""
raise NotImplementedError
@abstractmethod
def pipe_analyze(self):
"""
有状态的跑模型分析
"""
raise NotImplementedError
@abstractmethod
def pipe_parse(self):
"""
有状态的解析
"""
raise NotImplementedError
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
return md_content
@staticmethod
def classify(pdf_bytes: bytes) -> str:
"""
根据pdf的元数据,判断是文本pdf,还是ocr pdf
"""
pdf_meta = pdf_meta_scan(pdf_bytes)
if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
else:
is_encrypted = pdf_meta["is_encrypted"]
is_needs_password = pdf_meta["is_needs_password"]
if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")
else:
is_text_pdf, results = classify(
pdf_meta["total_page"],
pdf_meta["page_width_pts"],
pdf_meta["page_height_pts"],
pdf_meta["image_info_per_page"],
pdf_meta["text_len_per_page"],
pdf_meta["imgs_per_page"],
pdf_meta["text_layout_per_page"],
pdf_meta["invalid_chars"],
)
if is_text_pdf:
return AbsPipe.PIP_TXT
else:
return AbsPipe.PIP_OCR
@staticmethod
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
"""
根据pdf类型,生成统一格式content_list
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"]
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
return content_list
@staticmethod
    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
        """
        Generate markdown according to the pdf type.
        """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"]
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
return md_content
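# Usage sketch (not part of the original module): the static classifier maps
# raw pdf bytes to one of the two pipeline labels defined on AbsPipe.
def _demo_classify(pdf_bytes: bytes) -> str:
    kind = AbsPipe.classify(pdf_bytes)
    assert kind in (AbsPipe.PIP_TXT, AbsPipe.PIP_OCR)
    return kind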
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
layout_model, formula_enable, table_enable)
def pipe_classify(self):
pass
def pipe_analyze(self):
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info("ocr_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"ocr_pipe mk {md_make_mode} finished")
return result
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
layout_model, formula_enable, table_enable)
def pipe_classify(self):
pass
def pipe_analyze(self):
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info("txt_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"txt_pipe mk {md_make_mode} finished")
return result
import json
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
self.pdf_type = jso_useful_key["_pdf_type"]
super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id,
lang, layout_model, formula_enable, table_enable)
if len(self.model_list) == 0:
self.input_model_is_empty = True
else:
self.input_model_is_empty = False
def pipe_classify(self):
        # self.pdf_type = AbsPipe.classify(self.pdf_bytes)
        # logger.info('temporarily forcing the ocr pdf path')
self.pdf_type = self.PIP_OCR
    def pipe_analyze(self, model):
        if self.pdf_type == self.PIP_TXT:
            self.model_list = doc_analyze(model, self.pdf_bytes, ocr=False,
                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                          lang=self.lang, layout_model=self.layout_model,
                                          formula_enable=self.formula_enable, table_enable=self.table_enable)
        elif self.pdf_type == self.PIP_OCR:
            self.model_list = doc_analyze(model, self.pdf_bytes, ocr=True,
                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                          lang=self.lang, layout_model=self.layout_model,
                                          formula_enable=self.formula_enable, table_enable=self.table_enable)
    def pipe_parse(self, ocr_status, config_path, local_image_dir):
        if self.pdf_type == self.PIP_TXT:
            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                                lang=self.lang, layout_model=self.layout_model,
                                                formula_enable=self.formula_enable, table_enable=self.table_enable)
        elif self.pdf_type == self.PIP_OCR:
            self.pdf_mid_data = parse_ocr_pdf(ocr_status, config_path, local_image_dir, self.pdf_bytes, self.model_list, self.image_writer,
                                              is_debug=self.is_debug,
                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                              lang=self.lang)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info("uni_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"uni_pipe mk {md_make_mode} finished")
return result
if __name__ == '__main__':
    # Local test.
drw = DiskReaderWriter(r"D:/project/20231108code-clean")
pdf_file_path = r"linshixuqiu\19983-00.pdf"
model_file_path = r"linshixuqiu\19983-00.json"
pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
model_list = json.loads(model_json_txt)
write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
img_bucket_path = "imgs"
img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
# pdf_type = UNIPipe.classify(pdf_bytes)
# jso_useful_key = {
# "_pdf_type": pdf_type,
# "model_list": model_list
# }
jso_useful_key = {
"_pdf_type": "",
"model_list": model_list
}
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
pipe.pipe_classify()
    # pipe_parse now requires ocr_status, config_path and local_image_dir;
    # the values below are hypothetical placeholders for this local test.
    pipe.pipe_parse(None, "config.ini", write_path)
md_content = pipe.pipe_mk_markdown(img_bucket_path)
content_list = pipe.pipe_mk_uni_format(img_bucket_path)
md_writer = DiskReaderWriter(write_path)
md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)
md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json",
AbsReaderWriter.MODE_TXT)
md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
import os
import sys
import json
import re
import math
import unicodedata
from collections import Counter
import numpy as np
from termcolor import cprint
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.nlp_utils import NLPModels
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
def open_pdf(pdf_path):
try:
pdf_document = fitz.open(pdf_path) # type: ignore
return pdf_document
except Exception as e:
print(f"无法打开PDF文件:{pdf_path}。原因是:{e}")
raise e
def print_green_on_red(text):
cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
def print_green(text):
print()
cprint(text, "green", attrs=["bold"], end="\n\n")
def print_red(text):
print()
cprint(text, "red", attrs=["bold"], end="\n\n")
def print_yellow(text):
print()
cprint(text, "yellow", attrs=["bold"], end="\n\n")
def safe_get(dict_obj, key, default):
val = dict_obj.get(key)
if val is None:
return default
else:
return val
def is_bbox_overlap(bbox1, bbox2):
"""
This function checks if bbox1 and bbox2 overlap or not
Parameters
----------
bbox1 : list
bbox1
bbox2 : list
bbox2
Returns
-------
bool
True if bbox1 and bbox2 overlap, else False
"""
x0_1, y0_1, x1_1, y1_1 = bbox1
x0_2, y0_2, x1_2, y1_2 = bbox2
if x0_1 > x1_2 or x0_2 > x1_1:
return False
if y0_1 > y1_2 or y0_2 > y1_1:
return False
return True
def is_in_bbox(bbox1, bbox2):
"""
This function checks if bbox1 is in bbox2
Parameters
----------
bbox1 : list
bbox1
bbox2 : list
bbox2
Returns
-------
bool
True if bbox1 is in bbox2, else False
"""
x0_1, y0_1, x1_1, y1_1 = bbox1
x0_2, y0_2, x1_2, y1_2 = bbox2
if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
return True
else:
return False
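# Illustrative checks (not in the original module): is_bbox_overlap tests
# interval intersection on both axes; is_in_bbox tests full containment.
def _demo_bbox_predicates():
    assert is_bbox_overlap([0, 0, 10, 10], [5, 5, 15, 15])
    assert not is_bbox_overlap([0, 0, 1, 1], [2, 2, 3, 3])
    assert is_in_bbox([2, 2, 3, 3], [0, 0, 10, 10])
    assert not is_in_bbox([0, 0, 10, 10], [2, 2, 3, 3])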
def calculate_para_bbox(lines):
"""
This function calculates the minimum bbox of the paragraph
Parameters
----------
lines : list
lines
Returns
-------
para_bbox : list
bbox of the paragraph
"""
x0 = min(line["bbox"][0] for line in lines)
y0 = min(line["bbox"][1] for line in lines)
x1 = max(line["bbox"][2] for line in lines)
y1 = max(line["bbox"][3] for line in lines)
return [x0, y0, x1, y1]
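# Illustrative check (not in the original module): the paragraph bbox is the
# axis-aligned union of its line bboxes.
def _demo_calculate_para_bbox():
    lines = [{'bbox': [10, 10, 50, 20]}, {'bbox': [12, 22, 60, 32]}]
    assert calculate_para_bbox(lines) == [10, 10, 60, 32]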
def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
"""
This function checks if the line is right aligned from its neighbors
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is right aligned from its neighbors, False otherwise.
"""
horizontal_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
_, _, x1, _ = curr_line_bbox
_, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
_, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
if direction == 0:
return abs(x1 - prev_x1) < horizontal_thres
elif direction == 1:
return abs(x1 - next_x1) < horizontal_thres
elif direction == 2:
return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
else:
return False
def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
"""
This function checks if the line is left aligned from its neighbors
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is left aligned from its neighbors, False otherwise.
"""
horizontal_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
x0, _, _, _ = curr_line_bbox
prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
if direction == 0:
return abs(x0 - prev_x0) < horizontal_thres
elif direction == 1:
return abs(x0 - next_x0) < horizontal_thres
elif direction == 2:
return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
else:
return False
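# Illustrative check (not in the original module): with avg_char_width=10 the
# tolerance is 5pt, so a 2-3pt left-edge offset still counts as aligned.
def _demo_left_alignment():
    assert is_line_left_aligned_from_neighbors(
        [10, 0, 100, 10], [13, 0, 90, 10], [12, 0, 95, 10], avg_char_width=10)
    assert not is_line_left_aligned_from_neighbors(
        [10, 0, 100, 10], [30, 0, 90, 10], [12, 0, 95, 10], avg_char_width=10)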
def end_with_punctuation(line_text):
"""
This function checks if the line ends with punctuation marks
"""
english_end_puncs = [".", "?", "!"]
chinese_end_puncs = ["。", "?", "!"]
end_puncs = english_end_puncs + chinese_end_puncs
last_non_space_char = None
for ch in line_text[::-1]:
if not ch.isspace():
last_non_space_char = ch
break
if last_non_space_char is None:
return False
return last_non_space_char in end_puncs
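# Illustrative check (not in the original module): trailing whitespace is
# skipped, and both English and Chinese sentence-final marks qualify.
def _demo_end_with_punctuation():
    assert end_with_punctuation('Done.  ')
    assert end_with_punctuation('好。')
    assert not end_with_punctuation('Done,')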
def is_nested_list(lst):
if isinstance(lst, list):
return any(isinstance(sub, list) for sub in lst)
return False
class DenseSingleLineBlockException(Exception):
"""
This class defines the exception type for dense single line-block.
"""
def __init__(self, message="DenseSingleLineBlockException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class TitleDetectionException(Exception):
"""
This class defines the exception type for title detection.
"""
def __init__(self, message="TitleDetectionException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class TitleLevelException(Exception):
"""
This class defines the exception type for title level.
"""
def __init__(self, message="TitleLevelException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class ParaSplitException(Exception):
"""
This class defines the exception type for paragraph splitting.
"""
def __init__(self, message="ParaSplitException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class ParaMergeException(Exception):
"""
This class defines the exception type for paragraph merging.
"""
def __init__(self, message="ParaMergeException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class DiscardByException:
"""
This class discards pdf files by exception
"""
def __init__(self) -> None:
pass
def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
"""
This function discards pdf files by single line block exception
Parameters
----------
pdf_dic : dict
pdf dictionary
exception : str
exception message
Returns
-------
error_message : str
"""
exception_page_nums = 0
page_num = 0
for page_id, page in pdf_dic.items():
if page_id.startswith("page_"):
page_num += 1
if "preproc_blocks" in page.keys():
preproc_blocks = page["preproc_blocks"]
all_single_line_blocks = []
for block in preproc_blocks:
if len(block["lines"]) == 1:
all_single_line_blocks.append(block)
if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
exception_page_nums += 1
if page_num == 0:
return None
        if exception_page_nums / page_num > 0.1:  # A low threshold: in practice, once this ratio is exceeded the pdf is discarded.
return exception.message
return None
def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
"""
This function discards pdf files by title detection exception
Parameters
----------
pdf_dic : dict
pdf dictionary
exception : str
exception message
Returns
-------
error_message : str
"""
# return exception.message
return None
def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
"""
This function discards pdf files by title level exception
Parameters
----------
pdf_dic : dict
pdf dictionary
exception : str
exception message
Returns
-------
error_message : str
"""
# return exception.message
return None
def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
"""
This function discards pdf files by split para exception
Parameters
----------
pdf_dic : dict
pdf dictionary
exception : str
exception message
Returns
-------
error_message : str
"""
# return exception.message
return None
def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
"""
This function discards pdf files by merge para exception
Parameters
----------
pdf_dic : dict
pdf dictionary
exception : str
exception message
Returns
-------
error_message : str
"""
# return exception.message
return None
class LayoutFilterProcessor:
def __init__(self) -> None:
pass
def batch_process_blocks(self, pdf_dict):
"""
This function processes the blocks in batch.
Parameters
----------
self : object
The instance of the class.
pdf_dict : dict
pdf dictionary
Returns
-------
pdf_dict : dict
pdf dictionary
"""
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
layout_bbox_objs = blocks["layout_bboxes"]
if layout_bbox_objs is None:
continue
layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
# Enlarge each value of x0, y0, x1, y1 for each layout_bbox to prevent loss of text.
layout_bboxes = [
[math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
]
para_blocks = blocks["para_blocks"]
if para_blocks is None:
continue
for lb_bbox in layout_bboxes:
for i, para_block in enumerate(para_blocks):
para_bbox = para_block["bbox"]
para_blocks[i]["in_layout"] = 0
if is_in_bbox(para_bbox, lb_bbox):
para_blocks[i]["in_layout"] = 1
blocks["para_blocks"] = para_blocks
return pdf_dict
class RawBlockProcessor:
def __init__(self) -> None:
self.y_tolerance = 2
self.pdf_dic = {}
def __span_flags_decomposer(self, span_flags):
"""
Make font flags human readable.
Parameters
----------
self : object
The instance of the class.
span_flags : int
span flags
Returns
-------
l : dict
decomposed flags
"""
l = {
"is_superscript": False,
"is_italic": False,
"is_serifed": False,
"is_sans_serifed": False,
"is_monospaced": False,
"is_proportional": False,
"is_bold": False,
}
        if span_flags & 2**0:
            l["is_superscript"] = True  # superscript
        if span_flags & 2**1:
            l["is_italic"] = True  # italic
        if span_flags & 2**2:
            l["is_serifed"] = True  # serif typeface
        else:
            l["is_sans_serifed"] = True  # sans-serif typeface
        if span_flags & 2**3:
            l["is_monospaced"] = True  # monospaced font
        else:
            l["is_proportional"] = True  # proportionally spaced font
        if span_flags & 2**4:
            l["is_bold"] = True  # bold
return l
def __make_new_lines(self, raw_lines):
"""
This function makes new lines.
Parameters
----------
self : object
The instance of the class.
raw_lines : list
raw lines
Returns
-------
new_lines : list
new lines
"""
new_lines = []
new_line = None
for raw_line in raw_lines:
raw_line_bbox = raw_line["bbox"]
raw_line_spans = raw_line["spans"]
raw_line_text = "".join([span["text"] for span in raw_line_spans])
raw_line_dir = raw_line.get("dir", None)
decomposed_line_spans = []
for span in raw_line_spans:
raw_flags = span["flags"]
decomposed_flags = self.__span_flags_decomposer(raw_flags)
span["decomposed_flags"] = decomposed_flags
decomposed_line_spans.append(span)
if new_line is None: # Handle the first line
new_line = {
"bbox": raw_line_bbox,
"text": raw_line_text,
"dir": raw_line_dir if raw_line_dir else (0, 0),
"spans": decomposed_line_spans,
}
else:  # Handle the remaining lines
if (
abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
):
new_line["bbox"] = (
min(new_line["bbox"][0], raw_line_bbox[0]), # left
new_line["bbox"][1], # top
max(new_line["bbox"][2], raw_line_bbox[2]), # right
raw_line_bbox[3], # bottom
)
new_line["text"] += raw_line_text
new_line["spans"].extend(raw_line_spans)
new_line["dir"] = (
new_line["dir"][0] + raw_line_dir[0],
new_line["dir"][1] + raw_line_dir[1],
)
else:
new_lines.append(new_line)
new_line = {
"bbox": raw_line_bbox,
"text": raw_line_text,
"dir": raw_line_dir if raw_line_dir else (0, 0),
"spans": raw_line_spans,
}
if new_line:
new_lines.append(new_line)
return new_lines
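# Worked example (illustrative): with y_tolerance = 2, two raw lines whose bboxes
# share a baseline, e.g. [10, 100, 50, 112] and [55, 101, 90, 112], are merged into
# one logical line spanning [10, 100, 90, 112] with their texts concatenated;
# a following line at y0 = 130 would instead start a new entry in new_lines.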
def __make_new_block(self, raw_block):
"""
This function makes a new block.
Parameters
----------
self : object
The instance of the class.
raw_block : dict
a raw block
Returns
-------
new_block : dict
"""
new_block = {}
block_id = raw_block["number"]
block_bbox = raw_block["bbox"]
block_text = "".join(span["text"] for line in raw_block["lines"] for span in line["spans"])
raw_lines = raw_block["lines"]
block_lines = self.__make_new_lines(raw_lines)
new_block["block_id"] = block_id
new_block["bbox"] = block_bbox
new_block["text"] = block_text
new_block["lines"] = block_lines
return new_block
def batch_process_blocks(self, pdf_dic):
"""
This function processes the blocks in batch.
Parameters
----------
self : object
The instance of the class.
pdf_dic : dict
pdf dictionary whose pages carry raw "preproc_blocks"
Returns
-------
pdf_dic : dict
pdf dictionary with "para_blocks" added to each page
"""
for page_id, blocks in pdf_dic.items():
if page_id.startswith("page_"):
para_blocks = []
if "preproc_blocks" in blocks.keys():
input_blocks = blocks["preproc_blocks"]
for raw_block in input_blocks:
new_block = self.__make_new_block(raw_block)
para_blocks.append(new_block)
blocks["para_blocks"] = para_blocks
return pdf_dic
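# Usage sketch (illustrative): turns PyMuPDF "preproc_blocks" into "para_blocks"
# whose lines are baseline-merged and whose spans carry decomposed_flags:
#     pdf_dic = RawBlockProcessor().batch_process_blocks(pdf_dic)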
class BlockStatisticsCalculator:
"""
This class calculates the statistics of the block.
"""
def __init__(self) -> None:
pass
def __calc_stats_of_new_lines(self, new_lines):
"""
This function calculates the paragraph metrics
Parameters
----------
new_lines : list
merged lines of the block
Returns
-------
X0 : float
Median of x0 values, which represents the left average boundary of the block
X1 : float
Median of x1 values, which represents the right average boundary of the block
avg_char_width : float
Average of char widths, which represents the average char width of the block
avg_char_height : float
Average of span font sizes, used as the average line height of the block
max_freq_font_type : str
Font type of the longest span, used as the dominant font type of the block
avg_font_size : float
Average of span font sizes in the block
(avg_dir_horizontal, avg_dir_vertical) : tuple
Average writing direction of the lines
median_font_size : float
Median of span font sizes in the block
"""
x0_values = []
x1_values = []
char_widths = []
char_heights = []
block_font_types = []
block_font_sizes = []
block_directions = []
if len(new_lines) > 0:
for i, line in enumerate(new_lines):
line_bbox = line["bbox"]
line_text = line["text"]
line_spans = line["spans"]
num_chars = len([ch for ch in line_text if not ch.isspace()])
x0_values.append(line_bbox[0])
x1_values.append(line_bbox[2])
if num_chars > 0:
char_width = (line_bbox[2] - line_bbox[0]) / num_chars
char_widths.append(char_width)
for span in line_spans:
block_font_types.append(span["font"])
block_font_sizes.append(span["size"])
if "dir" in line:
block_directions.append(line["dir"])
# line_font_types = [span["font"] for span in line_spans]
char_heights = [span["size"] for span in line_spans]
X0 = np.median(x0_values) if x0_values else 0
X1 = np.median(x1_values) if x1_values else 0
avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
# max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
max_span_length = 0
max_span_font_type = None
for line in new_lines:
line_spans = line["spans"]
for span in line_spans:
span_length = span["bbox"][2] - span["bbox"][0]
if span_length > max_span_length:
max_span_length = span_length
max_span_font_type = span["font"]
max_freq_font_type = max_span_font_type
avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
avg_dir_horizontal = sum([d[0] for d in block_directions]) / len(block_directions) if block_directions else 0
avg_dir_vertical = sum([d[1] for d in block_directions]) / len(block_directions) if block_directions else 0
median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
return (
X0,
X1,
avg_char_width,
avg_char_height,
max_freq_font_type,
avg_font_size,
(avg_dir_horizontal, avg_dir_vertical),
median_font_size,
)
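# Example (illustrative): for a two-line block whose lines span x0 = 71.8/72.1 and
# x1 = 540.0/523.4 with 12 pt spans, the returned tuple would look like
#     (71.95, 531.7, 5.1, 12.0, "TimesNewRomanPSMT", 12.0, (1.0, 0.0), 12.0)
# i.e. (X0, X1, avg_char_width, avg_char_height, font_type, avg_size, dir, median_size).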
def __make_new_block(self, input_block):
new_block = {}
raw_lines = input_block["lines"]
stats = self.__calc_stats_of_new_lines(raw_lines)
block_id = input_block["block_id"]
block_bbox = input_block["bbox"]
block_text = input_block["text"]
block_lines = raw_lines
block_avg_left_boundary = stats[0]
block_avg_right_boundary = stats[1]
block_avg_char_width = stats[2]
block_avg_char_height = stats[3]
block_font_type = stats[4]
block_font_size = stats[5]
block_direction = stats[6]
block_median_font_size = stats[7]
new_block["block_id"] = block_id
new_block["bbox"] = block_bbox
new_block["text"] = block_text
new_block["dir"] = block_direction
new_block["X0"] = block_avg_left_boundary
new_block["X1"] = block_avg_right_boundary
new_block["avg_char_width"] = block_avg_char_width
new_block["avg_char_height"] = block_avg_char_height
new_block["block_font_type"] = block_font_type
new_block["block_font_size"] = block_font_size
new_block["lines"] = block_lines
new_block["median_font_size"] = block_median_font_size
return new_block
def batch_process_blocks(self, pdf_dic):
"""
This function processes the blocks in batch.
Parameters
----------
self : object
The instance of the class.
pdf_dic : dict
pdf dictionary whose pages carry "para_blocks"; each block's schema matches the value of the key "preproc_blocks"
Returns
-------
pdf_dic : dict
pdf dictionary with statistics attached to each block
"""
for page_id, blocks in pdf_dic.items():
if page_id.startswith("page_"):
para_blocks = []
if "para_blocks" in blocks.keys():
input_blocks = blocks["para_blocks"]
for input_block in input_blocks:
new_block = self.__make_new_block(input_block)
para_blocks.append(new_block)
blocks["para_blocks"] = para_blocks
return pdf_dic
class DocStatisticsCalculator:
"""
This class calculates the statistics of the document.
"""
def __init__(self) -> None:
pass
def calc_stats_of_doc(self, pdf_dict):
"""
This function computes the statistics of the document
Parameters
----------
pdf_dict : dict
pdf dictionary
Returns
-------
pdf_dict : dict
pdf dictionary with a "statistics" key added
"""
total_text_length = 0
total_num_blocks = 0
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "para_blocks" in blocks.keys():
para_blocks = blocks["para_blocks"]
for para_block in para_blocks:
total_text_length += len(para_block["text"])
total_num_blocks += 1
avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
font_list = []
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "para_blocks" in blocks.keys():
input_blocks = blocks["para_blocks"]
for input_block in input_blocks:
block_text_length = len(input_block.get("text", ""))
if block_text_length < avg_text_length * 0.5:
continue
block_font_type = safe_get(input_block, "block_font_type", "")
block_font_size = safe_get(input_block, "block_font_size", 0)
font_list.append((block_font_type, block_font_size))
font_counter = Counter(font_list)
most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
statistics = {
"num_pages": 0,
"num_blocks": 0,
"num_paras": 0,
"num_titles": 0,
"num_header_blocks": 0,
"num_footer_blocks": 0,
"num_watermark_blocks": 0,
"num_vertical_margin_note_blocks": 0,
"most_common_font_type": most_common_font[0][0],
"most_common_font_size": most_common_font[0][1],
"number_of_most_common_font": most_common_font[1],
"second_most_common_font_type": second_most_common_font[0][0],
"second_most_common_font_size": second_most_common_font[0][1],
"number_of_second_most_common_font": second_most_common_font[1],
"avg_text_length": avg_text_length,
}
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
blocks = pdf_dict[page_id]["para_blocks"]
statistics["num_pages"] += 1
for block_id, block_data in enumerate(blocks):
statistics["num_blocks"] += 1
if "paras" in block_data.keys():
statistics["num_paras"] += len(block_data["paras"])
for line in block_data["lines"]:
if line.get("is_title", 0):
statistics["num_titles"] += 1
if block_data.get("is_header", 0):
statistics["num_header_blocks"] += 1
if block_data.get("is_footer", 0):
statistics["num_footer_blocks"] += 1
if block_data.get("is_watermark", 0):
statistics["num_watermark_blocks"] += 1
if block_data.get("is_vertical_margin_note", 0):
statistics["num_vertical_margin_note_blocks"] += 1
pdf_dict["statistics"] = statistics
return pdf_dict
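# Example (illustrative) of the attached pdf_dict["statistics"]:
#     {"num_pages": 12, "num_blocks": 340, "num_paras": 0, "num_titles": 0,
#      "most_common_font_type": "TimesNewRomanPSMT", "most_common_font_size": 10.5,
#      "avg_text_length": 312.4, ...}
# num_paras/num_titles are refreshed by the later paragraph and title processors;
# the font entries ignore blocks shorter than half the average block length.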
class TitleProcessor:
"""
This class processes the title.
"""
def __init__(self, *doc_statistics) -> None:
# Fall back to an empty dict so the safe_get lookups below do not fail when no statistics are passed.
self.doc_statistics = doc_statistics[0] if len(doc_statistics) > 0 else {}
self.nlp_model = NLPModels()
self.MAX_TITLE_LEVEL = 3
self.numbered_title_pattern = r"""
^ # 行首
( # 开始捕获组
[\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1)
|\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2)
|[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A)
|[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A)
|[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I)
|[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I)
|\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1
|[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、
|[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一)
|[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1
|[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a)
|[a-z]\)\s # 小写字母后跟右括号和空格,例如:a)
|[A-Z]-\s # 大写字母后跟短横线和空格,例如:A-
|\w+:\s # 英文序号词后跟冒号和空格,例如:First:
|第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
|[IVXLCDM]+\. # 罗马数字后跟点,例如:I.
|\d+\.\s # 单个数字后跟点和空格,例如:1.
) # 结束捕获组
.+ # 标题的其余部分
"""
def _is_potential_title(
self,
curr_line,
prev_line,
prev_line_is_title,
next_line,
avg_char_width,
avg_char_height,
median_font_size,
):
"""
This function checks if the line is a potential title.
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
prev_line_is_title : bool
whether the previous line was detected as a title
next_line : dict
next line
avg_char_width : float
average of char widths
avg_char_height : float
average of line heights
median_font_size : float
median font size of the block
Returns
-------
bool
True if the line is a potential title, False otherwise.
"""
def __is_line_centered(line_bbox, page_bbox, avg_char_width):
"""
This function checks if the line is centered on the page
Parameters
----------
line_bbox : list
bbox of the line
page_bbox : list
bbox of the page
avg_char_width : float
average of char widths
Returns
-------
bool
True if the line is centered on the page, False otherwise.
"""
horizontal_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
x0, _, x1, _ = line_bbox
_, _, page_x1, _ = page_bbox
return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
def __is_bold_font_line(line):
"""
Check if every span in the line uses a bold font style (empty or whitespace-only spans count as not bold).
"""
def _is_bold_span(span):
# if span text is empty or only contains space, return False
if not span["text"].strip():
return False
return bool(span["flags"] & 2**4) # Check if the font is bold
for span in line["spans"]:
if not _is_bold_span(span):
return False
return True
def __is_italic_font_line(line):
"""
Check if every span in the line uses an italic font style.
"""
def __is_italic_span(span):
return bool(span["flags"] & 2**1) # Check if the font is italic
for span in line["spans"]:
if not __is_italic_span(span):
return False
return True
def __is_punctuation_heavy(line_text):
"""
Check if the line contains a high ratio of punctuation marks, which may indicate
that the line is not a title.
Parameters:
line_text (str): Text of the line.
Returns:
bool: True if the line is heavy with punctuation, False otherwise.
"""
# Pattern for common title format like "X.Y. Title"
pattern = r"\b\d+\.\d+\..*\b"
# If the line matches the title format, return False
if re.match(pattern, line_text.strip()):
return False
# Find all punctuation marks in the line
punctuation_marks = re.findall(r"[^\w\s]", line_text)
number_of_punctuation_marks = len(punctuation_marks)
text_length = len(line_text)
if text_length == 0:
return False
punctuation_ratio = number_of_punctuation_marks / text_length
if punctuation_ratio >= 0.1:
return True
return False
def __has_mixed_font_styles(spans, strict_mode=False):
"""
This function checks if the line has mixed font styles, the strict mode will compare the font types
Parameters
----------
spans : list
spans of the line
strict_mode : bool
True for strict mode, the font types will be fully compared
False for non-strict mode, the font types will be compared by their longest common prefix
Returns
-------
bool
True if the line has mixed font styles, False otherwise.
"""
if strict_mode:
font_styles = set()
for span in spans:
font_style = span["font"].lower()
font_styles.add(font_style)
return len(font_styles) > 1
else: # non-strict mode
font_styles = []
for span in spans:
font_style = span["font"].lower()
font_styles.append(font_style)
if len(font_styles) > 1:
longest_common_prefix = os.path.commonprefix(font_styles)
if len(longest_common_prefix) > 0:
return False
else:
return True
else:
return False
def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
"""
This function checks if the current line has a different font type from the previous and next lines
Parameters
----------
curr_line_font_type : str
font type of the current line
prev_line_font_type : str
font type of the previous line
next_line_font_type : str
font type of the next line
Returns
-------
bool
True if the current line has a different font type from the previous and next lines, False otherwise.
"""
return all(
curr_line_font_type != other_font_type.lower()
for other_font_type in [prev_line_font_type, next_line_font_type]
if other_font_type is not None
)
def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
"""
This function checks if the current line has a larger font size than the previous and next lines
Parameters
----------
curr_line_font_size : float
font size of the current line
prev_line_font_size : float
font size of the previous line
next_line_font_size : float
font size of the next line
Returns
-------
bool
True if the current line has a larger font size than the previous and next lines, False otherwise.
"""
return all(
curr_line_font_size > other_font_size * 1.2
for other_font_size in [prev_line_font_size, next_line_font_size]
if other_font_size is not None
)
def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
"""
This function checks if the current line is similar to the previous line
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
Returns
-------
bool
True if the current line is similar to the previous line, False otherwise.
"""
if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
return True
else:
return False
def __is_same_font_type_of_docAvg(curr_line_font_type):
"""
This function checks if the current line has the same font type as the document average font type
Parameters
----------
curr_line_font_type : str
font type of the current line
Returns
-------
bool
True if the current line has the same font type as the document average font type, False otherwise.
"""
doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
"""
This function checks if the current line has a large enough font size
Parameters
----------
curr_line_font_size : float
font size of the current line
ratio : float
ratio of the current line font size to the document average font size
Returns
-------
bool
True if the current line has a large enough font size, False otherwise.
"""
doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
return curr_line_font_size >= doc_avg_font_size * ratio
def __is_sufficient_spacing_above_and_below(
curr_line_bbox,
prev_line_bbox,
next_line_bbox,
avg_char_height,
median_font_size,
):
"""
This function checks if the current line has sufficient spacing above and below
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_height : float
average of line heights
median_font_size : float
median font size of the block, used to derive the spacing threshold
Returns
-------
tuple of bool
(sufficient_spacing_above, sufficient_spacing_below)
"""
vertical_ratio = 1.25
vertical_thres = vertical_ratio * median_font_size
_, y0, _, y1 = curr_line_bbox
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
sufficient_spacing_above = vertical_spacing_above > vertical_thres
else:
sufficient_spacing_above = True
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
sufficient_spacing_below = vertical_spacing_below > vertical_thres
else:
sufficient_spacing_below = True
return (sufficient_spacing_above, sufficient_spacing_below)
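# Worked example (illustrative): with median_font_size = 10 the threshold is 1.25 * 10 = 12.5.
# For prev_line_bbox = (_, 105, _, 120) and curr_line_bbox = (_, 140, _, 152),
# spacing above = min(140 - 105, 152 - 120) = 32 > 12.5, so sufficient_spacing_above is True.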
def __is_word_list_line_by_rules(curr_line_text):
"""
This function checks if the current line is a word list
Parameters
----------
curr_line_text : str
text of the current line
Returns
-------
bool
True if the current line looks like a name or word list, False otherwise.
"""
# name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)"
name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[,,;;\s]|$)"
compiled_pattern = re.compile(name_list_pattern)
if compiled_pattern.search(curr_line_text):
return True
else:
return False
def __get_text_catgr_by_nlp(curr_line_text):
"""
This function detects the entity category of the current line using an nlp model, such as spacy
Parameters
----------
curr_line_text : str
text of the current line
Returns
-------
result : str
the detected entity category, e.g. "PERSON", "GPE" or "ORG"
"""
result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
return result
def __is_numbered_title(curr_line_text):
"""
This function checks if the current line is a numbered title
Parameters
----------
curr_line_text : str
text of the current line
Returns
-------
bool
True if the current line is a numbered title, False otherwise.
"""
compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
if compiled_pattern.search(curr_line_text):
return True
else:
return False
def __is_end_with_ending_puncs(line_text):
"""
This function checks if the current line ends with an ending punctuation mark
Parameters
----------
line_text : str
text of the current line
Returns
-------
bool
True if the current line ends with a punctuation mark, False otherwise.
"""
end_puncs = [".", "?", "!", "。", "?", "!", "…"]
line_text = line_text.rstrip()
if line_text and line_text[-1] in end_puncs:  # guard against lines that are only whitespace
return True
return False
def __contains_only_no_meaning_symbols(line_text):
"""
This function checks if the current line contains only symbols that have no meaning; if so, it is not a title.
Covered situations:
1. The line has only punctuation marks
2. The line has only other meaningless symbols
Parameters
----------
line_text : str
text of the current line
Returns
-------
bool
True if the current line contains only symbols that have no meaning, False otherwise.
"""
punctuation_marks = re.findall(r"[^\w\s]", line_text) # find all punctuation marks
number_of_punctuation_marks = len(punctuation_marks)
text_length = len(line_text)
if text_length == 0:
return False
punctuation_ratio = number_of_punctuation_marks / text_length
if punctuation_ratio >= 0.9:
return True
return False
def __is_equation(line_text):
"""
This function checks if the current line is an equation.
Parameters
----------
line_text : str
Returns
-------
bool
True if the current line is an equation, False otherwise.
"""
equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations
if re.search(equation_reg, line_text):
return True
else:
return False
def __is_title_by_len(text, max_length=200):
"""
This function checks if the current line is a title by length.
Parameters
----------
text : str
text of the current line
max_length : int
max length of the title
Returns
-------
bool
True if the current line is a title, False otherwise.
"""
text = text.strip()
return len(text) <= max_length
def __compute_line_font_type_and_size(curr_line):
"""
This function computes the font type and font size of the line.
Parameters
----------
line : dict
line
Returns
-------
font_type : str
font type of the line
font_size : float
font size of the line
"""
spans = curr_line["spans"]
max_accumulated_length = 0
max_span_font_size = curr_line["spans"][0]["size"] # default value, float type
max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type
for span in spans:
if span["text"].isspace():
continue
span_length = span["bbox"][2] - span["bbox"][0]
if span_length > max_accumulated_length:
max_accumulated_length = span_length
max_span_font_size = span["size"]
max_span_font_type = span["font"].lower()
return max_span_font_type, max_span_font_size
def __is_a_consistent_sub_title(pre_line, curr_line):
"""
This function checks if the current line is a consistent sub title.
Parameters
----------
pre_line : dict
previous line
curr_line : dict
current line
Returns
-------
bool
True if the current line is a consistent sub title, False otherwise.
"""
if pre_line is None:
return False
if not pre_line["text"] or not curr_line["text"]:  # guard empty texts before indexing
return False
start_letter_of_pre_line = pre_line["text"][0]
start_letter_of_curr_line = curr_line["text"][0]
has_same_prefix_digit = (
start_letter_of_pre_line.isdigit()
and start_letter_of_curr_line.isdigit()
and start_letter_of_pre_line == start_letter_of_curr_line
)
# prefix text of curr_line satisfies the following title format: x.x
prefix_text_pattern = r"^\d+\.\d+"
has_subtitle_format = re.match(prefix_text_pattern, curr_line["text"])
if has_same_prefix_digit or has_subtitle_format:
return True
return False
"""
Title detection main process.
"""
"""
Basic features about the current line.
"""
curr_line_bbox = curr_line["bbox"]
curr_line_text = curr_line["text"]
curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
if len(curr_line_text.strip()) == 0: # skip empty lines
return False, False
prev_line_bbox = prev_line["bbox"] if prev_line else None
if prev_line:
prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
else:
prev_line_font_type, prev_line_font_size = None, None
next_line_bbox = next_line["bbox"] if next_line else None
if next_line:
next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
else:
next_line_font_type, next_line_font_size = None, None
"""
Aggregated features about the current line.
"""
is_italic_font = __is_italic_font_line(curr_line)
is_bold_font = __is_bold_font_line(curr_line)
is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
curr_line_font_size, prev_line_font_size, next_line_font_size
)
is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
curr_line_font_type, prev_line_font_type, next_line_font_type
)
has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
)
is_similar_to_pre_line = __is_similar_to_pre_line(
curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
)
is_consis_sub_title = __is_a_consistent_sub_title(prev_line, curr_line)
"""
Further aggregated features about the current line.
Attention:
Features that start with __ are for internal use.
"""
__is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
)
__is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
is_a_left_inline_title = (
is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
)
is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
is_title_by_check_pre_and_next_line = (
(prev_line is not None or next_line is not None)
and has_sufficient_spaces_above
and has_sufficient_spaces_below
and is_potential_title_font
)
is_numbered_title = __is_numbered_title(curr_line_text) and (
(has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
)
is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
is_equation = __is_equation(curr_line_text)
is_title_by_len = __is_title_by_len(curr_line_text)
"""
Decide if the line is a title.
"""
is_title = (
is_not_end_with_ending_puncs # not end with ending punctuation marks
and is_not_only_no_meaning_symbols # not only have no meaning symbols
and is_title_by_len # is a title by length, default max length is 200
and not is_equation # an interline equation should never be a title
and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
and (
(is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
or (
is_much_larger_font_than_doc_avg
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
or (
is_font_size_little_less_than_doc_avg
and is_bold_font
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
) # Consider the following situations: bold font, much larger font than doc avg, not same font type as doc avg, sufficient spacing above and below
and (
(
not is_person_or_org_list_line_by_nlp
and (
is_much_larger_font_than_doc_avg
or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
)
)
or (
not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
and not is_a_left_inline_title
and not is_punctuation_heavy
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
or (
is_person_or_org_list_line_by_nlp
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
)
or (is_numbered_title and not is_a_left_inline_title)
) # Exclude the following situations: person/org list
)
# ) or (prev_line_is_title and is_consis_sub_title)
is_name_or_org_list_to_be_removed = (
(is_person_or_org_list_line_by_nlp)
and is_punctuation_heavy
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
) and not is_title
if is_name_or_org_list_to_be_removed:
is_author_or_org_list = True
else:
is_author_or_org_list = False
# return is_title, is_author_or_org_list
"""
# print reason why the line is a title
if is_title:
print_green("This line is a title.")
print_green("↓" * 10)
print()
print("curr_line_text: ", curr_line_text)
print()
# print reason why the line is not a title
line_text = curr_line_text.strip()
test_text = "Career/Personal Life"
text_content_condition = line_text == test_text
if not is_title and text_content_condition: # Print specific line
# if not is_title: # Print each line
print_red("This line is not a title.")
print_red("↓" * 10)
print()
print("curr_line_text: ", curr_line_text)
print()
if is_not_end_with_ending_puncs:
print_green(f"is_not_end_with_ending_puncs")
else:
print_red(f"is_end_with_ending_puncs")
if is_not_only_no_meaning_symbols:
print_green(f"is_not_only_no_meaning_symbols")
else:
print_red(f"is_only_no_meaning_symbols")
if is_title_by_len:
print_green(f"is_title_by_len: {is_title_by_len}")
else:
print_red(f"is_not_title_by_len: {is_title_by_len}")
if is_equation:
print_red(f"is_equation")
else:
print_green(f"is_not_equation")
if is_potential_title_font:
print_green(f"is_potential_title_font")
else:
print_red(f"is_not_potential_title_font")
if is_punctuation_heavy:
print_red("is_punctuation_heavy")
else:
print_green("is_not_punctuation_heavy")
if is_bold_font:
print_green(f"is_bold_font")
else:
print_red(f"is_not_bold_font")
if is_font_size_not_less_than_doc_avg:
print_green(f"is_larger_font_than_doc_avg")
else:
print_red(f"is_not_larger_font_than_doc_avg")
if is_much_larger_font_than_doc_avg:
print_green(f"is_much_larger_font_than_doc_avg")
else:
print_red(f"is_not_much_larger_font_than_doc_avg")
if is_not_same_font_type_of_docAvg:
print_green(f"is_not_same_font_type_of_docAvg")
else:
print_red(f"is_same_font_type_of_docAvg")
if is_word_list_line_by_rules:
print_red("is_word_list_line_by_rules")
else:
print_green("is_not_name_list_by_rules")
if is_person_or_org_list_line_by_nlp:
print_red("is_person_or_org_list_line_by_nlp")
else:
print_green("is_not_person_or_org_list_line_by_nlp")
if not is_numbered_title:
print_red("is_not_numbered_title")
else:
print_green("is_numbered_title")
if is_a_left_inline_title:
print_red("is_a_left_inline_title")
else:
print_green("is_not_a_left_inline_title")
if not is_title_by_check_prev_line:
print_red("is_not_title_by_check_prev_line")
else:
print_green("is_title_by_check_prev_line")
if not is_title_by_check_next_line:
print_red("is_not_title_by_check_next_line")
else:
print_green("is_title_by_check_next_line")
if not is_title_by_check_pre_and_next_line:
print_red("is_not_title_by_check_pre_and_next_line")
else:
print_green("is_title_by_check_pre_and_next_line")
# print_green("Common features:")
# print_green("↓" * 10)
# print(f" curr_line_font_type: {curr_line_font_type}")
# print(f" curr_line_font_size: {curr_line_font_size}")
# print()
"""
return is_title, is_author_or_org_list
def _detect_title(self, input_block):
"""
Use the function '_is_potential_title' to detect titles of each paragraph block.
If a line is a title, then the value of key 'is_title' of the line will be set to True.
"""
raw_lines = input_block["lines"]
prev_line_is_title_flag = False
for i, curr_line in enumerate(raw_lines):
prev_line = raw_lines[i - 1] if i > 0 else None
next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
blk_avg_char_width = input_block["avg_char_width"]
blk_avg_char_height = input_block["avg_char_height"]
blk_media_font_size = input_block["median_font_size"]
is_title, is_author_or_org_list = self._is_potential_title(
curr_line,
prev_line,
prev_line_is_title_flag,
next_line,
blk_avg_char_width,
blk_avg_char_height,
blk_median_font_size,
)
if is_title:
curr_line["is_title"] = is_title
prev_line_is_title_flag = True
else:
curr_line["is_title"] = False
prev_line_is_title_flag = False
# print(f"curr_line['text']: {curr_line['text']}")
# print(f"curr_line['is_title']: {curr_line['is_title']}")
# print(f"prev_line['text']: {prev_line['text'] if prev_line else None}")
# print(f"prev_line_is_title_flag: {prev_line_is_title_flag}")
# print()
if is_author_or_org_list:
curr_line["is_author_or_org_list"] = is_author_or_org_list
else:
curr_line["is_author_or_org_list"] = False
return input_block
def batch_detect_titles(self, pdf_dic):
"""
This function batch process the blocks to detect titles.
Parameters
----------
pdf_dic : dict
result dictionary
Returns
-------
pdf_dic : dict
result dictionary with titles marked on lines and blocks
"""
num_titles = 0
for page_id, blocks in pdf_dic.items():
if page_id.startswith("page_"):
para_blocks = []
if "para_blocks" in blocks.keys():
para_blocks = blocks["para_blocks"]
all_single_line_blocks = []
for block in para_blocks:
if len(block["lines"]) == 1:
all_single_line_blocks.append(block)
new_para_blocks = []
if len(all_single_line_blocks) != len(para_blocks):  # Not all blocks are single-line blocks.
for para_block in para_blocks:
new_block = self._detect_title(para_block)
new_para_blocks.append(new_block)
num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
else: # All blocks are single line blocks.
for para_block in para_blocks:
new_para_blocks.append(para_block)
num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
para_blocks = new_para_blocks
blocks["para_blocks"] = para_blocks
for para_block in para_blocks:
all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
if (
all_titles and para_text_len < 200
): # total length of the paragraph is less than 200, more than this should not be a title
para_block["is_block_title"] = 1
else:
para_block["is_block_title"] = 0
all_name_or_org_list_to_be_removed = all(
safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
)
if all_name_or_org_list_to_be_removed and page_id == "page_0":
para_block["is_block_an_author_or_org_list"] = 1
else:
para_block["is_block_an_author_or_org_list"] = 0
pdf_dic["statistics"]["num_titles"] = num_titles
return pdf_dic
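# Usage sketch (illustrative; assumes the processors defined in this module and a
# pdf_dic that already carries "para_blocks" and "statistics"):
#     title_processor = TitleProcessor(pdf_dic["statistics"])
#     pdf_dic = title_processor.batch_detect_titles(pdf_dic)
#     pdf_dic = title_processor.batch_recog_title_level(pdf_dic)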
def _recog_title_level(self, title_blocks):
"""
This function determines the title level based on the font size of the title.
Parameters
----------
title_blocks : list
Returns
-------
title_blocks : list
"""
font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
# Use the mean and std of font sizes to remove extreme values
mean_font_size = np.mean(font_sizes)
std_font_size = np.std(font_sizes)
min_extreme_font_size = mean_font_size - std_font_size # type: ignore
max_extreme_font_size = mean_font_size + std_font_size # type: ignore
# Compute the threshold for title level
middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
if middle_font_sizes.size > 0:
middle_mean_font_size = np.mean(middle_font_sizes)
level_threshold = middle_mean_font_size
else:
level_threshold = mean_font_size
for tb in title_blocks:
title_block = tb["block"]
title_font_size = safe_get(title_block, "block_font_size", 0)
current_level = 1 # Initialize title level, the biggest level is 1
# print(f"Before adjustment by font size, {current_level}")
if title_font_size >= max_extreme_font_size:
current_level = 1
elif title_font_size <= min_extreme_font_size:
current_level = 3
elif float(title_font_size) >= float(level_threshold):
current_level = 2
else:
current_level = 3
# print(f"After adjustment by font size, {current_level}")
title_block["block_title_level"] = current_level
return title_blocks
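# Worked example (illustrative): for title font sizes [10, 12, 12, 16],
# mean = 12.5 and std ≈ 2.18, so the extreme cut-offs are ≈ 10.32 and ≈ 14.68:
# the 16 pt title becomes level 1, the 10 pt title level 3, and the 12 pt titles
# fall in the middle band, whose threshold is the mean of the non-extreme sizes
# (12.0), so they become level 2.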
def batch_recog_title_level(self, pdf_dic):
"""
This function batch process the blocks to recognize title level.
Parameters
----------
pdf_dic : dict
result dictionary
Returns
-------
pdf_dic : dict
result dictionary with "block_title_level" set on title blocks
"""
title_blocks = []
# Collect all titles
for page_id, blocks in pdf_dic.items():
if page_id.startswith("page_"):
para_blocks = blocks.get("para_blocks", [])
for block in para_blocks:
if block.get("is_block_title"):
title_obj = {"page_id": page_id, "block": block}
title_blocks.append(title_obj)
# Determine title level
if title_blocks:
# Determine title level based on font size
title_blocks = self._recog_title_level(title_blocks)
return pdf_dic
class BlockTerminationProcessor:
"""
This class is used to process the block termination.
"""
def __init__(self) -> None:
pass
def _is_consistent_lines(
self,
curr_line,
prev_line,
next_line,
consistent_direction, # 0 for prev, 1 for next, 2 for both
):
"""
This function checks if the line is consistent with its neighbors
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
next_line : dict
next line
consistent_direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is consistent with its neighbors, False otherwise.
"""
curr_line_font_size = curr_line["spans"][0]["size"]
curr_line_font_type = curr_line["spans"][0]["font"].lower()
if consistent_direction == 0:
if prev_line:
prev_line_font_size = prev_line["spans"][0]["size"]
prev_line_font_type = prev_line["spans"][0]["font"].lower()
return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
else:
return False
elif consistent_direction == 1:
if next_line:
next_line_font_size = next_line["spans"][0]["size"]
next_line_font_type = next_line["spans"][0]["font"].lower()
return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
else:
return False
elif consistent_direction == 2:
if prev_line and next_line:
prev_line_font_size = prev_line["spans"][0]["size"]
prev_line_font_type = prev_line["spans"][0]["font"].lower()
next_line_font_size = next_line["spans"][0]["size"]
next_line_font_type = next_line["spans"][0]["font"].lower()
return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
)
else:
return False
else:
return False
def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
"""
This function checks if the line is a regular line
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_line_height : float
average of line heights
Returns
-------
bool
True if the line is a regular line, False otherwise.
"""
horizontal_ratio = 0.5
vertical_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
vertical_thres = vertical_ratio * avg_line_height
x0, y0, x1, y1 = curr_line_bbox
x0_near_X0 = abs(x0 - X0) < horizontal_thres
x1_near_X1 = abs(x1 - X1) < horizontal_thres
prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = y1 - prev_line_bbox[3]
sufficient_spacing_above = vertical_spacing_above > vertical_thres
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = next_line_bbox[1] - y0
sufficient_spacing_below = vertical_spacing_below > vertical_thres
return (
(sufficient_spacing_above or sufficient_spacing_below)
or (not x0_near_X0 and not x1_near_X1)
or prev_line_is_end_of_para
)
def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
"""
This function checks if the line is a possible start of a paragraph
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
next_line : dict
next line
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_char_width : float
average of char widths
avg_font_size : float
average font size of the block
Returns
-------
tuple
(is_start_of_para : bool, start_confidence : float, decision_path : list);
is_start_of_para is True if the line is a possible start of a paragraph.
"""
start_confidence = 0.5 # Initial confidence of the line being a start of a paragraph
decision_path = [] # Record the decision path
curr_line_bbox = curr_line["bbox"]
prev_line_bbox = prev_line["bbox"] if prev_line else None
next_line_bbox = next_line["bbox"] if next_line else None
indent_ratio = 1
vertical_ratio = 1.5
vertical_thres = vertical_ratio * avg_font_size
left_horizontal_ratio = 0.5
left_horizontal_thres = left_horizontal_ratio * avg_char_width
right_horizontal_ratio = 2.5
right_horizontal_thres = right_horizontal_ratio * avg_char_width
x0, y0, x1, y1 = curr_line_bbox
indent_condition = x0 > X0 + indent_ratio * avg_char_width
if indent_condition:
start_confidence += 0.2
decision_path.append("indent_condition_met")
x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
if x0_near_X0:
start_confidence += 0.1
decision_path.append("x0_near_X0")
x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
if x1_near_X1:
start_confidence += 0.1
decision_path.append("x1_near_X1")
if prev_line is None:
prev_line_is_end_of_para = True
start_confidence += 0.2
decision_path.append("no_prev_line")
else:
prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, curr_line, X0, X1, avg_char_width)  # the line after prev_line is curr_line
if prev_line_is_end_of_para:
start_confidence += 0.1
decision_path.append("prev_line_is_end_of_para")
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = y1 - prev_line_bbox[3]
sufficient_spacing_above = vertical_spacing_above > vertical_thres
if sufficient_spacing_above:
start_confidence += 0.2
decision_path.append("sufficient_spacing_above")
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = next_line_bbox[1] - y0
sufficient_spacing_below = vertical_spacing_below > vertical_thres
if sufficient_spacing_below:
start_confidence += 0.2
decision_path.append("sufficient_spacing_below")
is_regular_line = self._is_regular_line(
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
)
if is_regular_line:
start_confidence += 0.1
decision_path.append("is_regular_line")
is_start_of_para = (
(sufficient_spacing_above or sufficient_spacing_below)
or (indent_condition)
or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
or prev_line_is_end_of_para
)
return (is_start_of_para, start_confidence, decision_path)
def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
"""
This function checks if the line is a possible end of a paragraph
Parameters
----------
curr_line : dict
current line
next_line : dict
next line
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_char_width : float
average of char widths
Returns
-------
tuple
(is_end_of_para : bool, end_confidence : float, decision_path : list);
is_end_of_para is True if the line is a possible end of a paragraph.
"""
end_confidence = 0.5  # Initial confidence of the line being an end of a paragraph
decision_path = [] # Record the decision path
curr_line_bbox = curr_line["bbox"]
next_line_bbox = next_line["bbox"] if next_line else None
left_horizontal_ratio = 0.5
right_horizontal_ratio = 0.5
x0, _, x1, y1 = curr_line_bbox
next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
if x0_near_X0:
end_confidence += 0.1
decision_path.append("x0_near_X0")
x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
if x1_smaller_than_X1:
end_confidence += 0.1
decision_path.append("x1_smaller_than_X1")
next_line_is_start_of_para = (
next_line_bbox
and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
)
if next_line_is_start_of_para:
end_confidence += 0.2
decision_path.append("next_line_is_start_of_para")
is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
curr_line_bbox, None, next_line_bbox, avg_char_width
)
if is_line_left_aligned_from_neighbors_bool:
end_confidence += 0.1
decision_path.append("line_is_left_aligned_from_neighbors")
is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
curr_line_bbox, None, next_line_bbox, avg_char_width
)
if not is_line_right_aligned_from_neighbors_bool:
end_confidence += 0.1
decision_path.append("line_is_not_right_aligned_from_neighbors")
is_end_of_para = end_with_punctuation(curr_line["text"]) and (
(x0_near_X0 and x1_smaller_than_X1)
or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
)
return (is_end_of_para, end_confidence, decision_path)
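# Example (illustrative): for a line ending in "." whose x0 sits on the block's
# left boundary and whose x1 stops well short of X1, this returns something like
#     (True, 0.8, ["x0_near_X0", "x1_smaller_than_X1", "line_is_left_aligned_from_neighbors"])
# where only the boolean is used by callers; confidence and path are diagnostics.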
def _cut_paras_per_block(
self,
block,
):
"""
Processes a raw block from PyMuPDF and returns the processed block.
Parameters
----------
block : dict
A block assembled from the pymupdf raw block.
Returns
-------
block : dict
the block with a "paras" key added
"""
def _construct_para(lines, is_block_title, para_title_level):
"""
Construct a paragraph from given lines.
"""
font_sizes = [span["size"] for line in lines for span in line["spans"]]
avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
font_colors = [span["color"] for line in lines for span in line["spans"]]
most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
font_type_lengths = {}
for line in lines:
for span in line["spans"]:
font_type = span["font"]
bbox_width = span["bbox"][2] - span["bbox"][0]
if font_type in font_type_lengths:
font_type_lengths[font_type] += bbox_width
else:
font_type_lengths[font_type] = bbox_width
# get the font type with the longest bbox width
most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None # type: ignore
para_bbox = calculate_para_bbox(lines)
para_text = " ".join(line["text"] for line in lines)
return {
"para_bbox": para_bbox,
"para_text": para_text,
"para_font_type": most_common_font_type,
"para_font_size": avg_font_size,
"para_font_color": most_common_font_color,
"is_para_title": is_block_title,
"para_title_level": para_title_level,
}
block_bbox = block["bbox"]
block_text = block["text"]
block_lines = block["lines"]
X0 = safe_get(block, "X0", 0)
X1 = safe_get(block, "X1", 0)
avg_char_width = safe_get(block, "avg_char_width", 0)
avg_char_height = safe_get(block, "avg_char_height", 0)
avg_font_size = safe_get(block, "avg_font_size", 0)
is_block_title = safe_get(block, "is_block_title", False)
para_title_level = safe_get(block, "block_title_level", 0)
# Segment into paragraphs
para_ranges = []
in_paragraph = False
start_idx_of_para = None
# Create the processed paragraphs
processed_paras = {}
para_bboxes = []
end_idx_of_para = 0
for line_index, line in enumerate(block_lines):
curr_line = line
prev_line = block_lines[line_index - 1] if line_index > 0 else None
next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
"""
Start processing paragraphs.
"""
# Check if the line is the start of a paragraph
is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
)
if not in_paragraph and is_start_of_para:
in_paragraph = True
start_idx_of_para = line_index
# print_green(">>> Start of a paragraph")
# print(" curr_line_text: ", curr_line["text"])
# print(" start_confidence: ", start_confidence)
# print(" decision_path: ", decision_path)
# Check if the line is the end of a paragraph
is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
curr_line, next_line, X0, X1, avg_char_width
)
if in_paragraph and (is_end_of_para or not next_line):
para_ranges.append((start_idx_of_para, line_index))
start_idx_of_para = None
in_paragraph = False
# print_red(">>> End of a paragraph")
# print(" curr_line_text: ", curr_line["text"])
# print(" end_confidence: ", end_confidence)
# print(" decision_path: ", decision_path)
# Add the last paragraph if it is not added
if in_paragraph and start_idx_of_para is not None:
para_ranges.append((start_idx_of_para, len(block_lines) - 1))
# Process the matched paragraphs
for para_index, (start_idx, end_idx) in enumerate(para_ranges):
matched_lines = block_lines[start_idx : end_idx + 1]
para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
para_key = f"para_{len(processed_paras)}"
processed_paras[para_key] = para_properties
para_bboxes.append(para_properties["para_bbox"])
end_idx_of_para = end_idx + 1
# Deal with the remaining lines
if end_idx_of_para < len(block_lines):
unmatched_lines = block_lines[end_idx_of_para:]
unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
unmatched_key = f"para_{len(processed_paras)}"
processed_paras[unmatched_key] = unmatched_properties
para_bboxes.append(unmatched_properties["para_bbox"])
block["paras"] = processed_paras
return block
def batch_process_blocks(self, pdf_dict):
"""
Parses the blocks of all pages.
Parameters
----------
pdf_dict : dict
PDF dictionary.
Returns
-------
pdf_dict : dict
PDF dictionary with paragraphs cut inside each block and "num_paras" added to the statistics.
"""
num_paras = 0
for page_id, page in pdf_dict.items():
if page_id.startswith("page_"):
para_blocks = []
if "para_blocks" in page.keys():
input_blocks = page["para_blocks"]
for input_block in input_blocks:
new_block = self._cut_paras_per_block(input_block)
para_blocks.append(new_block)
num_paras += len(new_block["paras"])
page["para_blocks"] = para_blocks
pdf_dict["statistics"]["num_paras"] = num_paras
return pdf_dict
class BlockContinuationProcessor:
"""
This class is used to process the blocks to detect block continuations.
"""
def __init__(self) -> None:
pass
def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3):
"""
This function checks if the two font types are similar.
Definition of similar font types: the two font types have a common prefix,
and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
Parameters
----------
font_type_1 : str
font type 1
font_type_2 : str
font type 2
prefix_length_ratio : float
minimum ratio of the common prefix length to the length of the shorter font type
Returns
-------
bool
True if the two font types are similar, False otherwise.
"""
if isinstance(font_type_1, list):
font_type_1 = font_type_1[0] if font_type_1 else ""
if isinstance(font_type_2, list):
font_type_2 = font_type_2[0] if font_type_2 else ""
if font_type_1 == font_type_2:
return True
# Find the length of the common prefix
common_prefix_length = len(os.path.commonprefix([font_type_1, font_type_2]))
# Calculate the minimum prefix length based on the ratio
min_prefix_length = int(min(len(font_type_1), len(font_type_2)) * prefix_length_ratio)
return common_prefix_length >= min_prefix_length
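# Example (illustrative): "TimesNewRomanPSMT" vs "TimesNewRomanPS-BoldMT" share the
# 15-character prefix "TimesNewRomanPS"; with prefix_length_ratio = 0.3 the minimum
# required prefix is int(17 * 0.3) = 5, so the two are treated as similar font types.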
def __is_same_block_font(self, block_1, block_2):
"""
This function compares the font of block1 and block2
Parameters
----------
block_1 : dict
the first block
block_2 : dict
the second block
Returns
-------
is_same : bool
True if block_1 and block_2 have the same font, else False
"""
block_1_font_type = safe_get(block_1, "block_font_type", "")
block_1_font_size = safe_get(block_1, "block_font_size", 0)
block_1_avg_char_width = safe_get(block_1, "avg_char_width", 0)
block_2_font_type = safe_get(block_2, "block_font_type", "")
block_2_font_size = safe_get(block_2, "block_font_size", 0)
block_2_avg_char_width = safe_get(block_2, "avg_char_width", 0)
if isinstance(block_1_font_size, list):
block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
if isinstance(block_2_font_size, list):
block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
block_1_text = safe_get(block_1, "text", "")
block_2_text = safe_get(block_2, "text", "")
if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
return False
if not block_1_text or not block_2_text:
return False
else:
text_len_ratio = len(block_2_text) / len(block_1_text)
if text_len_ratio < 0.2:
avg_char_width_condition = (
abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
< 0.5
)
else:
avg_char_width_condition = (
abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
< 0.2
)
block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1
return (
self.__is_similar_font_type(block_1_font_type, block_2_font_type)
and avg_char_width_condition
and block_font_size_condition
)
def _is_alphabet_char(self, char):
if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
return True
else:
return False
def _is_chinese_char(self, char):
if char >= "\u4e00" and char <= "\u9fa5":
return True
else:
return False
def _is_other_letter_char(self, char):
try:
cat = unicodedata.category(char)
if cat == "Lu" or cat == "Ll":
return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
except TypeError:
print("The input to the function must be a single character.")
return False
def _is_year(self, s: str):
try:
number = int(s)
return 1900 <= number <= 2099
except ValueError:
return False
def _match_brackets(self, text):
# pattern = r"^[\(\)\[\]()【】{}{}<><>〔〕〘〙\"\'“”‘’]"
pattern = r"^[\(\)\]()】{}{}>>〕〙\"\'“”‘’]"
return bool(re.match(pattern, text))
def _is_para_font_consistent(self, para_1, para_2):
"""
This function compares the font of para1 and para2
Parameters
----------
para_1 : dict
the first paragraph
para_2 : dict
the second paragraph
Returns
-------
is_same : bool
True if para_1 and para_2 have the same font, else False
"""
if para_1 is None or para_2 is None:
return False
para_1_font_type = safe_get(para_1, "para_font_type", "")
para_1_font_size = safe_get(para_1, "para_font_size", 0)
para_1_font_color = safe_get(para_1, "para_font_color", "")
para_2_font_type = safe_get(para_2, "para_font_type", "")
para_2_font_size = safe_get(para_2, "para_font_size", 0)
para_2_font_color = safe_get(para_2, "para_font_color", "")
if isinstance(para_1_font_type, list): # get the most common font type
para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
if isinstance(para_2_font_type, list):
para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
if isinstance(para_1_font_size, list):  # compute the average font size
para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
if isinstance(para_2_font_size, list):  # compute the average font size
para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
return (
self.__is_similar_font_type(para_1_font_type, para_2_font_type)
and abs(para_1_font_size - para_2_font_size) < 1.5
# and para_font_color1 == para_font_color2
)
def _is_para_puncs_consistent(self, para_1, para_2):
"""
This function determines whether para_1 and para_2 originally belong to the same paragraph by checking the punctuation of para_1 (former) and para_2 (latter)
Parameters
----------
para_1 : dict
the first (former) paragraph
para_2 : dict
the second (latter) paragraph
Returns
-------
is_same : bool
True if para_1 and para_2 are judged to be from the same paragraph based on punctuation, else False
"""
para_1_text = safe_get(para_1, "para_text", "").strip()
para_2_text = safe_get(para_2, "para_text", "").strip()
para_1_bboxes = safe_get(para_1, "para_bbox", [])
para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
para_2_bboxes = safe_get(para_2, "para_bbox", [])
para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
# print_yellow(" Features of determine puncs_consistent:")
# print(f" para_1_text: {para_1_text}")
# print(f" para_2_text: {para_2_text}")
# print(f" para_1_bboxes: {para_1_bboxes}")
# print(f" para_2_bboxes: {para_2_bboxes}")
# print(f" para_1_font_sizes: {para_1_font_sizes}")
# print(f" para_2_font_sizes: {para_2_font_sizes}")
if is_nested_list(para_1_bboxes):
x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
para_1_font_sizes = para_1_font_sizes[-1]  # type: ignore # keep the font size paired with the last bbox
else:
x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
if is_nested_list(para_2_bboxes):
x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
else:
x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
is_para1_left_indent_than_para2 = x0_1 - x0_2 > left_indent_threshold
is_para2_left_indent_than_para1 = x0_2 - x0_1 > left_indent_threshold
# Check if either para_text1 or para_text2 is empty
if not para_1_text or not para_2_text:
return False
# Define the end puncs for a sentence to end and hyphen
end_puncs = [".", "?", "!", "。", "?", "!", "…"]
hyphen = ["-", "—"]
# Check if para_text1 ends with either hyphen or non-end punctuation or spaces
para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
para_1_end_with_space = para_1_text and para_1_text[-1] == " "
para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
# print_yellow(f" para_1_end_with_hyphen: {para_1_end_with_hyphen}")
# print_yellow(f" para_1_end_with_end_punc: {para_1_end_with_end_punc}")
# print_yellow(f" para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
# print_yellow(f" para_1_end_with_space: {para_1_end_with_space}")
if para_1_end_with_hyphen: # If para_text1 ends with hyphen
# print_red(f"para_1 is end with hyphen.")
para_2_is_consistent = para_2_text and (
para_2_text[0] in hyphen
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
# print(f"para_2 is not consistent.\n")
pass
elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuation
# print_red(f"para_1 is end with end_punc.")
para_2_is_consistent = (
para_2_text
and (
para_2_text[0]
== " "
# or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
# or (self._is_chinese_char(para_2_text[0]))
# or (self._is_other_letter_char(para_2_text[0]))
)
and not is_para2_left_indent_than_para1
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
# print(f"para_2 is not consistent.\n")
pass
elif para_1_not_end_with_end_punc:  # If para_text1 does not end with ending punctuation
# print_red(f"para_1 is NOT end with end_punc.")
para_2_is_consistent = para_2_text and (
para_2_text[0] == " "
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
or (self._is_alphabet_char(para_2_text[0]))
or (self._is_year(para_2_text[0:4]))
or (are_two_paras_right_aligned or is_para1_left_indent_than_para2)
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
or (self._match_brackets(para_2_text[0]))
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
# print(f"para_2 is not consistent.\n")
pass
elif para_1_end_with_space:  # If para_text1 ends with a space (note: unreachable, since para_1_text is stripped above and the previous branch already matches any non-end-punc ending)
# print_red(f"para_1 is end with space.")
para_2_is_consistent = para_2_text and (
para_2_text[0] == " "
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
pass
# print(f"para_2 is not consistent.\n")
return False
def _is_block_consistent(self, block_1, block_2):
"""
This function determines whether block1 and block2 are originally from the same block
Parameters
----------
block1 : dict
block1s
block2 : dict
block2
Returns
-------
is_same : bool
True if block1 and block2 are from the same block, else False
"""
return self.__is_same_block_font(block_1, block_2)
def _is_para_continued(self, para_1, para_2):
"""
This function determines whether para1 and para2 are originally from the same paragraph
Parameters
----------
para1 : dict
para1
para2 : dict
para2
Returns
-------
is_same : bool
True if para1 and para2 are from the same paragraph, else False
"""
is_para_font_consistent = self._is_para_font_consistent(para_1, para_2)
is_para_puncs_consistent = self._is_para_puncs_consistent(para_1, para_2)
return is_para_font_consistent and is_para_puncs_consistent
def _are_boundaries_of_block_consistent(self, block_1, block_2):
"""
This function checks if the boundaries of block1 and block2 are consistent
Parameters
----------
block1 : dict
block1
block2 : dict
block2
Returns
-------
is_consistent : bool
True if the boundaries of block1 and block2 are consistent, else False
"""
last_line_of_block_1 = block_1["lines"][-1]
first_line_of_block_2 = block_2["lines"][0]
spans_of_last_line_of_block_1 = last_line_of_block_1["spans"]
spans_of_first_line_of_block_2 = first_line_of_block_2["spans"]
font_type_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["font"].lower()
font_size_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["size"]
font_color_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["color"]
font_flags_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["flags"]
font_type_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["font"].lower()
font_size_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["size"]
font_color_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["color"]
font_flags_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["flags"]
return (
self.__is_similar_font_type(font_type_of_last_line_of_block_1, font_type_of_first_line_of_block_2)
and abs(font_size_of_last_line_of_block_1 - font_size_of_first_line_of_block_2) < 1
# and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
and font_flags_of_last_line_of_block_1 == font_flags_of_first_line_of_block_2
)
def should_merge_next_para(self, curr_para, next_para):
"""
This function checks if the next_para should be merged into the curr_para.
Parameters
----------
curr_para : dict
The current paragraph.
next_para : dict
The next paragraph.
Returns
-------
bool
True if the next_para should be merged into the curr_para, False otherwise.
"""
return self._is_para_continued(curr_para, next_para)
def batch_tag_paras(self, pdf_dict):
"""
This function tags the paragraphs in the pdf_dict.
Parameters
----------
pdf_dict : dict
PDF dictionary.
Returns
-------
pdf_dict : dict
PDF dictionary with tagged paragraphs.
"""
the_last_page_id = sum(1 for key in pdf_dict if key.startswith("page_")) - 1  # count only page entries; pdf_dict may also hold keys such as "statistics"
for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
para_blocks_of_curr_page = curr_page_content["para_blocks"]
next_page_idx = curr_page_idx + 1
next_page_id = f"page_{next_page_idx}"
next_page_content = pdf_dict.get(next_page_id, {})
for i, current_block in enumerate(para_blocks_of_curr_page):
for para_id, curr_para in current_block["paras"].items():
curr_para["curr_para_location"] = [
curr_page_idx,
current_block["block_id"],
int(para_id.split("_")[-1]),
]
curr_para["next_para_location"] = None # 默认设置为None
curr_para["merge_next_para"] = False # 默认设置为False
next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
if next_block:
curr_block_last_para_key = list(current_block["paras"].keys())[-1]
curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
next_block_first_para_key = list(next_block["paras"].keys())[0]
next_blk_first_para = next_block["paras"][next_block_first_para_key]
if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
curr_blk_last_para["next_para_location"] = [
curr_page_idx,
next_block["block_id"],
int(next_block_first_para_key.split("_")[-1]),
]
curr_blk_last_para["merge_next_para"] = True
else:
# Handle the case where the next block is on a different page
curr_block_last_para_key = list(current_block["paras"].keys())[-1]
curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
next_page_idx += 1
next_page_id = f"page_{next_page_idx}"
next_page_content = pdf_dict.get(next_page_id, {})
if next_page_content.get("para_blocks", []):
next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
curr_blk_last_para["next_para_location"] = [
next_page_idx,
next_page_content["para_blocks"][0]["block_id"],
int(next_blk_first_para_key.split("_")[-1]),
]
curr_blk_last_para["merge_next_para"] = True
return pdf_dict
def find_block_by_id(self, para_blocks, block_id):
"""
This function finds a block by its id.
Parameters
----------
para_blocks : list
List of blocks.
block_id : int
Id of the block to find.
Returns
-------
block : dict
The block with the given id.
"""
for block in para_blocks:
if block.get("block_id") == block_id:
return block
return None
def batch_merge_paras(self, pdf_dict):
"""
This function merges the paragraphs in the pdf_dict.
Parameters
----------
pdf_dict : dict
PDF dictionary.
Returns
-------
pdf_dict : dict
PDF dictionary with merged paragraphs.
"""
for page_id, page_content in pdf_dict.items():
if page_id.startswith("page_") and page_content.get("para_blocks", []):
para_blocks_of_page = page_content["para_blocks"]
for i in range(len(para_blocks_of_page)):
current_block = para_blocks_of_page[i]
paras = current_block["paras"]
for para_id, curr_para in list(paras.items()):
# print(f"current para_id: {para_id}")
# Skip title paragraphs
if curr_para.get("is_para_title"):
continue
while curr_para.get("merge_next_para"):
curr_para_location = curr_para.get("curr_para_location")
next_para_location = curr_para.get("next_para_location")
# print(f"curr_para_location: {curr_para_location}, next_para_location: {next_para_location}")
if not next_para_location:
break
if curr_para_location == next_para_location:
# print_red("The next para is in the same block as the current para.")
curr_para["merge_next_para"] = False
break
next_page_idx, next_block_id, next_para_id = next_para_location
next_page_id = f"page_{next_page_idx}"
next_page_content = pdf_dict.get(next_page_id)
if not next_page_content:
break
next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
if not next_block:
break
next_para = next_block["paras"].get(f"para_{next_para_id}")
if not next_para or next_para.get("is_para_title"):
break
# Merge the paragraph texts
curr_para_text = curr_para.get("para_text", "")
next_para_text = next_para.get("para_text", "")
curr_para["para_text"] = curr_para_text + " " + next_para_text
# Update next_para_location
curr_para["next_para_location"] = next_para.get("next_para_location")
# Clear the next paragraph's text to mark it as merged
next_para["para_text"] = ""
# Propagate the merge_next_para flag
curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
return pdf_dict
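# A minimal usage sketch of the two-pass tag/merge flow above, on toy data,
# assuming these methods live on BlockContinuationProcessor as instantiated by
# the pipeline further below. All field values here are illustrative
# assumptions, not real pipeline output.
def _demo_block_continuation():
    def para(text, bbox):
        return {
            "para_text": text, "para_bbox": bbox,
            "para_font_type": "timesnewroman", "para_font_size": 10,
            "para_font_color": "#000000", "is_para_title": False,
        }
    pdf_dict = {
        "page_0": {"para_blocks": [
            {"block_id": 0, "paras": {"para_0": para("a sentence that ends without punctuation", [72, 100, 300, 112])}},
            {"block_id": 1, "paras": {"para_0": para("and continues in the next block.", [72, 114, 300, 126])}},
        ]}
    }
    proc = BlockContinuationProcessor()
    pdf_dict = proc.batch_tag_paras(pdf_dict)    # sets next_para_location / merge_next_para
    pdf_dict = proc.batch_merge_paras(pdf_dict)  # concatenates the tagged paragraphs
    return pdf_dict["page_0"]["para_blocks"][0]["paras"]["para_0"]["para_text"]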
class DrawAnnos:
"""
This class draws annotations on the pdf file
----------------------------------------
Color Code
----------------------------------------
Red: (1, 0, 0)
Green: (0, 1, 0)
Blue: (0, 0, 1)
Yellow: (1, 1, 0) - mix of red and green
Cyan: (0, 1, 1) - mix of green and blue
Magenta: (1, 0, 1) - mix of red and blue
White: (1, 1, 1) - red, green and blue full intensity
Black: (0, 0, 0) - no color component whatsoever
Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
"""
def __init__(self) -> None:
pass
def __is_nested_list(self, lst):
"""
This function returns True if the given list contains at least one nested list (of any degree).
"""
if isinstance(lst, list):
return any(isinstance(i, list) for i in lst)
return False
def __valid_rect(self, bbox):
# Ensure that the rectangle is not empty or invalid
if isinstance(bbox[0], list):
return False # It's a nested list, hence it can't be valid rect
else:
return bbox[0] < bbox[2] and bbox[1] < bbox[3]
def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
"""
This function draws the nested boxes
Parameters
----------
page : fitz.Page
page
nested_bbox : list
nested bbox
color : tuple
color, by default (0, 1, 1) # draw with cyan color for combined paragraph
"""
if self.__is_nested_list(nested_bbox): # If it's a nested list
for bbox in nested_bbox:
self.__draw_nested_boxes(page, bbox, color) # Recursively call the function
elif self.__valid_rect(nested_bbox): # If valid rectangle
para_rect = fitz.Rect(nested_bbox)
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph
para_anno.set_border(width=1)
para_anno.update()
def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
"""
This function draws annotations on the pdf file.
Parameters
----------
input_pdf_path : str
path to the input pdf file
pdf_dic : dict
pdf dictionary
output_pdf_path : str
path to the output pdf file
"""
pdf_doc = open_pdf(input_pdf_path)
if pdf_dic is None:
pdf_dic = {}
if output_pdf_path is None:
output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
for page_id, page in enumerate(pdf_doc):  # type: ignore
page_key = f"page_{page_id}"
for ele_key, ele_data in pdf_dic.get(page_key, {}).items():  # tolerate pages missing from pdf_dic
if ele_key == "para_blocks":
para_blocks = ele_data
for para_block in para_blocks:
if "paras" in para_block.keys():
paras = para_block["paras"]
for para_key, para_content in paras.items():
para_bbox = para_content["para_bbox"]
# print(f"para_bbox: {para_bbox}")
# print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
color = (0, 1, 1)
self.__draw_nested_boxes(
page, para_bbox, color
) # draw with cyan color for combined paragraph
else:
if self.__valid_rect(para_bbox):
para_rect = fitz.Rect(para_bbox)
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph
para_anno.set_border(width=0.5)
para_anno.update()
is_para_title = para_content["is_para_title"]
if is_para_title:
if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
color = (0, 0, 1)
self.__draw_nested_boxes(
page, para_content["para_bbox"], color
)  # draw with blue color for combined title
else:
if self.__valid_rect(para_content["para_bbox"]):
para_rect = fitz.Rect(para_content["para_bbox"])
if self.__valid_rect(para_content["para_bbox"]):
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title
para_anno.set_border(width=0.5)
para_anno.update()
pdf_doc.save(output_pdf_path)
pdf_doc.close()
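# A minimal sketch of the PyMuPDF annotation primitive DrawAnnos relies on,
# assuming a local "sample.pdf" exists (hypothetical path).
def _demo_rect_annot(pdf_path="sample.pdf"):
    doc = open_pdf(pdf_path)
    page = doc[0]
    anno = page.add_rect_annot(fitz.Rect(72, 72, 200, 100))  # rectangle in page coordinates
    anno.set_colors(stroke=(0, 1, 0))  # green, the color used above for normal paragraphs
    anno.set_border(width=0.5)
    anno.update()
    doc.save(pdf_path.replace(".pdf", "_demo_anno.pdf"))
    doc.close()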
class ParaProcessPipeline:
def __init__(self) -> None:
pass
def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
"""
This function processes the paragraphs, including:
1. Read raw input json file into pdf_dic
2. Detect and replace equations
3. Combine spans into a natural line
4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
5. Compute statistics for each block
6. Detect titles in the document
7. Detect paragraphs inside each block
8. Divide the level of the titles
9. Detect and combine paragraphs from different blocks into one paragraph
10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
11. Draw annotations on the pdf file
Parameters
----------
pdf_info_dict : dict
the pdf dictionary.
Notice: data noise, including overlapping blocks, headers, footers, watermarks, and vertical margin notes, has been removed already.
para_debug_mode : str, optional
debug mode; "full" dumps the intermediate stages to json
input_pdf_path : str, optional
path to the input pdf file
output_pdf_path : str, optional
path to the output pdf file
Returns
-------
(pdf_dict, error_info) : tuple
the result dictionary and the error info (None if no exception was triggered)
"""
error_info = None
output_json_file = ""
output_dir = ""
if input_pdf_path is not None:
input_pdf_path = os.path.abspath(input_pdf_path)
# print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
if output_pdf_path is not None:
output_dir = os.path.dirname(output_pdf_path)
output_json_file = f"{output_dir}/pdf_dic.json"
def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
"""
Save the pdf_dic to a json file
"""
output_pdf_file_name = os.path.basename(output_pdf_path)
# output_dir = os.path.dirname(output_pdf_path)
output_dir = os.path.join(os.sep, "tmp", "pdf_parse")  # fixed debug dump directory
output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if para_debug_mode == "full":
with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
# Verify that the output file was actually written
if not os.path.exists(pdf_dic_json_fpath):
print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
return None
else:
print_green(f"Succeeded in saving the pdf_dic to {pdf_dic_json_fpath}")
return pdf_dic_json_fpath
"""
Preprocess the lines of block
"""
# Combine spans into a natural line
rawBlockProcessor = RawBlockProcessor()
pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
# print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
# Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
layoutFilter = LayoutFilterProcessor()
pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
# Compute statistics for each block
blockStatisticsCalculator = BlockStatisticsCalculator()
pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
# print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
# Compute statistics over all blocks (i.e., the whole pdf document)
docStatisticsCalculator = DocStatisticsCalculator()
pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
# print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
# Dump the first three stages of pdf_dic to a json file
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
"""
Detect titles in the document
"""
doc_statistics = pdf_dic["statistics"]
titleProcessor = TitleProcessor(doc_statistics)
pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
"""
Detect and divide the level of the titles
"""
titleProcessor = TitleProcessor()
pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
"""
Detect and split paragraphs inside each block
"""
blockInnerParasProcessor = BlockTerminationProcessor()
pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
# pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
# print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
"""
Detect and combine paragraphs from different blocks into one paragraph
"""
blockContinuationProcessor = BlockContinuationProcessor()
pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
# pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
# print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
"""
Discard pdf files by checking exceptions and return the error info to the caller
"""
discardByException = DiscardByException()
is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
pdf_dic, exception=DenseSingleLineBlockException()
)
is_discard_by_title_detection = discardByException.discard_by_title_detection(
pdf_dic, exception=TitleDetectionException()
)
is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
if is_discard_by_single_line_block is not None:
error_info = is_discard_by_single_line_block
elif is_discard_by_title_detection is not None:
error_info = is_discard_by_title_detection
elif is_discard_by_title_level is not None:
error_info = is_discard_by_title_level
elif is_discard_by_split_para is not None:
error_info = is_discard_by_split_para
elif is_discard_by_merge_para is not None:
error_info = is_discard_by_merge_para
if error_info is not None:
return pdf_dic, error_info
"""
Dump the final pdf_dic to a json file
"""
if para_debug_mode is not None:
with open(output_json_file, "w", encoding="utf-8") as f:
json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
"""
Draw the annotations
"""
if para_debug_mode is not None:
drawAnnos = DrawAnnos()
drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
"""
Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
"""
if para_debug_mode == "simple":
for fpath in os.listdir(output_dir):
if fpath.endswith(".json") and "stage" in fpath:
os.remove(os.path.join(output_dir, fpath))
return pdf_dic, error_info
"""
Run this script to test the function with Command:
python detect_para.py [pdf_path] [output_pdf_path]
Params:
- pdf_path: the path of the pdf file
- output_pdf_path: the path of the output pdf file
"""
if __name__ == "__main__":
DEFAULT_PDF_PATH = (
"app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
)
input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_path.split(".")[0] + "_recogPara.json"
import stat
# Remove existing output file if it exists
if os.path.exists(output_pdf_path):
os.chmod(output_pdf_path, stat.S_IWRITE)
os.remove(output_pdf_path)
input_pdf_doc = open_pdf(input_pdf_path)
# postprocess the paragraphs
paraProcessPipeline = ParaProcessPipeline()
# parse paragraph and save to json file
pdf_dic = {}
blockInnerParasProcessor = BlockTerminationProcessor()
"""
Construct the pdf dictionary.
"""
for page_id, page in enumerate(input_pdf_doc): # type: ignore
# print(f"Processing page {page_id}")
# print(f"page: {page}")
raw_blocks = page.get_text("dict")["blocks"]
# Save text blocks to "preproc_blocks"
preproc_blocks = []
for block in raw_blocks:
if block["type"] == 0:
preproc_blocks.append(block)
layout_bboxes = []
# Construct the pdf dictionary as schema above
page_dict = {
"para_blocks": None,
"preproc_blocks": preproc_blocks,
"images": None,
"tables": None,
"interline_equations": None,
"inline_equations": None,
"layout_bboxes": None,
"pymu_raw_blocks": None,
"global_statistic": None,
"droped_text_block": None,
"droped_image_block": None,
"droped_table_block": None,
"image_backup": None,
"table_backup": None,
}
pdf_dic[f"page_{page_id}"] = page_dict
# print(f"pdf_dic: {pdf_dic}")
with open(output_json_path, "w", encoding="utf-8") as f:
json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
pdf_dic, error_info = paraProcessPipeline.para_process_pipeline(
pdf_dic, para_debug_mode="simple", input_pdf_path=input_pdf_path, output_pdf_path=output_pdf_path
)
from loguru import logger
from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
from magic_pdf.libs.drop_reason import DropReason
def __is_pseudo_single_column(page_info) -> tuple:
"""
Determine whether a page is pseudo single-column.
Args:
page_info (dict): page info dict, including '_layout_tree' and 'preproc_blocks'.
Returns:
Tuple[bool, Optional[str]]: (True, extra_info) if the page is pseudo single-column, otherwise (False, None).
"""
layout_tree = page_info['_layout_tree']
layout_column_cnt = get_columns_cnt_of_layout(layout_tree)
if layout_column_cnt == 1:
text_blocks = page_info['preproc_blocks']
# iterate over every text_block
for text_block in text_blocks:
lines = text_block['lines']
num_lines = len(lines)
num_satisfying_lines = 0
for i in range(num_lines - 1):
current_line = lines[i]
next_line = lines[i + 1]
# get the bbox of the current line and the next line
current_bbox = current_line['bbox']
next_bbox = next_line['bbox']
# check whether the two lines do not overlap horizontally
if next_bbox[0] > current_bbox[2] or next_bbox[2] < current_bbox[0]:
num_satisfying_lines += 1
# if more than half of the lines satisfy the condition, drop the page
# print("num_satisfying_lines:", num_satisfying_lines, "num_lines:", num_lines)
if num_lines > 20:
ratio = num_satisfying_lines / num_lines
if ratio >= 0.5:
extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}"
block_text = []
for line in lines:
if line['spans']:
for span in line['spans']:
block_text.append(span['text'])
logger.warning(f"pseudo_single_column block_text: {block_text}")
return True, extra_info
return False, None
def pdf_post_filter(page_info) -> tuple:
"""
return: (True|False, err_msg)
True if the pdf page meets the requirements,
False otherwise
"""
bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
if bool_is_pseudo_single_column:
return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
return True, None
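# A minimal sketch of the per-line test used by __is_pseudo_single_column:
# two consecutive lines count as "satisfying" when their bboxes do not
# overlap horizontally. Coordinates below are toy values.
def _demo_line_overlap_condition():
    current_bbox = [50, 100, 200, 112]  # x0, y0, x1, y1
    next_bbox = [220, 100, 370, 112]    # starts to the right of current's right edge
    assert next_bbox[0] > current_bbox[2] or next_bbox[2] < current_bbox[0]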
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
import collections  # for Counter-based statistics
def is_below(bbox1, bbox2):
# block1 is below block2 if block1's top y (bbox1[1]) is greater than block2's bottom y (bbox2[3])
return bbox1[1] > bbox2[3]
def merge_bboxes(bboxes):
# merge all blocks: take the minimum x0 and y0 and the maximum x1 and y1 as the merged bbox
x0 = min(bbox[0] for bbox in bboxes)
y0 = min(bbox[1] for bbox in bboxes)
x1 = max(bbox[2] for bbox in bboxes)
y1 = max(bbox[3] for bbox in bboxes)
return [x0, y0, x1, y1]
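# A minimal sketch of the two geometry helpers above, on toy coordinates.
def _demo_bbox_helpers():
    upper = [50, 100, 300, 120]
    lower = [50, 130, 300, 150]
    assert is_below(lower, upper)  # lower's top (130) is below upper's bottom (120)
    assert merge_bboxes([upper, lower]) == [50, 100, 300, 150]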
def merge_footnote_blocks(page_info, main_text_font):
page_info['merged_bboxes'] = []
for layout in page_info['layout_bboxes']:
# collect all footnote blocks and preproc_blocks inside this layout
footnote_bboxes = [block for block in page_info['footnote_bboxes_tmp'] if _is_in(block, layout['layout_bbox'])]
# skip this layout if it has no footnote blocks
if not footnote_bboxes:
continue
preproc_blocks = [block for block in page_info['preproc_blocks'] if _is_in(block['bbox'], layout['layout_bbox'])]
# preproc_bboxes = [block['bbox'] for block in preproc_blocks]
font_names = collections.Counter()
if len(preproc_blocks) > 0:
# per-line font sizes across this layout
line_sizes = []
# (block, average line size, dominant font) for every text block
block_sizes = []
for block in preproc_blocks:
block_line_sizes = []
block_fonts = collections.Counter()
for line in block['lines']:
# extract the size of every span and compute the line size
span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
if span_sizes:
line_size = sum(span_sizes) / len(span_sizes)
line_sizes.append(line_size)
block_line_sizes.append(line_size)
span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
'font' in span and len(span['text']) > 0]
if span_font:
# TODO: main_text_font should be based on the font covering the most characters, not on span-level statistics
for font, count in span_font:
font_names[font] += count
block_fonts[font] += count
if block_line_sizes:
# average line size of this text block
block_size = sum(block_line_sizes) / len(block_line_sizes)
block_font = block_fonts.most_common(1)[0][0]
block_sizes.append((block, block_size, block_font))
# compute main_text_size as the most common line size
# main_text_font = font_names.most_common(1)[0][0]
main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]
else:
continue
need_merge_bboxes = []
# any footnote bbox with a body-text block below it is a false footnote
for footnote_bbox in footnote_bboxes:
# check for body-text blocks below the footnote (a body block must satisfy at least 2 of: avg size >= main_text_size, >= 5 lines, dominant font == main_text_font)
main_text_bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if
is_below(block['bbox'], footnote_bbox) and
sum([size >= main_text_size,
len(block['lines']) >= 5,
block_font == main_text_font])
>= 2]
# if main_text_bboxes_below is non-empty, there is body text below this footnote, so it is not a real footnote; skip it
if len(main_text_bboxes_below) > 0:
continue
else:
# otherwise there is no body text below; keep this footnote for merging
need_merge_bboxes.append(footnote_bbox)
if len(need_merge_bboxes) == 0:
continue
# find the topmost footnote block
top_footnote_bbox = min(need_merge_bboxes, key=lambda bbox: bbox[1])
# collect the bboxes of all blocks below top_footnote_bbox
bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if is_below(block['bbox'], top_footnote_bbox)]
# merge top_footnote_bbox with the blocks below it
merged_bbox = merge_bboxes([top_footnote_bbox] + bboxes_below)
# record the merged bbox
page_info['merged_bboxes'].append(merged_bbox)
return page_info
def remove_footnote_blocks(page_info):
if page_info.get('merged_bboxes'):
# remove footnotes from the text blocks
remain_text_blocks, removed_footnote_text_blocks = remove_footnote_text(page_info['preproc_blocks'], page_info['merged_bboxes'])
# remove footnotes from the image blocks
image_blocks, removed_footnote_imgs_blocks = remove_footnote_image(page_info['images'], page_info['merged_bboxes'])
# update page_info
page_info['preproc_blocks'] = remain_text_blocks
page_info['images'] = image_blocks
page_info['droped_text_block'].extend(removed_footnote_text_blocks)
page_info['droped_image_block'].extend(removed_footnote_imgs_blocks)
# delete footnote_bboxes_tmp and merged_bboxes
del page_info['merged_bboxes']
del page_info['footnote_bboxes_tmp']
return page_info
def remove_footnote_text(raw_text_block, footnote_bboxes):
"""
:param raw_text_block: str类型,是当前页的文本内容
:param footnoteBboxes: list类型,是当前页的脚注bbox
"""
footnote_text_blocks = []
for block in raw_text_block:
text_bbox = block['bbox']
# TODO 更严谨点在line级别做
if any([_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes]):
# if any([text_bbox[3]>=footnote_bbox[1] for footnote_bbox in footnote_bboxes]):
block['tag'] = 'footnote'
footnote_text_blocks.append(block)
# raw_text_block.remove(block)
# 移除,不能再内部移除,否则会出错
for block in footnote_text_blocks:
raw_text_block.remove(block)
return raw_text_block, footnote_text_blocks
def remove_footnote_image(image_blocks, footnote_bboxes):
"""
:param image_bboxes: list类型,是当前页的图片bbox(结构体)
:param footnoteBboxes: list类型,是当前页的脚注bbox
"""
footnote_imgs_blocks = []
for image_block in image_blocks:
if any([_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes]):
footnote_imgs_blocks.append(image_block)
for footnote_imgs_block in footnote_imgs_blocks:
image_blocks.remove(footnote_imgs_block)
return image_blocks, footnote_imgs_blocks
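# A minimal sketch of remove_footnote_text on toy blocks; the overlap test is
# delegated to _is_in_or_part_overlap from boxbase. Coordinates are illustrative.
def _demo_remove_footnote_text():
    blocks = [
        {"bbox": [50, 100, 300, 120]},  # body block, far from the footnote area
        {"bbox": [50, 700, 300, 720]},  # coincides with the footnote bbox below
    ]
    remain, removed = remove_footnote_text(blocks, [[50, 700, 300, 720]])
    assert len(remain) == 1 and removed[0]["tag"] == "footnote"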
import re
import os
def clean_urls(pattern, txt):
return re.sub(pattern, lambda x: x.group(0).replace(' ', ''), txt)
def remove_extra_spaces_html_txt(file_path):
# create a temporary file for the cleaned content
temp_file_path = file_path + '.tmp'
# regexes matching http/https links that may contain stray spaces
pattern1 = r'(http ?s? ?: ?/ ?/ ?[\w.-]+)'
pattern2 = r'(https?://[\w. %/:-]+)'
with open(file_path, 'r', encoding='utf-8') as file, open(temp_file_path, 'w', encoding='utf-8') as temp_file:
for line in file:
# strip the trailing newline before processing
line_content = line.rstrip('\n')
line_content = clean_urls(pattern1, line_content)
line_content = clean_urls(pattern2, line_content)
# write the processed content plus a newline to the temp file
temp_file.write(line_content + '\n')
# replace the original file with the processed one
os.replace(temp_file_path, file_path)
if __name__ == '__main__':
file_path = 'DCU.txt'
remove_extra_spaces_html_txt(file_path)
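# A minimal sketch of the first clean-up pass, on a made-up sample string:
# pattern1 collapses the spaces inside the scheme/host part of a broken link.
def _demo_clean_urls():
    sample = "see http s : / / example.com for details"
    cleaned = clean_urls(r'(http ?s? ?: ?/ ?/ ?[\w.-]+)', sample)
    return cleaned  # "see https://example.com for details"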
#
# import re
#
#
# def remove_extra_spaces_html_txt(text):
#
#
# # Remove extra spaces in URLs
# cleaned_text = re.sub(r'(http ?s? ?: ?/ ?/ ?[\w.-]+)', lambda x: x.group(0).replace(' ', ''), text)
# print(cleaned_text)
# # Function to remove spaces within URLs
# def clean_urls(text):
# return re.sub(r'(https?://[\w. %/:-]+)', lambda x: x.group(0).replace(' ', ''), text)
#
# cleaned_text = clean_urls(cleaned_text)
# print(cleaned_text)
# #
# text = """
# 预测:使用海光CPU/DCU进行预测与使用Intel CPU/Nvidia GPU预测相同,支持飞桨原生推理库(PaddleInference),适用于高性能服务器端、云端推理。当前飞桨框架 ROCm版本完全兼容飞桨框架 CUDA版本的C++/PythonAPI,直接使用原有的GPU预测命令和参数即可。
# 完整训练及预测示例可参考官网海光DCU芯片运行飞桨文档。
# https : / / www.pa ddlepadd le.org.cn /documenta tion/docs/zh/gu ides/hardware_sup port/rocmdocs/trainexamplecn.html
# http s : / /w w w.pa d dlep ddle.o rg.cn/d ocumenta tion/docs zh/guides/h ardware_support/rocmdocs/infer_example_cn.html
# SqueezeNet, EfficientNet
# Detection : FasterRCNN, MTCNN, SSD, YOLO
# 支持深度学习领域基本的K8S调度以及高性能计算场景下常用的SLURM调度。
#
# """
# remove_extra_spaces_html_txt(text)
"""
去掉正文的引文引用marker
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import re
# from magic_pdf.libs.nlp_utils import NLPModels
# __NLP_MODEL = NLPModels()
def check_1(spans, cur_span_i):
"""Look at the last char of the previous span; if it is a period or comma, the current span is a citation marker"""
if cur_span_i==0:
return False  # not a citation marker
pre_span = spans[cur_span_i-1]
if not pre_span['chars']:  # guard against empty spans
return False
pre_char = pre_span['chars'][-1]['c']
if pre_char in ['。', ',', '.', ',']:
return True
return False
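# A minimal sketch of check_1 on toy spans: the span following a full stop is
# treated as a citation marker.
def _demo_check_1():
    spans = [
        {"chars": [{"c": "。"}]},  # previous span ends with a full stop
        {"chars": [{"c": "1"}]},   # candidate superscript span
    ]
    assert check_1(spans, 1) is True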
# def check_2(spans, cur_span_i):
# """Check the last word of the previous span; if it is longer than 5 chars, all letters, and all lowercase, the span is a citation marker"""
# pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # e.g. A. Bcde, L. Bcde, abbreviated person names
#
# if cur_span_i==0 and len(spans)>1:
# next_span = spans[cur_span_i+1]
# next_txt = "".join([c['c'] for c in next_span['chars']])
# result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
# if result in ["PERSON", "GPE", "ORG"]:
# return True
#
# if re.findall(pattern, next_txt):
# return True
#
# return False # not a citation marker
# elif cur_span_i==0 and len(spans)==1: # the marker occupies a whole line? delete with caution
# return False
#
# # if this span is the last span
# if cur_span_i==len(spans)-1:
# pre_span = spans[cur_span_i-1]
# pre_txt = "".join([c['c'] for c in pre_span['chars']])
# pre_word = pre_txt.split(' ')[-1]
# result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
# if result in ["PERSON", "GPE", "ORG"]:
# return True
#
# if re.findall(pattern, pre_txt):
# return True
#
# return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
# else: # neither the first nor the last span: check which neighboring word the marker is closer to, and attach it there
# pre_span = spans[cur_span_i-1]
# next_span = spans[cur_span_i+1]
# cur_span = spans[cur_span_i]
# # find the nearest word in the previous and the next span
# pre_distance = 10000 # a very large number
# next_distance = 10000 # a very large number
# for c in pre_span['chars'][::-1]:
# if c['c'].isalpha():
# pre_distance = cur_span['bbox'][0] - c['bbox'][2]
# break
# for c in next_span['chars']:
# if c['c'].isalpha():
# next_distance = c['bbox'][0] - cur_span['bbox'][2]
# break
#
# if pre_distance<next_distance:
# belong_to_span = pre_span
# else:
# belong_to_span = next_span
#
# txt = "".join([c['c'] for c in belong_to_span['chars']])
# pre_word = txt.split(' ')[-1]
# result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
# if result in ["PERSON", "GPE", "ORG"]:
# return True
#
# if re.findall(pattern, txt):
# return True
#
# return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
def check_3(spans, cur_span_i):
"""The superscript span contains brackets, '*', '-', or commas"""
# e.g. [2-3], [22]
# e.g. 2,3,4
cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip()
bad_char = ['[', ']', '*', ',']
if any([c in cur_span_txt for c in bad_char]) and any(character.isdigit() for character in cur_span_txt):
return True
# e.g. 2-3, a-b
patterns = [r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]']
for pattern in patterns:
match = re.match(pattern, cur_span_txt)
if match is not None:
return True
return False
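# A minimal sketch of check_3 on a toy span: digits inside square brackets.
def _demo_check_3():
    spans = [{"chars": [{"c": ch} for ch in "[12]"]}]
    assert check_3(spans, 0) is True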
def remove_citation_marker(with_char_text_blocks):
for blk in with_char_text_blocks:
for line in blk['lines']:
# if the line has fewer than two spans, skip it: a citation marker never occupies a whole line by itself
if len(line['spans'])<=1:
continue
# use the tallest span as the baseline for position comparison
max_hi_span = line['spans'][0]['bbox']
min_font_sz = 10000  # the smallest font size in the line
max_font_sz = 0  # the largest font size in the line
for s in line['spans']:
if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
max_hi_span = s['bbox']
if min_font_sz>s['size']:
min_font_sz = s['size']
if max_font_sz<s['size']:
max_font_sz = s['size']
base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
span_to_del = []
for i, span in enumerate(line['spans']):
span_hi = span['bbox'][3]-span['bbox'][1]
span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
span_font_sz = span['size']
if max_font_sz-span_font_sz<1:  # filter out body text by font size first; body text needs no further checks
continue
# guard against division by zero
if span_hi==0 or min_font_sz==0:
continue
if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
"""
1. If the char right before it is a period or comma, it is certainly a citation marker, not a formula.
2. If it follows a word (longer than 5 chars) rather than a short run of uppercase or lowercase letters, it is also a marker.
3. A superscript containing digits with commas, a digit-asterisk combination, or square brackets is almost always a marker.
4. Whether the marker belongs to the preceding or the following text is decided by distance: it attaches to whichever is closer.
"""
if (check_1(line['spans'], i) or
# check_2(line['spans'], i) or
check_3(line['spans'], i)
):
"""delete this marker: remove the span and update the line's text"""
span_to_del.append(span)
if len(span_to_del)>0:
for span in span_to_del:
line['spans'].remove(span)
line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']])
return with_char_text_blocks
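# A minimal end-to-end sketch of remove_citation_marker on one toy line:
# a body span followed by a smaller, raised "[3]" span after a full stop.
# All coordinates and sizes are illustrative assumptions.
def _demo_remove_citation_marker():
    body = {"bbox": [72, 100, 200, 112], "size": 10,
            "chars": [{"c": c} for c in "result。"]}
    marker = {"bbox": [201, 98, 210, 104], "size": 6,
              "chars": [{"c": c} for c in "[3]"]}
    blocks = [{"lines": [{"spans": [body, marker], "text": ""}]}]
    blocks = remove_citation_marker(blocks)
    return blocks[0]["lines"][0]["text"]  # "result。" with the marker removed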
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
interline_eq_info, raw_pymu_blocks,
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
layout_tree,
page_w, page_h, footnote_bboxes_tmp):
"""
"""
return_dict = {}
return_dict['para_blocks'] = {}
return_dict['preproc_blocks'] = text_blocks_preproc
return_dict['images'] = image_info
return_dict['tables'] = table_info
return_dict['interline_equations'] = interline_eq_info
return_dict['inline_equations'] = inline_eq_info
return_dict['layout_bboxes'] = layout_bboxes
return_dict['pymu_raw_blocks'] = raw_pymu_blocks
return_dict['global_statistic'] = {}
return_dict['droped_text_block'] = removed_text_blocks
return_dict['droped_image_block'] = removed_image_blocks
return_dict['droped_table_block'] = []
return_dict['image_backup'] = images_backup
return_dict['table_backup'] = []
return_dict['page_idx'] = page_id
return_dict['page_size'] = [page_w, page_h]
return_dict['_layout_tree'] = layout_tree  # kept to aid layout analysis and debugging
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
return return_dict
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
need_remove_spans_bboxes_dict):
return_dict = {
'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes,
'page_idx': page_id,
'page_size': [page_w, page_h],
'_layout_tree': layout_tree,
'images': images,
'tables': tables,
'interline_equations': interline_equations,
'inline_equations': inline_equations,
'droped_text_block': dropped_text_block,
'droped_image_block': dropped_image_block,
'droped_table_block': dropped_table_block,
'dropped_equation_block': dropped_equation_block,
'droped_bboxes': need_remove_spans_bboxes_dict,
}
return return_dict
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
return_dict = {
'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes,
'page_idx': page_id,
'page_size': [page_w, page_h],
'_layout_tree': layout_tree,
'images': images,
'tables': tables,
'interline_equations': interline_equations,
'discarded_blocks': discarded_blocks,
'need_drop': need_drop,
'drop_reason': drop_reason,
}
return return_dict
from loguru import logger
import configparser
import os
import time
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
from multiprocessing import Pool
# vllm client:
from magic_pdf.dict2md.ocr_vllm_client import PredictClient, compress_image
# plain (non-vllm) client:
# from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
# Chinese prompt sent to the model: "Parse the image content and return one coherent, precise written-Chinese description; stay faithful to the image, no line breaks, and do not replace nouns with pronouns."
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
client = None
def ocr_image(image_path):
start = time.time()
compress_image(image_path)
pid = os.getpid()  # tag the output with the worker process id
# global client
# logger.info(f'image_path:{image_path}')
generated_text = f'--【{pid}】--\n' + client.predict(image_path, text)
end = time.time()
logger.info(f'qwen parsed the table at {image_path} as: {generated_text}, took {end - start}s')
return generated_text
def ocr_cut_image_and_table(ocr_status, config_path, local_image_dir, spans, page, page_id, pdf_bytes_md5, imageWriter):
def return_path(path_type):
return join_path(pdf_bytes_md5, path_type)
global client
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
client = PredictClient(url)
# create the pool only after the client is set, so forked workers inherit a usable global client
pool = Pool(4)
if not ocr_status:
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info('The Qwen ocr service is not available; table parsing via qwen is disabled for now')
for span in spans:
span_type = span['type']
if span_type == ContentType.Image:
if not check_img_bbox(span['bbox']):
continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
imageWriter=imageWriter)
elif span_type == ContentType.Table:
if not check_img_bbox(span['bbox']):
continue
image_path = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
imageWriter=imageWriter)
image_path = join_path(local_image_dir, image_path)
if ocr_status:
txt = pool.apply_async(ocr_image, args=(image_path,)).get()  # .get() blocks, so each table is parsed effectively synchronously
span['image_path'] = str(txt)
else:
span['image_path'] = f"----------------图片路径为({image_path}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
pool.close()
pool.join()
return spans
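# A minimal sketch of the config file consumed above (hypothetical values):
#
#   [server]
#   ocr_server = http://127.0.0.1:8000/predict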
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
equation_inline_bboxes: list,
equation_interline_bboxes: list, imageWriter) -> dict:
"""
返回一个dict, key为bbox, 值是图片地址
"""
image_info = []
image_backup_info = []
table_info = []
inline_eq_info = []
interline_eq_info = []
# images are saved under: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
def return_path(type):
return join_path(pdf_bytes_md5, type)
for bbox in image_bboxes:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_info.append({"bbox": bbox, "image_path": image_path})
for bbox in images_overlap_backup:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_backup_info.append({"bbox": bbox, "image_path": image_path})
for bbox in table_bboxes:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
def check_img_bbox(bbox) -> bool:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"image_bboxes: 错误的box, {bbox}")
return False
return True
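# A minimal sketch of the bbox sanity check above:
def _demo_check_img_bbox():
    assert check_img_bbox([10, 10, 100, 50]) is True
    assert check_img_bbox([100, 10, 10, 50]) is False  # x0 >= x1 is rejected (and logged)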
from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio
from magic_pdf.libs.commons import fitz  # PyMuPDF
def __solve_contain_bboxs(all_bbox_list: list):
"""Check each pair of equation bboxes for containment or heavy overlap, and drop the smaller one"""
dump_list = []
for i in range(len(all_bbox_list)):
for j in range(i + 1, len(all_bbox_list)):
# the current pair of bboxes
bbox1 = all_bbox_list[i][:4]
bbox2 = all_bbox_list[j][:4]
# drop the smaller box
if _is_in(bbox1, bbox2):
dump_list.append(all_bbox_list[i])
elif _is_in(bbox2, bbox1):
dump_list.append(all_bbox_list[j])
else:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
if ratio > 0.7:
s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
if s2 > s1:
dump_list.append(all_bbox_list[i])
else:
dump_list.append(all_bbox_list[j])
# remove every element collected in dump_list
for item in dump_list:
while item in all_bbox_list:
all_bbox_list.remove(item)
return all_bbox_list
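# A minimal sketch of the containment dedup above: the inner bbox is dropped.
# The trailing element stands in for the latex payload.
def _demo_solve_contain_bboxs():
    inner = (10, 10, 20, 20, "inner_latex")
    outer = (5, 5, 30, 30, "outer_latex")
    assert __solve_contain_bboxs([inner, outer]) == [outer]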
def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page :fitz读取的当前页的内容
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
"""
DPI = 72 # use this resolution
pix = page.get_pixmap(dpi=DPI)
pageL = 0
pageR = int(pix.w)
pageU = 0
pageD = int(pix.h)
#--------- extract equations from json_from_DocXchain ---------#
equationEmbedding_from_DocXChain_bboxs = []
equationIsolated_from_DocXChain_bboxs = []
xf_json = json_from_DocXchain_obj
width_from_json = xf_json['page_info']['width']
height_from_json = xf_json['page_info']['height']
LR_scaleRatio = width_from_json / (pageR - pageL)
UD_scaleRatio = height_from_json / (pageD - pageU)
for xf in xf_json['layout_dets']:
# {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
L = xf['poly'][0] / LR_scaleRatio
U = xf['poly'][1] / UD_scaleRatio
R = xf['poly'][2] / LR_scaleRatio
D = xf['poly'][5] / UD_scaleRatio
# L += pageL  # some pages have an offset artBox, not at (0,0)
# R += pageL
# U += pageU
# D += pageU
L, R = min(L, R), max(L, R)
U, D = min(U, D), max(U, D)
# equation
img_suffix = f"{page_ID}_{int(L)}_{int(U)}_{int(R)}_{int(D)}"
if xf['category_id'] == 13 and xf['score'] >= 0.3:
latex_text = xf.get("latex", "EmptyInlineEquationResult")
debugable_latex_text = f"{latex_text}|{img_suffix}"
equationEmbedding_from_DocXChain_bboxs.append((L, U, R, D, latex_text))
if xf['category_id'] == 14 and xf['score'] >= 0.3:
latex_text = xf.get("latex", "EmptyInterlineEquationResult")
debugable_latex_text = f"{latex_text}|{img_suffix}"
equationIsolated_from_DocXChain_bboxs.append((L, U, R, D, latex_text))
#---------------------------------------- sort, number, and save -----------------------------------------#
equationEmbedding_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
equationEmbedding_from_DocXChain_names = []
equationEmbedding_ID = 0
equationIsolated_from_DocXChain_names = []
equationIsolated_ID = 0
for L, U, R, D, _ in equationEmbedding_from_DocXChain_bboxs:
if not(L < R and U < D):
continue
try:
# cur_equation = page.get_pixmap(clip=(L,U,R,D))
new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationEmbedding_ID)  # equation name
# cur_equation.save(res_dir_path + '/' + new_equation_name)  # save the equation image into the per-pdf folder
equationEmbedding_from_DocXChain_names.append(new_equation_name)  # keep the names in a list for later md references
equationEmbedding_ID += 1
except:
pass
for L, U, R, D, _ in equationIsolated_from_DocXChain_bboxs:
if not(L < R and U < D):
continue
try:
# cur_equation = page.get_pixmap(clip=(L,U,R,D))
new_equation_name = "equationIsolated_{}_{}.png".format(page_ID, equationIsolated_ID)  # equation name
# cur_equation.save(res_dir_path + '/' + new_equation_name)  # save the equation image into the per-pdf folder
equationIsolated_from_DocXChain_names.append(new_equation_name)  # keep the names in a list for later md references
equationIsolated_ID += 1
except:
pass
equationEmbedding_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
"""根据pdf可视区域,调整bbox的坐标"""
cropbox = page.cropbox
if cropbox[0]!=page.rect[0] or cropbox[1]!=page.rect[1]:
for eq_box in equationEmbedding_from_DocXChain_bboxs:
eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
for eq_box in equationIsolated_from_DocXChain_bboxs:
eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs)
return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs
from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
"""
:param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
:param page :fitz读取的当前页的内容
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
"""
#--------- extract footers from json_from_DocXchain ---------#
footer_bbox_from_DocXChain = []
xf_json = json_from_DocXchain_obj
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
# {0: 'title',
#  1: 'figure',
#  2: 'plain text',
#  3: 'header',
#  4: 'page number',
#  5: 'footnote',
#  6: 'footer',
#  7: 'table',
#  8: 'table caption',
#  9: 'figure caption',
#  10: 'equation',
#  11: 'full column',
#  12: 'sub column',
#  13: 'embedding',  # inline (embedded) equation
#  14: 'isolated'}  # single-line (display) equation
for xf in xf_json['layout_dets']:
L = xf['poly'][0] / horizontal_scale_ratio
U = xf['poly'][1] / vertical_scale_ratio
R = xf['poly'][2] / horizontal_scale_ratio
D = xf['poly'][5] / vertical_scale_ratio
# L += pageL  # some pages have an offset artBox, not at (0,0)
# R += pageL
# U += pageU
# D += pageU
L, R = min(L, R), max(L, R)
U, D = min(U, D), max(U, D)
if xf['category_id'] == 6 and xf['score'] >= 0.3:
footer_bbox_from_DocXChain.append((L, U, R, D))
footer_final_names = []
footer_final_bboxs = []
footer_ID = 0
for L, U, R, D in footer_bbox_from_DocXChain:
# cur_footer = page.get_pixmap(clip=(L,U,R,D))
new_footer_name = "footer_{}_{}.png".format(page_ID, footer_ID)  # footer name
# cur_footer.save(res_dir_path + '/' + new_footer_name)  # save the footer image into the per-pdf folder
footer_final_names.append(new_footer_name)  # keep the footer names in a list
footer_final_bboxs.append((L, U, R, D))
footer_ID += 1
footer_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
curPage_all_footer_bboxs = footer_final_bboxs
return curPage_all_footer_bboxs