Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...
Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
2df265c8 · zhougaofeng · 826086d2 · 2df265c8 · 2df265c8 · 2df265c8
Commit 2df265c8 authored Nov 12, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/pre_proc/detect_footer_header_by_statistics.py
+++ b/magic_pdf/pre_proc/detect_footer_header_by_statistics.py
+from collections import defaultdict
+
+from magic_pdf.libs.boxbase import calculate_iou
+
+
+def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
+    return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
+
+def is_single_line_block(block):
+    # Determine based on the width and height of the block
+    block_width = block["X1"] - block["X0"]
+    block_height = block["bbox"][3] - block["bbox"][1]
+
+    # If the height of the block is close to the average character height and the width is large, it is considered a single line
+    return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
+
+def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
+    """
+    This function gets the most common bboxes from the bboxes
+
+    Parameters
+    ----------
+    bboxes : list
+        bboxes
+    page_height : float
+        height of the page
+    position : str, optional
+        "top" or "bottom", by default "top"
+    threshold : float, optional
+        threshold, by default 0.25
+    num_bboxes : int, optional
+        number of bboxes to return, by default 3
+    min_frequency : int, optional
+        minimum frequency of the bbox, by default 2
+
+    Returns
+    -------
+    common_bboxes : list
+        common bboxes
+    """
+    # Filter bbox by position
+    if position == "top":
+        filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
+    else:
+        filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
+
+    # Find the most common bbox
+    bbox_count = defaultdict(int)
+    for bbox in filtered_bboxes:
+        bbox_count[tuple(bbox)] += 1
+
+    # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
+    common_bboxes = [
+        bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
+    ][:num_bboxes]
+    return common_bboxes
+
+def detect_footer_header2(result_dict, similarity_threshold=0.5):
+    """
+    This function detects the header and footer of the document.
+
+    Parameters
+    ----------
+    result_dict : dict
+        result dictionary
+
+    Returns
+    -------
+    result_dict : dict
+        result dictionary
+    """
+    # Traverse all blocks in the document
+    single_line_blocks = 0
+    total_blocks = 0
+    single_line_blocks = 0
+
+    for page_id, blocks in result_dict.items():
+        if page_id.startswith("page_"):
+            for block_key, block in blocks.items():
+                if block_key.startswith("block_"):
+                    total_blocks += 1
+                    if is_single_line_block(block):
+                        single_line_blocks += 1
+
+    # If there are no blocks, skip the header and footer detection
+    if total_blocks == 0:
+        print("No blocks found. Skipping header/footer detection.")
+        return result_dict
+
+    # If most of the blocks are single-line, skip the header and footer detection
+    if single_line_blocks / total_blocks > 0.5:  # 50% of the blocks are single-line
+        # print("Skipping header/footer detection for text-dense document.")
+        return result_dict
+
+    # Collect the bounding boxes of all blocks
+    all_bboxes = []
+    all_texts = []
+
+    for page_id, blocks in result_dict.items():
+        if page_id.startswith("page_"):
+            for block_key, block in blocks.items():
+                if block_key.startswith("block_"):
+                    all_bboxes.append(block["bbox"])
+
+    # Get the height of the page
+    page_height = max(bbox[3] for bbox in all_bboxes)
+
+    # Get the most common bbox lists for headers and footers
+    common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
+    common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
+
+    # Detect and mark headers and footers
+    for page_id, blocks in result_dict.items():
+        if page_id.startswith("page_"):
+            for block_key, block in blocks.items():
+                if block_key.startswith("block_"):
+                    bbox = block["bbox"]
+                    text = block["text"]
+
+                    is_header = compare_bbox_with_list(bbox, common_header_bboxes)
+                    is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
+                    block["is_header"] = int(is_header)
+                    block["is_footer"] = int(is_footer)
+
+    return result_dict
+
+
+def __get_page_size(page_sizes:list):
+    """
+    页面大小可能不一样
+    """
+    w = sum([w for w,h in page_sizes])/len(page_sizes)
+    h = sum([h for w,h  in page_sizes])/len(page_sizes)
+    return w, h
+
+def __calculate_iou(bbox1, bbox2):
+    iou = calculate_iou(bbox1, bbox2)
+    return iou
+
+def __is_same_pos(box1, box2, iou_threshold):
+    iou = __calculate_iou(box1, box2)
+    return iou >= iou_threshold
+
+
+def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int,  page_range_threshold=0.2, iou_threshold=0.9):
+    """
+    common bbox必须大于page_cnt的1/3
+    """
+    min_occurance_cnt = max(3, page_cnt//4)
+    header_det_bbox = []
+    footer_det_bbox = []
+    
+    hdr_same_pos_group = []
+    btn_same_pos_group = []
+    
+    page_w, page_h = __get_page_size(page_size)
+    top_y, bottom_y = page_w*page_range_threshold, page_h*(1-page_range_threshold)
+    
+    top_bbox = [b for b in bboxes if b[3]<top_y]
+    bottom_bbox = [b for b in bboxes if b[1]>bottom_y]
+    # 然后开始排序，寻找最经常出现的bbox, 寻找的时候如果IOU>iou_threshold就算是一个
+    for i in range(0, len(top_bbox)):
+        hdr_same_pos_group.append([top_bbox[i]])
+        for j in range(i+1, len(top_bbox)):
+            if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold):
+                #header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])]
+                hdr_same_pos_group[i].append(top_bbox[j])
+                
+    for i in range(0, len(bottom_bbox)):
+        btn_same_pos_group.append([bottom_bbox[i]])
+        for j in range(i+1, len(bottom_bbox)):
+            if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold):
+                #footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])]
+                btn_same_pos_group[i].append(bottom_bbox[j])
+                
+    # 然后看下每一组的bbox，是否符合大于page_cnt一定比例
+    hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt]
+    btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt]
+    
+    # 平铺2个list[list]
+    hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g]
+    btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g]
+    # 寻找hdr_same_pos_group中的box[3]最大值，btn_same_pos_group中的box[1]最小值
+    hdr_same_pos_group.sort(key=lambda b:b[3])
+    btn_same_pos_group.sort(key=lambda b:b[1])
+    
+    hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0
+    btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h
+    
+    header_det_bbox = [0, 0, page_w, hdr_y]
+    footer_det_bbox = [0, btn_y, page_w, page_h]
+    # logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}")
+    return header_det_bbox, footer_det_bbox, page_w, page_h
+    
+
+def drop_footer_header(pdf_info_dict:dict):
+    """
+    Detect header/footer zones statistically over the whole document and drop what lies in them.
+
+    The bboxes of every page's ``preproc_blocks``, ``images`` and ``image_backup``
+    entries are handed to ``get_most_common_bbox`` to obtain one header zone and
+    one footer zone. On each page, a text block or image whose ``bbox[3]`` is
+    <= the header zone's ``[3]`` is tagged ``"header"``; otherwise one whose
+    ``bbox[1]`` is >= the footer zone's ``[1]`` is tagged ``"footer"``. Tagged
+    items are appended to ``droped_text_block`` / ``droped_image_block`` and
+    removed from their original list, so ``pdf_info_dict`` is mutated in place.
+
+    :param pdf_info_dict: mapping whose values are per-page dicts providing
+        ``preproc_blocks``, ``images``, ``image_backup``, ``page_size``,
+        ``droped_text_block`` and ``droped_image_block``
+    :return: ``(header, footer)`` zones as ``[x0, y0, x1, y1]`` lists
+    """
+    header = []
+    footer = []
+    
+    all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
+    image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
+    page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
+    page_cnt = len(pdf_info_dict.keys()) # total number of pages
+    header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)
+    
+    """"
+    把范围扩展到页面水平的整个方向上
+    """  # string above reads: "extend the range across the full horizontal extent of the page"
+    if header:  # always truthy as written: get_most_common_bbox returns 4-element lists
+        header = [0, 0, page_w, header[3]+1]  # widen the header limit by 1
+        
+    if footer:  # always truthy as written, see above
+        footer = [0, footer[1]-1, page_w, page_h]  # widen the footer limit by 1
+        
+    # With the footer/header zones known, remove the content inside them from the text and images of every page
+    # Remove text blocks
+    
+    for _, page_info in pdf_info_dict.items():
+        header_text_blk = []
+        footer_text_blk = []
+        for blk in page_info['preproc_blocks']:
+            blk_bbox = blk['bbox']
+            if header and blk_bbox[3]<=header[3]:
+                blk['tag'] = "header"
+                header_text_blk.append(blk)
+            elif footer and blk_bbox[1]>=footer[1]:
+                blk['tag'] = "footer"
+                footer_text_blk.append(blk)
+                
+        # Record the tagged blocks in droped_text_block
+        page_info['droped_text_block'].extend(header_text_blk)
+        page_info['droped_text_block'].extend(footer_text_blk)
+        
+        for blk in header_text_blk:
+            page_info['preproc_blocks'].remove(blk)  # list.remove drops the first equal element
+        for blk in footer_text_blk:
+            page_info['preproc_blocks'].remove(blk)
+            
+        """接下来把footer、header上的图片也删除掉。图片包括正常的和backup的"""  # next: also drop images in the header/footer zones, both regular and backup ones
+        header_image = []
+        footer_image = []
+        
+        for image_info in page_info['images']:
+            img_bbox = image_info['bbox']
+            if header and img_bbox[3]<=header[3]:
+                image_info['tag'] = "header"
+                header_image.append(image_info)
+            elif footer and img_bbox[1]>=footer[1]:
+                image_info['tag'] = "footer"
+                footer_image.append(image_info)
+                
+        page_info['droped_image_block'].extend(header_image)
+        page_info['droped_image_block'].extend(footer_image)
+        
+        for img in header_image:
+            page_info['images'].remove(img)
+        for img in footer_image:
+            page_info['images'].remove(img)
+            
+        """接下来吧backup的图片也删除掉"""  # next: drop the backup images as well
+        header_image = []
+        footer_image = []
+        
+        for image_info in page_info['image_backup']:
+            img_bbox = image_info['bbox']
+            if header and img_bbox[3]<=header[3]:
+                image_info['tag'] = "header"
+                header_image.append(image_info)
+            elif footer and img_bbox[1]>=footer[1]:
+                image_info['tag'] = "footer"
+                footer_image.append(image_info)
+                
+        page_info['droped_image_block'].extend(header_image)  # backup images share droped_image_block with regular ones
+        page_info['droped_image_block'].extend(footer_image)
+        
+        for img in header_image:
+            page_info['image_backup'].remove(img)
+        for img in footer_image:
+            page_info['image_backup'].remove(img)
+            
+    return header, footer
--- a/magic_pdf/pre_proc/detect_footnote.py
+++ b/magic_pdf/pre_proc/detect_footnote.py
+from collections import Counter
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+
+
+def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
+    """
+    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
+    :param page :fitz读取的当前页的内容
+    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
+    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
+    """
+
+    #--------- 通过json_from_DocXchain来获取 footnote ---------#
+    footnote_bbox_from_DocXChain = []
+
+    xf_json = json_from_DocXchain_obj
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
+
+    # {0: 'title',  # 标题
+    # 1: 'figure', # 图片
+    #  2: 'plain text',  # 文本
+    #  3: 'header',      # 页眉
+    #  4: 'page number', # 页码
+    #  5: 'footnote',    # 脚注
+    #  6: 'footer',      # 页脚
+    #  7: 'table',       # 表格
+    #  8: 'table caption',  # 表格描述
+    #  9: 'figure caption', # 图片描述
+    #  10: 'equation',      # 公式
+    #  11: 'full column',   # 单栏
+    #  12: 'sub column',    # 多栏
+    #  13: 'embedding',     # 嵌入公式
+    #  14: 'isolated'}      # 单行公式
+    for xf in xf_json['layout_dets']:
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
+        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
+        # R += pageL
+        # U += pageU
+        # D += pageU
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        # if xf['category_id'] == 5 and xf['score'] >= 0.3:
+        if xf['category_id'] == 5 and xf['score'] >= 0.43:  # 新的footnote阈值
+            footnote_bbox_from_DocXChain.append((L, U, R, D))
+            
+    
+    footnote_final_names = []
+    footnote_final_bboxs = []
+    footnote_ID = 0
+    for L, U, R, D in footnote_bbox_from_DocXChain:
+        if debug_mode:
+            # cur_footnote = page.get_pixmap(clip=(L,U,R,D))
+            new_footnote_name = "footnote_{}_{}.png".format(page_ID, footnote_ID)    # 脚注name
+            # cur_footnote.save(md_bookname_save_path + '/' + new_footnote_name)           # 把脚注存储在新建的文件夹，并命名
+            footnote_final_names.append(new_footnote_name)                        # 把脚注的名字存在list中
+        footnote_final_bboxs.append((L, U, R, D))
+        footnote_ID += 1
+        
+
+    footnote_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
+    curPage_all_footnote_bboxs = footnote_final_bboxs
+    return curPage_all_footnote_bboxs
+
+
+def need_remove(block):
+    if 'lines' in block and len(block['lines']) > 0:
+        # block中只有一行，且该行文本全是大写字母，或字体为粗体bold关键词，SB关键词，把这个block捞回来
+        if len(block['lines']) == 1:
+            if 'spans' in block['lines'][0] and len(block['lines'][0]['spans']) == 1:
+                font_keywords = ['SB', 'bold', 'Bold']
+                if block['lines'][0]['spans'][0]['text'].isupper() or any(keyword in block['lines'][0]['spans'][0]['font'] for keyword in font_keywords):
+                    return True
+        for line in block['lines']:
+            if 'spans' in line and len(line['spans']) > 0:
+                for span in line['spans']:
+                    # 检测"keyword"是否在span中，忽略大小写
+                    if "keyword" in span['text'].lower():
+                        return True
+    return False
+
+def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
+    """
+    Pick footnote-like text blocks on a page by rule and return their bounding boxes.
+
+    A block is reported when all of the following hold:
+      * need_remove(block) is false,
+      * its top edge (bbox[1]) lies below 60% of the page height,
+      * its average line size is smaller than the most common line size on the page,
+      * it has fewer than 5 lines, or its dominant font differs from main_text_font.
+
+    Args:
+        remain_text_blocks (list): text blocks to examine. Each block provides
+            'bbox' and 'lines'; each line provides 'spans'; a span may carry
+            'size', and a span that carries 'font' must also carry 'text'.
+        page_height (float): height of the page.
+        page_id (int): id of the page; only pages with page_id <= 2 are examined.
+        main_text_font: font of the main text, compared with each block's
+            dominant font (the font covering the most characters in the block).
+
+    Returns:
+        list: bounding boxes of the blocks that match the rule. Empty when the
+            page is skipped, there are no blocks, or no span carries a size.
+    """
+    # To keep precision high, only the first three pages are screened for now.
+    if page_id > 2 or not remain_text_blocks:
+        return []
+
+    # Average size of every line on the page.
+    line_sizes = []
+    # One (block, average line size, dominant font) entry per block that has sized lines.
+    block_stats = []
+    for block in remain_text_blocks:
+        block_line_sizes = []
+        # Character count per font, so the dominant font is decided by the amount
+        # of text rather than by the number of spans.
+        block_fonts = Counter()
+        for line in block['lines']:
+            span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
+            if span_sizes:
+                line_size = sum(span_sizes) / len(span_sizes)
+                line_sizes.append(line_size)
+                block_line_sizes.append(line_size)
+            for span in line['spans']:
+                if 'font' in span and len(span['text']) > 0:
+                    block_fonts[span['font']] += len(span['text'])
+        if block_line_sizes:
+            block_size = sum(block_line_sizes) / len(block_line_sizes)
+            # A block may have sized spans but no usable font (missing 'font' or
+            # empty text); use None instead of failing on an empty counter.
+            dominant = block_fonts.most_common(1)
+            block_font = dominant[0][0] if dominant else None
+            block_stats.append((block, block_size, block_font))
+
+    # Without any sized line there is no main text size to compare against.
+    if not line_sizes:
+        return []
+    main_text_size = Counter(line_sizes).most_common(1)[0][0]
+
+    # Strict rule: need_remove() first discards blocks that are easily mistaken
+    # for footnotes, then position, size and (line count or font) must all match.
+    return [block['bbox'] for block, block_size, block_font in block_stats
+            if not need_remove(block)
+            and block['bbox'][1] > page_height * 0.6
+            and block_size < main_text_size
+            and (len(block['lines']) < 5 or block_font != main_text_font)]
+
+
+
--- a/magic_pdf/pre_proc/detect_header.py
+++ b/magic_pdf/pre_proc/detect_header.py
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+
+
+def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    Collect the header bounding boxes reported by the layout model for one page.
+
+    :param page_ID: int, index of the current page within the PDF document.
+    :param page: the page as read by fitz; it is only handed to get_scale_ratio.
+    :param json_from_DocXchain_obj: dict, layout-model output for this page. Its
+        'layout_dets' entries are read for 'poly', 'category_id' and 'score'.
+    :return: list of (L, U, R, D) tuples, sorted by (U, L).
+    """
+    x_ratio, y_ratio = get_scale_ratio(json_from_DocXchain_obj, page)
+
+    # Label map of the layout model:
+    #   0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
+    #   6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
+    #   11 full column, 12 sub column, 13 embedding (inline formula),
+    #   14 isolated (single-line formula)
+    header_boxes = []
+    for det in json_from_DocXchain_obj['layout_dets']:
+        poly = det['poly']
+        # Scale the model coordinates back to page coordinates.
+        x_a = poly[0] / x_ratio
+        y_a = poly[1] / y_ratio
+        x_b = poly[2] / x_ratio
+        y_b = poly[5] / y_ratio
+        # Only headers (category 3) with a score of at least 0.3 are kept.
+        if det['category_id'] == 3 and det['score'] >= 0.3:
+            header_boxes.append((min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b)))
+
+    # Order top-to-bottom, then left-to-right.
+    return sorted(header_boxes, key=lambda box: (box[1], box[0]))
+
--- a/magic_pdf/pre_proc/detect_images.py
+++ b/magic_pdf/pre_proc/detect_images.py
+import collections      # 统计库
+import re
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+
+
+#--------------------------------------- Tool Functions --------------------------------------#
+# 正则化，输入文本，输出只保留a-z,A-Z,0-9
+def remove_special_chars(s: str) -> str:
+    """Return s with every character outside a-z, A-Z and 0-9 removed."""
+    return re.sub(r"[^a-zA-Z0-9]", "", s)
+
+def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
+    """Return True when rect1 and rect2 have exactly the same four coordinates."""
+    rect1 = (L1, U1, R1, D1)
+    rect2 = (L2, U2, R2, D2)
+    return all(a == b for a, b in zip(rect1, rect2))
+
+def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
+    """Return True when rect2 lies inside rect1 (shared edges count as inside)."""
+    if not (L1 <= L2 <= R2 <= R1):
+        return False
+    return U1 <= U2 <= D2 <= D1
+
+def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
+    """Return True when rect1 and rect2 overlap; merely sharing an edge also counts."""
+    left, right = max(L1, L2), min(R1, R2)
+    top, bottom = max(U1, U2), min(D1, D2)
+    return left <= right and top <= bottom
+
+def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
+    """
+    Return the overlap area as a fraction of each rectangle's own area.
+
+    The result is (overlap / area of rect1, overlap / area of rect2). It is
+    (0, 0) when the rectangles are disjoint or when either has zero area.
+    """
+    left, right = max(L1, L2), min(R1, R2)
+    top, bottom = max(U1, U2), min(D1, D2)
+    if right < left or bottom < top:
+        return 0, 0
+    area_1 = (R1 - L1) * (D1 - U1)
+    area_2 = (R2 - L2) * (D2 - U2)
+    if area_1 == 0 or area_2 == 0:
+        return 0, 0
+    shared = (right - left) * (bottom - top)
+    return shared / area_1, shared / area_2
+
+def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
+    """
+    Return the overlapping length as a fraction of each segment's own length.
+
+    The result is (overlap / length of line1, overlap / length of line2). It is
+    (0, 0) when the segments are disjoint or when either has zero length.
+    """
+    start, end = max(L1, L2), min(R1, R2)
+    disjoint = start > end
+    degenerate = L1 == R1 or L2 == R2
+    if disjoint or degenerate:
+        return 0, 0
+    shared = end - start
+    return shared / (R1 - L1), shared / (R2 - L2)
+
+
+# 判断rect其实是一条line
+def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
+    """
+    Tell whether a rectangle is so thin that it is really a line.
+
+    A rectangle counts as a line when its width or height is at most 3, or when
+    one side is at least 30 times as long as the other.
+
+    :return: True for a line-like rectangle, False otherwise (always a bool).
+    """
+    width = R - L
+    height = D - U
+    if width <= 3 or height <= 3:
+        return True
+    # Both sides exceed 3 here, so neither division can hit zero.
+    return width / height >= 30 or height / width >= 30
+        return True
+
+
+
+def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]):
+    """
+    Collect the bounding boxes of all figure-like regions on one page.
+
+    Candidates come from three sources: embedded raster images
+    (page.get_images), vector drawings (page.get_drawings) grouped into
+    connected regions, and the 'figure' detections (category_id 1,
+    score >= 0.3) of the layout model. The candidates are then de-duplicated
+    and overlapping boxes are merged.
+
+    :param page_ID: int, index of the current page in the PDF. Not used by the
+        computation; kept so the signature stays compatible with callers.
+    :param page: the page as read by fitz.
+    :param json_from_DocXchain_obj: dict, layout-model output for this page. The
+        code reads 'page_info' ('width', 'height') and 'layout_dets' (entries
+        with 'poly', 'category_id' and 'score').
+    :param junk_img_bojids: image identifiers (first field of each
+        page.get_images() entry) to ignore. Only read, never mutated.
+    :return: list of [L, U, R, D] lists (left, upper, right, lower).
+    """
+    # ---- Page extent, taken from a 72-dpi pixmap of the page ----
+    DPI = 72
+    pix = page.get_pixmap(dpi=DPI)
+    pageL, pageU = 0, 0
+    pageR, pageD = int(pix.w), int(pix.h)
+
+    # ---- Bounding box (L, U, R, D) of every text span ----
+    textLine_blocks = []
+    blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
+    for block in blocks:
+        for line in block['lines']:
+            for span in line['spans']:
+                L, U, R, D = span['bbox']
+                L, R = min(L, R), max(L, R)
+                U, D = min(U, D), max(U, D)
+                textLine_blocks.append((L, U, R, D))
+
+    # ---- Raster images: location of every embedded image ----
+    img_bboxs = []
+    for raw_img in page.get_images():
+        # Skip images listed as junk.
+        if raw_img[0] in junk_img_bojids:
+            continue
+        try:
+            rects = page.get_image_rects(raw_img[0], transform=True)
+            rec = rects[0][0]
+            L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
+        except Exception:
+            # Best effort: an image whose rectangle cannot be read is ignored.
+            continue
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        # Drop images that leave the page or span its full width or height.
+        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
+            continue
+        if pageL == L and R == pageR:
+            continue
+        if pageU == U and D == pageD:
+            continue
+        img_bboxs.append((L, U, R, D))
+
+    # Overlapping images mean the reported size/position is unreliable; discard them.
+    imgs_ok = [True] * len(img_bboxs)
+    for i in range(len(img_bboxs)):
+        L1, U1, R1, D1 = img_bboxs[i]
+        for j in range(i + 1, len(img_bboxs)):
+            L2, U2, R2, D2 = img_bboxs[j]
+            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            if ratio_1 > 0 and ratio_2 > 0:
+                if ratio_1 == 1 and ratio_2 > 0.8:
+                    imgs_ok[i] = False
+                elif ratio_1 > 0.8 and ratio_2 == 1:
+                    imgs_ok[j] = False
+                elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
+                    imgs_ok[i] = False
+                    imgs_ok[j] = False
+                elif s1 / s2 > 5 and ratio_2 > 0.5:
+                    imgs_ok[j] = False
+                elif s2 / s1 > 5 and ratio_1 > 0.5:
+                    imgs_ok[i] = False
+    # Filter against the flags in one pass, so every surviving box is kept
+    # regardless of its position in the list.
+    img_bboxs = [bbox for bbox, ok in zip(img_bboxs, imgs_ok) if ok]
+
+    # ---- Vector drawings ("svgs") reported by fitz ----
+    svgs = page.get_drawings()
+    # De-duplicate: several drawings may share exactly the same rectangle.
+    svg_rect_visited = set()
+    available_svgIdx = []
+    for i in range(len(svgs)):
+        L, U, R, D = svgs[i]['rect'].irect
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        rect_key = (L, U, R, D)
+        if rect_key not in svg_rect_visited:
+            svg_rect_visited.add(rect_key)
+            available_svgIdx.append(i)
+
+    svgs = [svgs[i] for i in available_svgIdx]
+    svg_childs = [[] for _ in range(len(svgs))]       # svgs contained in svg i
+    svg_parents = [[] for _ in range(len(svgs))]      # svgs containing svg i
+    svg_overlaps = [[] for _ in range(len(svgs))]     # svgs overlapping svg i without being contained
+    svg_visited = [False for _ in range(len(svgs))]
+    # How strongly each svg exceeds the page: one point per page edge reached,
+    # or 4 when it sits inside the 20-unit margin yet covers >= 70% of the page.
+    svg_exceedPage = [0 for _ in range(len(svgs))]
+
+    for i in range(len(svgs)):
+        L, U, R, D = svgs[i]['rect'].irect
+        _, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
+        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
+            if ratio_2 >= 0.7:
+                svg_exceedPage[i] += 4
+        else:
+            if L <= pageL:
+                svg_exceedPage[i] += 1
+            if pageR <= R:
+                svg_exceedPage[i] += 1
+            if U <= pageU:
+                svg_exceedPage[i] += 1
+            if pageD <= D:
+                svg_exceedPage[i] += 1
+
+    # With two or more page-exceeding boxes the hand-written svg rules are not
+    # reliable, so svgs are ignored altogether.
+    if sum(1 for x in svg_exceedPage if x >= 1) >= 2:
+        svgs = []
+        svg_childs = []
+        svg_parents = []
+        svg_overlaps = []
+        svg_visited = []
+        svg_exceedPage = []
+
+    # Build the containment / overlap graph.
+    for i in range(len(svgs)):
+        L1, U1, R1, D1 = svgs[i]["rect"].irect
+        for j in range(len(svgs)):
+            if i == j:
+                continue
+            L2, U2, R2, D2 = svgs[j]["rect"].irect
+            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                svg_childs[i].append(j)
+                svg_parents[j].append(i)
+            elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                svg_overlaps[i].append(j)
+
+    # Decide the final svg regions: the outline of each connected group.
+    eps_ERROR = 5                # margin added around a region, in case the reported rect is imprecise
+    svg_final_bboxs = []
+    svg_final_visited = []       # set to True once a region has been absorbed by another one
+
+    # Visit the svgs from the largest area to the smallest.
+    svg_idxs = list(range(len(svgs)))
+    svg_idxs.sort(key=lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1]))
+
+    for i in svg_idxs:
+        if svg_visited[i]:
+            continue
+        L, U, R, D = svgs[i]['rect'].irect
+        # Lines never start a region and stay unvisited.
+        if check_rect_isLine(L, U, R, D):
+            continue
+        svg_visited[i] = True
+        # Only top-level svgs (without a containing parent) start a region.
+        if svg_parents[i]:
+            continue
+
+        # Number of elements inside the region, the outer frame itself excluded.
+        cur_block_element_cnt = len(svg_childs[i])
+        if svg_exceedPage[i] == 0:
+            # Tolerance: the svg may already lie inside an accepted region.
+            if any(pL <= L <= R <= pR and pU <= U <= D <= pD for pL, pU, pR, pD in svg_final_bboxs):
+                continue
+
+            # Breadth-first search over the overlap graph, growing the box.
+            q = collections.deque(svg_overlaps[i])
+            while q:
+                j = q.popleft()
+                svg_visited[j] = True
+                L2, U2, R2, D2 = svgs[j]['rect'].irect
+                L = min(L, L2)
+                R = max(R, R2)
+                U = min(U, U2)
+                D = max(D, D2)
+                cur_block_element_cnt += 1 + len(svg_childs[j])
+                for k in svg_overlaps[j]:
+                    if not svg_visited[k] and svg_exceedPage[k] == 0:
+                        svg_visited[k] = True
+                        q.append(k)
+        elif svg_exceedPage[i] <= 2:
+            # Tolerance: the svg may already lie inside an accepted region.
+            if any(sL <= L <= R <= sR and sU <= U <= D <= sD for sL, sU, sR, sD in svg_final_bboxs):
+                continue
+
+            # The frame itself exceeds the page; use the extent of its children instead.
+            L, U, R, D = pageR, pageD, pageL, pageU
+            for j in svg_childs[i]:
+                if svg_visited[j] or svg_exceedPage[j] >= 1:
+                    continue
+                svg_visited[j] = True
+                L2, U2, R2, D2 = svgs[j]['rect'].irect
+                L = min(L, L2)
+                R = max(R, R2)
+                U = min(U, U2)
+                D = max(D, D2)
+                cur_block_element_cnt += 1
+
+        # A region that degenerated into a line is not kept.
+        if check_rect_isLine(L, U, R, D):
+            continue
+        # A region with fewer than 3 elements is not kept.
+        if cur_block_element_cnt < 3:
+            continue
+        # A region enclosing 10 or more text spans is probably wrong.
+        contain_textLineBlock_cnt = 0
+        for L2, U2, R2, D2 in textLine_blocks:
+            if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2):
+                contain_textLineBlock_cnt += 1
+        if contain_textLineBlock_cnt >= 10:
+            continue
+
+        svg_final_bboxs.append((L, U, R, D))
+        svg_final_visited.append(False)
+
+    # The regions may still contain or adjoin each other; merge them further.
+    for i in range(len(svg_final_bboxs)):
+        L1, U1, R1, D1 = svg_final_bboxs[i]
+        for j in range(i + 1, len(svg_final_bboxs)):
+            L2, U2, R2, D2 = svg_final_bboxs[j]
+            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                svg_final_visited[j] = True
+                continue
+            # Side by side: the vertical extents mostly coincide and the gap is small.
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
+            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
+                if abs(L2 - R1) >= 20:
+                    continue
+                # Carry the merged box forward so later merges build on it
+                # instead of overwriting it.
+                L1, U1, R1, D1 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
+                svg_final_bboxs[i] = (L1, U1, R1, D1)
+                svg_final_visited[j] = True
+                continue
+            # Stacked: the horizontal extents mostly coincide and the gap is small.
+            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
+            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
+                if abs(U2 - D1) >= 20:
+                    continue
+                L1, U1, R1, D1 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
+                svg_final_bboxs[i] = (L1, U1, R1, D1)
+                svg_final_visited[j] = True
+
+    # Every surviving region is listed twice: as-is and enlarged by the margin.
+    svg_final_bboxs_2 = []
+    for idx, (L, U, R, D) in enumerate(svg_final_bboxs):
+        if svg_final_visited[idx]:
+            continue
+        svg_final_bboxs_2.append((L, U, R, D))
+        svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))
+
+    # ---- Figure boxes from the layout model ----
+    figure_bbox_from_DocXChain = []
+    figure_from_DocXChain_visited = []                  # True once matched by an image or svg
+    figure_bbox_from_DocXChain_overlappedRatio = []     # accumulated partial overlap
+
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    LR_scaleRatio = width_from_json / (pageR - pageL)
+    UD_scaleRatio = height_from_json / (pageD - pageU)
+
+    for xf in xf_json['layout_dets']:
+        # Scale the model coordinates back to page coordinates.
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        # category_id 1 is 'figure'.
+        if xf["category_id"] == 1 and xf['score'] >= 0.3:
+            figure_bbox_from_DocXChain.append((L, U, R, D))
+            figure_from_DocXChain_visited.append(False)
+            figure_bbox_from_DocXChain_overlappedRatio.append(0.0)
+
+    # ---- Match the model figures against the images ----
+    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
+        for L2, U2, R2, D2 in img_bboxs:
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                figure_from_DocXChain_visited[i] = True
+            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                if s2 / s1 > 0.8:
+                    figure_from_DocXChain_visited[i] = True
+            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
+                if s1 / s2 > 0.8:
+                    figure_from_DocXChain_visited[i] = True
+            else:
+                # A substantial overlap also counts as a match.
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
+                    figure_from_DocXChain_visited[i] = True
+                else:
+                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
+
+    # ---- Match the model figures against the svg regions ----
+    svg_final_bboxs_2_badIdxs = set()
+    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
+        for j, (L2, U2, R2, D2) in enumerate(svg_final_bboxs_2):
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                figure_from_DocXChain_visited[i] = True
+            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
+                figure_from_DocXChain_visited[i] = True
+            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
+                if s1 / s2 > 0.7:
+                    figure_from_DocXChain_visited[i] = True
+                else:
+                    # The svg is much larger than the figure: mark it as bad.
+                    svg_final_bboxs_2_badIdxs.add(j)
+            else:
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
+                    figure_from_DocXChain_visited[i] = True
+                else:
+                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
+
+    # Drop the svg regions marked as bad.
+    svg_final_bboxs_2 = [box for j, box in enumerate(svg_final_bboxs_2) if j not in svg_final_bboxs_2_badIdxs]
+
+    # Model figures not covered by any image or svg (accumulated overlap < 0.7) are kept as-is.
+    figure_only_from_DocXChain_bboxs = []
+    for i, figure_bbox in enumerate(figure_bbox_from_DocXChain):
+        if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7:
+            figure_from_DocXChain_visited[i] = True
+        if not figure_from_DocXChain_visited[i]:
+            figure_from_DocXChain_visited[i] = True
+            figure_only_from_DocXChain_bboxs.append(figure_bbox)
+
+    img_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    figure_only_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    # NOTE(review): the combined list uses svg_final_bboxs (before the
+    # absorbed/bad regions were removed), not svg_final_bboxs_2. This looks
+    # unintended, but switching would change the returned boxes (enlarged
+    # copies included) - confirm the intent before changing it.
+    curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs
+
+    # ---- Final de-duplication over all candidates ----
+    curPage_all_fig_bboxs.sort(key=lambda LURD: ((LURD[2] - LURD[0]) * (LURD[3] - LURD[1]), LURD[0], LURD[1]))
+    # Collapse exact duplicates first; otherwise two identical boxes would each
+    # be treated as contained in the other and both would be discarded below.
+    curPage_all_fig_bboxs = list(dict.fromkeys(curPage_all_fig_bboxs))
+
+    # First pass: drop boxes contained in, or mostly covered by, another box.
+    final_duplicate = set()
+    for i in range(len(curPage_all_fig_bboxs)):
+        L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
+        for j in range(len(curPage_all_fig_bboxs)):
+            if i == j:
+                continue
+            L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
+            if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
+                final_duplicate.add((L1, U1, R1, D1))
+            else:
+                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
+                    final_duplicate.add((L1, U1, R1, D1))
+
+    curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]
+
+    # Second pass: replace pairs of overlapping boxes by their union.
+    final_duplicate = set()
+    final_synthetic_bboxs = []
+    for i in range(len(curPage_all_fig_bboxs)):
+        L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
+        for j in range(len(curPage_all_fig_bboxs)):
+            if i == j:
+                continue
+            L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+            union_ok = False
+            if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
+                union_ok = True
+            if ratio_1 > 0.2 and s2 / s1 > 5:
+                union_ok = True
+            # The centre of one box lies inside the other.
+            if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
+                union_ok = True
+            if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
+                union_ok = True
+            if union_ok:
+                final_duplicate.add((L1, U1, R1, D1))
+                final_duplicate.add((L2, U2, R2, D2))
+                final_synthetic_bboxs.append((min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)))
+
+    curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
+    final_synthetic_bboxs = list(set(final_synthetic_bboxs))
+
+    # Third pass: the unions may overlap again; fold them together pairwise.
+    image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]
+    droped_img_idx = set()
+    for i in range(len(image_bboxes)):
+        # A box already folded into an earlier one must not absorb others,
+        # otherwise the absorbed boxes would be lost together with it.
+        if i in droped_img_idx:
+            continue
+        for j in range(i + 1, len(image_bboxes)):
+            if j in droped_img_idx:
+                continue
+            # Compare against the current (possibly already grown) box i.
+            L1, U1, R1, D1 = image_bboxes[i]
+            L2, U2, R2, D2 = image_bboxes[j]
+            s1 = abs(R1 - L1) * abs(D1 - U1)
+            s2 = abs(R2 - L2) * abs(D2 - U2)
+            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
+            union_ok = False
+            if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
+                union_ok = True
+            if ratio_1 > 0.2 and s2 / s1 > 5:
+                union_ok = True
+            if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
+                union_ok = True
+            if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
+                union_ok = True
+            if union_ok:
+                image_bboxes[i] = [min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)]
+                droped_img_idx.add(j)
+
+    new_images = [box for i, box in enumerate(image_bboxes) if i not in droped_img_idx]
+
+    images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
+    images = images1 + new_images
+    return images
+
--- a/magic_pdf/pre_proc/detect_page_number.py
+++ b/magic_pdf/pre_proc/detect_page_number.py
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+
+
+def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    Collect the page-number bounding boxes reported by the layout model for one page.
+
+    :param page_ID: int, index of the current page in the pdf. Kept for interface compatibility; the result does not depend on it.
+    :param page: the current page as read by fitz; only forwarded to get_scale_ratio.
+    :param json_from_DocXchain_obj: dict, layout-model output for this page. The code reads
+        ``layout_dets`` and, for every detection, ``poly``, ``category_id`` and ``score``.
+    :return: list of ``(L, U, R, D)`` tuples, one per detection with category_id 4 and
+        score >= 0.3, sorted by upper edge and then by left edge.
+    """
+    xf_json = json_from_DocXchain_obj
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
+
+    # category_id legend of the layout model:
+    #  0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number',
+    #  5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption',
+    #  10: 'equation', 11: 'full column', 12: 'sub column',
+    #  13: 'embedding' (inline formula), 14: 'isolated' (single-line formula)
+    pageNo_bboxs = []
+    for xf in xf_json['layout_dets']:
+        # poly[0], poly[1] and poly[2], poly[5] are used as two opposite corners; they are
+        # divided by the ratios returned by get_scale_ratio.
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
+        # normalise so that L <= R and U <= D
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 4 and xf['score'] >= 0.3:
+            pageNo_bboxs.append((L, U, R, D))
+
+    # top-to-bottom, then left-to-right
+    pageNo_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    return pageNo_bboxs
+
--- a/magic_pdf/pre_proc/detect_tables.py
+++ b/magic_pdf/pre_proc/detect_tables.py
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+
+
+def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
+    """
+    Collect the table bounding boxes reported by the layout model for one page.
+
+    :param page_ID: int, index of the current page in the pdf. Kept for interface compatibility; the result does not depend on it.
+    :param page: the current page as read by fitz; rendered once at 72 dpi to obtain its pixel width and height.
+    :param json_from_DocXchain_obj: dict, layout-model output for this page. The code reads
+        ``page_info.width``, ``page_info.height`` and ``layout_dets`` (``poly``, ``category_id``, ``score``).
+    :return: list of ``(L, U, R, D)`` tuples, one per detection with category_id 7 and
+        score >= 0.3, sorted by upper edge and then by left edge.
+    """
+    DPI = 72  # use this resolution
+    pix = page.get_pixmap(dpi=DPI)
+    page_width = int(pix.w)
+    page_height = int(pix.h)
+
+    xf_json = json_from_DocXchain_obj
+    width_from_json = xf_json['page_info']['width']
+    height_from_json = xf_json['page_info']['height']
+    # ratio between the size recorded in the model output and the rendered page size
+    LR_scaleRatio = width_from_json / page_width
+    UD_scaleRatio = height_from_json / page_height
+
+    # category_id legend of the layout model:
+    #  0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number',
+    #  5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption',
+    #  10: 'equation', 11: 'full column', 12: 'sub column',
+    #  13: 'embedding' (inline formula), 14: 'isolated' (single-line formula)
+    table_bboxs = []
+    for xf in xf_json['layout_dets']:
+        L = xf['poly'][0] / LR_scaleRatio
+        U = xf['poly'][1] / UD_scaleRatio
+        R = xf['poly'][2] / LR_scaleRatio
+        D = xf['poly'][5] / UD_scaleRatio
+        # normalise so that L <= R and U <= D
+        L, R = min(L, R), max(L, R)
+        U, D = min(U, D), max(U, D)
+        if xf['category_id'] == 7 and xf['score'] >= 0.3:
+            table_bboxs.append((L, U, R, D))
+
+    # top-to-bottom, then left-to-right
+    table_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
+    return table_bboxs
+
--- a/magic_pdf/pre_proc/equations_replace.py
+++ b/magic_pdf/pre_proc/equations_replace.py
+"""
+对pymupdf返回的结构里的公式进行替换，替换为模型识别的公式结果
+"""
+
+from magic_pdf.libs.commons import fitz
+import json
+import os
+from pathlib import Path
+from loguru import logger
+from magic_pdf.libs.ocr_content_type import ContentType
+
+# Markers written into the "type" and "font" fields of the equation spans fabricated in this module.
+TYPE_INLINE_EQUATION = ContentType.InlineEquation
+TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
+
+
+def combine_chars_to_pymudict(block_dict, char_dict):
+    """
+    Add the per-character data of char_dict to the spans of the block-level pymupdf structure.
+
+    Blocks and lines are matched by their bbox (converted to a tuple so that list-typed bboxes
+    work as well); spans are matched by their position inside the line.
+
+    :param block_dict: list of block dicts, modified in place: every span gets a "chars" entry.
+    :param char_dict: list of block dicts whose spans carry "chars".
+    :return: block_dict.
+    :raises KeyError: if a block or line bbox of block_dict has no counterpart in char_dict.
+    """
+    # block_dict has been cropped, so blocks are aligned with char_dict by bbox, not by index
+    char_map = {tuple(item["bbox"]): item for item in char_dict}
+
+    for block in block_dict:
+        char_dict_item = char_map[tuple(block["bbox"])]
+        char_dict_map = {tuple(item["bbox"]): item for item in char_dict_item["lines"]}
+        for line in block["lines"]:
+            with_char_lines = char_dict_map[tuple(line["bbox"])]
+            for k, span in enumerate(line["spans"]):
+                try:
+                    chars = with_char_lines["spans"][k]["chars"]
+                except (IndexError, KeyError, TypeError):
+                    # No matching span with chars: log the line we matched against and fall
+                    # back to an empty list, so the span never inherits the chars of the
+                    # previously processed span.
+                    logger.error(with_char_lines)
+                    chars = []
+
+                span["chars"] = chars
+
+    return block_dict
+
+
+def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
+    """
+    Return the area shared by bbox1 and min_bbox divided by the area of min_bbox.
+
+    Boxes are indexed as (x0, y0, x1, y1). The result is 0.0 when the boxes do not intersect
+    and 0 when min_bbox has no area.
+    """
+    # Edges of the intersection rectangle: the innermost edge of each pair
+    left, top = max(bbox1[0], min_bbox[0]), max(bbox1[1], min_bbox[1])
+    right, bottom = min(bbox1[2], min_bbox[2]), min(bbox1[3], min_bbox[3])
+
+    if right < left or bottom < top:
+        return 0.0
+
+    denominator = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0])
+    if denominator == 0:
+        return 0
+
+    shared_area = (right - left) * (bottom - top)
+    return shared_area / denominator
+
+
+def _is_xin(bbox1, bbox2):
+    """
+    Return True when more than 60% of the smaller of the two boxes is covered by the other.
+
+    On equal areas bbox2 is treated as the smaller box.
+    """
+    def _area(box):
+        return abs(box[2] - box[0]) * abs(box[3] - box[1])
+
+    if _area(bbox1) < _area(bbox2):
+        smaller, larger = bbox1, bbox2
+    else:
+        smaller, larger = bbox2, bbox1
+
+    return calculate_overlap_area_2_minbox_area_ratio(larger, smaller) > 0.6
+
+
+def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
+    """
+    Drop the text blocks that lie (almost) entirely inside an interline-equation box.
+
+    A block is dropped when at least 70% of its bbox area is covered by an equation bbox.
+    text_blocks is modified in place and also returned.
+    """
+    for equation in interline_bboxes:
+        covered = [
+            candidate
+            for candidate in text_blocks
+            if calculate_overlap_area_2_minbox_area_ratio(equation["bbox"], candidate["bbox"]) >= 0.7
+        ]
+        # remove after the scan so the list is not mutated while it is being iterated
+        for candidate in covered:
+            text_blocks.remove(candidate)
+
+    return text_blocks
+
+
+def _is_in_or_part_overlap(box1, box2) -> bool:
+    """
+    Tell whether two bboxes overlap partially or one contains the other.
+
+    Boxes that merely touch count as overlapping; a None box never overlaps.
+    """
+    if box1 is None or box2 is None:
+        return False
+
+    left_a, top_a, right_a, bottom_a = box1
+    left_b, top_b, right_b, bottom_b = box2
+
+    if right_a < left_b:  # box1 is entirely to the left of box2
+        return False
+    if left_a > right_b:  # box1 is entirely to the right of box2
+        return False
+    if bottom_a < top_b:  # box1 is entirely above box2
+        return False
+    if top_a > bottom_b:  # box1 is entirely below box2
+        return False
+    return True
+
+
+def remove_text_block_overlap_interline_equation_bbox(
+    interline_eq_bboxes, pymu_block_list
+):
+
+    """Remove the text content that overlaps an interline-equation box and
+    recompute the bboxes of what is left.
+
+    A char is removed when more than half of its bbox is covered by any equation bbox.
+    Spans, lines and blocks that end up empty are removed as well; the bbox of every
+    surviving span, line and block is recomputed from its remaining children.
+    pymu_block_list is modified in place and returned (a new empty list is returned
+    when nothing is left)."""
+    deleted_block = []
+    for text_block in pymu_block_list:
+        deleted_line = []
+        for line in text_block["lines"]:
+            deleted_span = []
+            for span in line["spans"]:
+                deleted_chars = []
+                for char in span["chars"]:
+                    if any(
+                            [
+                                (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
+                                for eq_bbox in interline_eq_bboxes
+                            ]
+                    ):
+                        deleted_chars.append(char)
+                # Remove the collected chars after the scan, so the list is not mutated while iterated
+                for char in deleted_chars:
+                    span["chars"].remove(char)
+                # Recompute the extent of this span
+                if len(span["chars"]) == 0:  # no char left: drop this span
+                    deleted_span.append(span)
+                else:
+                    span["bbox"] = (
+                        min([b["bbox"][0] for b in span["chars"]]),
+                        min([b["bbox"][1] for b in span["chars"]]),
+                        max([b["bbox"][2] for b in span["chars"]]),
+                        max([b["bbox"][3] for b in span["chars"]]),
+                    )
+
+            # Drop the emptied spans, then check whether the line itself is empty
+            for span in deleted_span:
+                line["spans"].remove(span)
+            if len(line["spans"]) == 0:  # no span left: drop this line
+                deleted_line.append(line)
+            else:
+                line["bbox"] = (
+                    min([b["bbox"][0] for b in line["spans"]]),
+                    min([b["bbox"][1] for b in line["spans"]]),
+                    max([b["bbox"][2] for b in line["spans"]]),
+                    max([b["bbox"][3] for b in line["spans"]]),
+                )
+
+        # Drop the emptied lines, then check whether the block can be deleted
+        for line in deleted_line:
+            text_block["lines"].remove(line)
+        if len(text_block["lines"]) == 0:  # no line left: drop this block
+            deleted_block.append(text_block)
+        else:
+            text_block["bbox"] = (
+                min([b["bbox"][0] for b in text_block["lines"]]),
+                min([b["bbox"][1] for b in text_block["lines"]]),
+                max([b["bbox"][2] for b in text_block["lines"]]),
+                max([b["bbox"][3] for b in text_block["lines"]]),
+            )
+
+    # Remove the emptied text blocks
+    for block in deleted_block:
+        pymu_block_list.remove(block)
+    if len(pymu_block_list) == 0:
+        return []
+
+    return pymu_block_list
+
+
+def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
+    """
+    Append one fabricated text block per interline equation to pymu_block_list.
+
+    Each fabricated block has a single line with a single span that carries the equation's
+    latex; block, line and span all share the equation's bbox. The list is modified in
+    place and nothing is returned.
+    """
+    for eq in interline_eq_bboxes:
+        bbox = eq["bbox"]
+        latex_content = eq["latex"]
+        equation_span = {
+            "size": 9.962599754333496,
+            "type": TYPE_INTERLINE_EQUATION,
+            "flags": 4,
+            "font": TYPE_INTERLINE_EQUATION,
+            "color": 0,
+            "ascender": 0.9409999847412109,
+            "descender": -0.3050000071525574,
+            "latex": latex_content,
+            "origin": [bbox[0], bbox[1]],
+            "bbox": bbox,
+        }
+        equation_line = {
+            "spans": [equation_span],
+            "wmode": 0,
+            "dir": [1.0, 0.0],
+            "bbox": bbox,
+        }
+        # "number" is the index the new block gets in the list
+        pymu_block_list.append(
+            {
+                "number": len(pymu_block_list),
+                "type": 0,
+                "bbox": bbox,
+                "lines": [equation_line],
+            }
+        )
+
+
+def x_overlap_ratio(box1, box2):
+    """
+    Return the width shared by the two boxes in the x direction divided by the width of box2.
+
+    Returns 0 when box2 has zero width.
+    """
+    left1, _, right1, _ = box1
+    left2, _, right2, _ = box2
+
+    # width of the x-range common to both boxes (0 when they are apart)
+    shared_width = max(min(right1, right2) - max(left1, left2), 0)
+
+    # the ratio is taken relative to box2
+    width2 = right2 - left2
+    if width2 != 0:
+        return shared_width / width2
+    return 0
+
+
+def __is_x_dir_overlap(bbox1, bbox2):
+    """Tell whether the x-ranges of the two boxes intersect (touching counts)."""
+    if bbox1[2] < bbox2[0]:  # bbox1 ends before bbox2 starts
+        return False
+    return not bbox1[0] > bbox2[2]
+
+
+def __y_overlap_ratio(box1, box2):
+    """
+    Return the height shared by the two boxes in the y direction divided by the height of box1.
+
+    Returns 0 when box1 has zero height.
+    """
+    _, top1, _, bottom1 = box1
+    _, top2, _, bottom2 = box2
+
+    # height of the y-range common to both boxes (0 when they are apart)
+    shared_height = max(min(bottom1, bottom2) - max(top1, top2), 0)
+
+    # the ratio is taken relative to box1
+    height1 = bottom1 - top1
+    if height1 != 0:
+        return shared_height / height1
+    return 0
+
+
+def replace_line_v2(eqinfo, line):
+    """
+    Replace the characters of one line that an inline equation covers with one equation span.
+
+    Every char of the line that overlaps the equation bbox in the x direction is collected.
+    The first and the last collected char are kept only when at least 51% of their width lies
+    inside the equation. The x-range [x0, x1] of the remaining chars is then used to delete the
+    spans lying entirely inside it, to insert the equation span, and to cut the first and the
+    last overlapping span down to the chars left of x0 / right of x1.
+
+    :param eqinfo: equation dict; the code reads eqinfo["bbox"] and eqinfo["latex"].
+    :param line: line dict, modified in place (line["spans"] and the "chars", "text" and
+        "bbox" of the spans that get cut).
+    :return: True when an equation span was inserted, False when no char was selected
+        (the caller can then try the next line).
+    """
+    first_overlap_span = -1
+    first_overlap_span_idx = -1
+    last_overlap_span = -1
+    delete_chars = []
+    for i, span in enumerate(line["spans"]):
+        if "chars" not in span:
+            continue
+
+        if span.get("_type", None) is not None:
+            continue  # skipped: a span carrying "_type" is treated as an already fabricated one
+
+        for char in span["chars"]:
+            if __is_x_dir_overlap(eqinfo["bbox"], char["bbox"]):
+                if first_overlap_span_idx == -1:
+                    first_overlap_span = span
+                    first_overlap_span_idx = i
+                last_overlap_span = span
+                delete_chars.append(char)
+
+    # The first and the last char straddle the equation border: keep them only when they
+    # belong more to the equation than to the ordinary text.
+    if len(delete_chars) > 0:
+        first_char_bbox = delete_chars[0]["bbox"]
+        if x_overlap_ratio(eqinfo["bbox"], first_char_bbox) < 0.51:
+            del delete_chars[0]
+    if len(delete_chars) > 0:
+        last_char_bbox = delete_chars[-1]["bbox"]
+        if x_overlap_ratio(eqinfo["bbox"], last_char_bbox) < 0.51:
+            # pop() drops the last element itself; list.remove() would drop the first
+            # element that compares equal to it
+            delete_chars.pop()
+
+    if not delete_chars:
+        # nothing to replace on this line
+        return False
+
+    # Real x-range covered by the chars that are being replaced
+    x0 = min([b["bbox"][0] for b in delete_chars])
+    x1 = max([b["bbox"][2] for b in delete_chars])
+
+    # Delete the spans that lie entirely between x0 and x1
+    delete_span = [
+        span
+        for span in line["spans"]
+        if x0 <= span["bbox"][0] and span["bbox"][2] <= x1
+    ]
+    for span in delete_span:
+        line["spans"].remove(span)
+
+    eq_bbox = eqinfo["bbox"]
+    equation_span = {
+        "size": 9.962599754333496,
+        "type": TYPE_INLINE_EQUATION,
+        "flags": 4,
+        "font": TYPE_INLINE_EQUATION,
+        "color": 0,
+        "ascender": 0.9409999847412109,
+        "descender": -0.3050000071525574,
+        "latex": eqinfo['latex'],
+        "origin": [x0, eq_bbox[1]],
+        # x-extent from the replaced chars, y-extent from the equation box
+        "bbox": [x0, eq_bbox[1], x1, eq_bbox[3]],
+        "chars": delete_chars,
+        "_eq_bbox": eq_bbox,
+    }
+    # NOTE(review): the index was recorded before delete_span was removed from the line; if a
+    # span at or before that index was deleted, the equation lands one slot later - confirm
+    # that downstream code re-sorts spans or that this is intended.
+    line["spans"].insert(first_overlap_span_idx + 1, equation_span)  # put the equation in
+
+    # Cut the first and the last overlapping span. Both char lists are computed before either
+    # span is modified, because the two may be the same span.
+    first_span_chars = [
+        char
+        for char in first_overlap_span["chars"]
+        if (char["bbox"][2] + char["bbox"][0]) / 2 < x0
+    ]
+    tail_span_chars = [
+        char
+        for char in last_overlap_span["chars"]
+        if (char["bbox"][0] + char["bbox"][2]) / 2 > x1
+    ]
+
+    if len(first_span_chars) > 0:
+        first_overlap_span["chars"] = first_span_chars
+        first_overlap_span["text"] = "".join([char["c"] for char in first_span_chars])
+        first_overlap_span["bbox"] = (
+            first_overlap_span["bbox"][0],
+            first_overlap_span["bbox"][1],
+            max([ch["bbox"][2] for ch in first_span_chars]),
+            first_overlap_span["bbox"][3],
+        )
+    else:
+        # nothing of the first span remains left of the equation: remove it
+        if first_overlap_span not in delete_span:
+            line["spans"].remove(first_overlap_span)
+
+    if len(tail_span_chars) > 0:
+        tail_text = "".join([char["c"] for char in tail_span_chars])
+        tail_x0 = min([ch["bbox"][0] for ch in tail_span_chars])
+
+        if last_overlap_span == first_overlap_span:
+            # The equation sits inside a single span: the tail needs a span of its own
+            last_span_to_insert = last_overlap_span.copy()
+            last_span_to_insert["chars"] = tail_span_chars
+            last_span_to_insert["text"] = tail_text
+            if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]:
+                last_span_to_insert["bbox"] = (
+                    tail_x0,
+                    min([ch["bbox"][1] for ch in tail_span_chars]),
+                    max([ch["bbox"][2] for ch in tail_span_chars]),
+                    max([ch["bbox"][3] for ch in tail_span_chars]),
+                )
+            else:
+                last_span_to_insert["bbox"] = (
+                    tail_x0,
+                    last_overlap_span["bbox"][1],
+                    last_overlap_span["bbox"][2],
+                    last_overlap_span["bbox"][3],
+                )
+            # insert right after the equation span
+            equation_idx = line["spans"].index(equation_span)
+            line["spans"].insert(equation_idx + 1, last_span_to_insert)
+        else:
+            # a different span: shrink it in place to its tail
+            last_overlap_span["chars"] = tail_span_chars
+            last_overlap_span["text"] = tail_text
+            last_overlap_span["bbox"] = (
+                tail_x0,
+                last_overlap_span["bbox"][1],
+                last_overlap_span["bbox"][2],
+                last_overlap_span["bbox"][3],
+            )
+    else:
+        # nothing of the last span remains right of the equation: remove it
+        if (
+            last_overlap_span not in delete_span
+            and last_overlap_span != first_overlap_span
+        ):
+            line["spans"].remove(last_overlap_span)
+
+    return True
+
+
+def replace_eq_blk(eqinfo, text_block):
+    """
+    Replace one inline equation inside a text block.
+
+    Lines are tried in order; the first line that both matches the equation position and
+    accepts the replacement ends the search. Returns True on success, False when no line of
+    the block took the equation.
+    """
+    for line in text_block["lines"]:
+        line_bbox = line["bbox"]
+        # The y-direction ratio is checked as well because a line can be tall while the
+        # equation is narrow, in which case the area test alone does not locate the line.
+        on_this_line = (
+            _is_xin(eqinfo["bbox"], line_bbox)
+            or __y_overlap_ratio(eqinfo["bbox"], line_bbox) > 0.6
+        )
+        # Line heights reported for a pdf can be off, so a located line may still fail at
+        # span level; in that case keep trying the following lines.
+        if on_this_line and replace_line_v2(eqinfo, line):
+            return True
+    return False
+
+
+def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
+    """
+    Replace every inline equation in the text blocks it falls into.
+
+    For each equation the blocks are tried in order; a warning is logged for every matching
+    block in which the replacement fails, and the search stops at the first success.
+    raw_text_blocks is modified in place and returned.
+    """
+    for eqinfo in inline_equation_bboxes:
+        eqbox = eqinfo["bbox"]
+        for blk in raw_text_blocks:
+            if not _is_xin(eqbox, blk["bbox"]):
+                continue
+            if replace_eq_blk(eqinfo, blk):
+                break
+            logger.warning(f"行内公式没有替换成功：{eqinfo} ")
+
+    return raw_text_blocks
+
+
+def remove_chars_in_text_blocks(text_blocks):
+    """Strip the "chars" entry from every span of text_blocks (in place) and return the list."""
+    every_span = (
+        span
+        for blk in text_blocks
+        for line in blk["lines"]
+        for span in line["spans"]
+    )
+    for span in every_span:
+        # spans without "chars" are left untouched
+        span.pop("chars", None)
+    return text_blocks
+
+
+def replace_equations_in_textblock(
+    raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes
+):
+    """
+    Replace interline and inline equations in the text blocks with their latex.
+
+    Steps, in order: drop the blocks lying inside an interline equation, strip the text that
+    partly overlaps one, append a fabricated block per interline equation, then replace the
+    inline equations.
+    """
+    # de-overlap, step 1: blocks inside an equation
+    blocks = remove_text_block_in_interline_equation_bbox(
+        interline_equation_bboxes, raw_text_blocks
+    )
+    # de-overlap, step 2: text covered by an equation
+    blocks = remove_text_block_overlap_interline_equation_bbox(
+        interline_equation_bboxes, blocks
+    )
+    insert_interline_equations_textblock(interline_equation_bboxes, blocks)
+    return replace_inline_equations(inline_equation_bboxes, blocks)
+
+
+def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
+    """
+    Debug helper: run the equation replacement on every page and draw the span boxes on a copy of the pdf.
+
+    :param json_path: path of a utf-8 json file; for page i the code reads
+        obj["page_<i>"]["inline_equations"], ["interline_equations"] and ["preproc_blocks"].
+    :param pdf_path: path of the source pdf.
+    :return: path of the written pdf (next to pdf_path, with a ".step3-..." suffix).
+
+    Side effects: deletes an existing output pdf, writes the new one, and dumps the json
+    object (whose block lists are modified in place by the replacement steps) to
+    "equations_test/final_json.json".
+    """
+    new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf"
+    with open(json_path, "r", encoding="utf-8") as f:
+        obj = json.loads(f.read())
+
+    if os.path.exists(new_pdf):
+        os.remove(new_pdf)
+
+    # One handle is enough; it is closed even when drawing or saving fails.
+    new_doc = fitz.open(pdf_path)
+    try:
+        for page_idx in range(len(new_doc)):
+            page = new_doc[page_idx]
+            page_obj = obj[f"page_{page_idx}"]
+            inline_equation_bboxes = page_obj["inline_equations"]
+            interline_equation_bboxes = page_obj["interline_equations"]
+            raw_text_blocks = page_obj["preproc_blocks"]
+            raw_text_blocks = remove_text_block_in_interline_equation_bbox(
+                interline_equation_bboxes, raw_text_blocks
+            )  # de-overlap, step 1: blocks inside an equation
+            raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(
+                interline_equation_bboxes, raw_text_blocks
+            )  # de-overlap, step 2: text covered by an equation
+            insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
+            raw_text_blocks = replace_inline_equations(
+                inline_equation_bboxes, raw_text_blocks
+            )
+
+            # Draw every span box; the fill colour depends on the span's "_type".
+            # NOTE(review): nothing visible in this module sets "_type" (equation spans use
+            # "type"), so the fill may always be None - confirm which key was intended.
+            for blk in raw_text_blocks:
+                for line in blk["lines"]:
+                    for span in line["spans"]:
+                        shape_page = page.new_shape()
+                        span_type = span.get("_type")
+                        if span_type == "first":
+                            color = fitz.pdfcolor["blue"]
+                        elif span_type == "tail":
+                            color = fitz.pdfcolor["green"]
+                        elif span_type == TYPE_INLINE_EQUATION:
+                            color = fitz.pdfcolor["black"]
+                        else:
+                            color = None
+
+                        shape_page.draw_rect(span["bbox"])
+                        shape_page.finish(color=None, fill=color, fill_opacity=0.3)
+                        shape_page.commit()
+
+        new_doc.save(new_pdf)
+    finally:
+        new_doc.close()
+
+    logger.info(f"save ok {new_pdf}")
+    final_json = json.dumps(obj, ensure_ascii=False, indent=2)
+    # ensure_ascii=False emits non-ASCII text, so the encoding must not depend on the platform
+    os.makedirs("equations_test", exist_ok=True)
+    with open("equations_test/final_json.json", "w", encoding="utf-8") as f:
+        f.write(final_json)
+
+    return new_pdf
+
+
+if __name__ == "__main__":
+    # Running this module as a script is a no-op; the commented call below shows how the
+    # debug drawing helper was invoked.
+    # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
+    pass
--- a/magic_pdf/pre_proc/fix_image.py
+++ b/magic_pdf/pre_proc/fix_image.py
+
+
+
+import re    
+from magic_pdf.libs.boxbase import  _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
+
+from magic_pdf.libs.textbase import get_text_block_base_info
+
+def fix_image_vertical(image_bboxes:list, text_blocks:list):
+    """
+    Shrink image boxes vertically so they no longer cut into text blocks.
+
+    An image box is adjusted only when it partly overlaps a text block and, in the x
+    direction, one of the two spans the other. Text reaching above the image moves the image's
+    top edge to just below the text; otherwise text reaching below the image moves the
+    image's bottom edge to just above the text. The boxes are modified in place and the
+    list is returned.
+    """
+    for image_bbox in image_bboxes:
+        for text_block in text_blocks:
+            text_bbox = text_block["bbox"]
+            if not _is_part_overlap(text_bbox, image_bbox):
+                continue
+
+            text_within_image = text_bbox[0] >= image_bbox[0] and text_bbox[2] <= image_bbox[2]
+            image_within_text = text_bbox[0] <= image_bbox[0] and text_bbox[2] >= image_bbox[2]
+            if not (text_within_image or image_within_text):
+                continue
+
+            if text_bbox[1] < image_bbox[1]:  # text sticks out above the image
+                image_bbox[1] = text_bbox[3] + 1
+            elif text_bbox[3] > image_bbox[3]:  # text sticks out below the image
+                image_bbox[3] = text_bbox[1] - 1
+
+    return image_bboxes
+
+def __merge_if_common_edge(bbox1, bbox2):
+    """
+    Return the union box of two boxes that share an edge coordinate, or None.
+
+    The boxes are merged when they have the same top or bottom y and their x-ranges
+    intersect, or the same left or right x and their y-ranges intersect.
+    """
+    left1, top1, right1, bottom1 = bbox1
+    left2, top2, right2, bottom2 = bbox2
+
+    # common horizontal edge, with intersecting x-ranges
+    horizontal_match = (top1 == top2 or bottom1 == bottom2) and (
+        max(left1, left2) <= min(right1, right2)
+    )
+    # common vertical edge, with intersecting y-ranges
+    vertical_match = (left1 == left2 or right1 == right2) and (
+        max(top1, top2) <= min(bottom1, bottom2)
+    )
+
+    if horizontal_match or vertical_match:
+        return [min(left1, left2), min(top1, top2), max(right1, right2), max(bottom1, bottom2)]
+
+    # no common edge
+    return None
+
+def fix_seperated_image(image_bboxes:list):
+    """
+    Merge pairs of image boxes that share an edge.
+
+    Every box is compared with the boxes after it and merged with the first one it shares
+    an edge with. The result lists the merged boxes first, followed by the boxes that took
+    part in no merge.
+    """
+    merged_boxes = []
+    used_indices = set()
+    total = len(image_bboxes)
+
+    # NOTE(review): a box that was already merged is still compared with later boxes, so it
+    # can contribute to more than one merged box - confirm that this is intended.
+    for i in range(total):
+        for j in range(i + 1, total):
+            union_box = __merge_if_common_edge(image_bboxes[i], image_bboxes[j])
+            if union_box is None:
+                continue
+            merged_boxes.append(union_box)
+            used_indices.update((i, j))
+            break
+
+    untouched = [box for idx, box in enumerate(image_bboxes) if idx not in used_indices]
+    return merged_boxes + untouched
+
+
+def __check_img_title_pattern(text):
+    """
+    Tell whether a piece of text looks like an image title.
+
+    After stripping surrounding whitespace the text must start, case-insensitively, with
+    "fig", "figure" or "scheme".
+    """
+    patterns = [r"^(fig|figure).*", r"^(scheme).*"]
+    stripped = text.strip()
+    return any(
+        re.match(pattern, stripped, re.IGNORECASE) is not None
+        for pattern in patterns
+    )
+
+def __get_fig_caption_text(text_block):
+    """
+    Return the text of a block and its number of lines.
+
+    The text is the space-joined "text" of all spans, with every "Ž . " sequence removed.
+    """
+    lines = text_block['lines']
+    pieces = [span['text'] for line in lines for span in line['spans']]
+    caption = " ".join(pieces).replace("Ž . ", '')
+    return caption, len(lines)
+
+
+def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
+    """
+    Extend an image box over its caption, following the caption downwards.
+
+    text_block is the caption found so far; it may be incomplete when a multi-line caption
+    was split over several pymu blocks. The nearest block below is added repeatedly as long
+    as its base font colour, size and type equal those of text_block. image_box is then
+    grown in place to cover the combined caption box, and text_block is flagged with
+    "_image_caption".
+    """
+    caption_box = list(text_block['bbox'])
+    base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block)
+
+    while True:
+        below = find_bottom_nearest_text_bbox(pymu_blocks, caption_box)
+        if not below:
+            break
+
+        font_color, font_size, font_type = get_text_block_base_info(below)
+        same_style = (
+            font_color == base_font_color
+            and font_size == base_font_size
+            and font_type == base_font_type
+        )
+        if not same_style:
+            break
+
+        below_bbox = below['bbox']
+        caption_box[0] = min(caption_box[0], below_bbox[0])
+        caption_box[2] = max(caption_box[2], below_bbox[2])
+        caption_box[3] = below_bbox[3]
+
+    # grow the image box to the union of itself and the caption box
+    for idx, pick in ((0, min), (1, min), (2, max), (3, max)):
+        image_box[idx] = pick(image_box[idx], caption_box[idx])
+    text_block['_image_caption'] = True
+        
+
+def include_img_title(pymu_blocks, image_bboxes: list):
+    """
+    Grow every image bbox so that it also covers the image's caption block.
+
+    For each image the nearest text block below and above is probed, at most
+    three steps per direction. Blocks without text, and blocks with fewer than
+    three lines that do not look like a caption, are stepped over. A block
+    counts as a caption when ``__check_img_title_pattern`` accepts its text.
+    If a caption exists on both sides, the one closer to the image is used.
+    If neither side yields a caption, the nearest block to the left and then
+    to the right is tried once each.
+
+    :param pymu_blocks: text blocks handed to the ``find_*_nearest_text_bbox`` helpers
+    :param image_bboxes: list of mutable ``[x0, y0, x1, y1]`` boxes, widened in place
+    :return: the same ``image_bboxes`` list
+
+    Side effect: a block consumed as a caption gets ``'_image_caption'`` set
+    to ``True`` so that it is not claimed a second time.
+    """
+
+    def _absorb(box, block):
+        # Widen ``box`` in place to the union of itself and the block's bbox.
+        box[0] = min(box[0], block['bbox'][0])
+        box[1] = min(box[1], block['bbox'][1])
+        box[2] = max(box[2], block['bbox'][2])
+        box[3] = max(box[3], block['bbox'][3])
+
+    for tb in image_bboxes:
+        # Probe downwards first.
+        max_find_cnt = 3  # look at no more than 3 blocks per direction
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
+            if not text_block_btn:
+                break
+            txt, line_cnt = __get_fig_caption_text(text_block_btn)
+            # Stop on a caption or on a block with 3+ lines. Shorter non-caption
+            # blocks are skipped: they are sub-titles, or text under the picture
+            # that the image detector failed to include in the image.
+            if len(txt.strip())>0 and (__check_img_title_pattern(txt) or line_cnt>=3):
+                break
+            temp_box[3] = text_block_btn['bbox'][3]  # same width, extend downwards
+            max_find_cnt = max_find_cnt - 1
+
+        # Then probe upwards with the same rules.
+        max_find_cnt = 3
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
+            if not text_block_top:
+                break
+            txt, line_cnt = __get_fig_caption_text(text_block_top)
+            if len(txt.strip())>0 and (__check_img_title_pattern(txt) or line_cnt>=3):
+                break
+            temp_box[1] = text_block_top['bbox'][1]  # same width, extend upwards
+            max_find_cnt = max_find_cnt - 1
+
+        if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
+            btn_text, _ = __get_fig_caption_text(text_block_btn)
+            top_text, _ = __get_fig_caption_text(text_block_top)
+            if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
+                # Captions on both sides: take the one closest to the image.
+                btn_text_distance = text_block_btn['bbox'][1] - tb[3]
+                top_text_distance = tb[1] - text_block_top['bbox'][3]
+                if btn_text_distance<top_text_distance:  # caption is below
+                    __find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
+                else:
+                    _absorb(tb, text_block_top)
+                    # Flag the block that was actually merged (the top one);
+                    # the bottom caption stays available for another image.
+                    text_block_top['_image_caption'] = True
+                continue
+
+        text_block = text_block_btn
+        if text_block and text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(text_block)
+            if __check_img_title_pattern(first_text_line):
+                # Caption found: keep collecting blocks in the same direction
+                # that share its colour, size and font.
+                __find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
+                continue
+
+        text_block = text_block_top
+        if text_block  and text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(text_block)
+            if __check_img_title_pattern(first_text_line):
+                _absorb(tb, text_block)
+                text_block['_image_caption'] = True
+                continue
+
+        # Look left and right; for now only a single step each.
+        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
+        if left_text_block and left_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(left_text_block)
+            if __check_img_title_pattern(first_text_line):
+                _absorb(tb, left_text_block)
+                left_text_block['_image_caption'] = True
+                continue
+
+        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
+        if right_text_block and right_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(right_text_block)
+            if __check_img_title_pattern(first_text_line):
+                _absorb(tb, right_text_block)
+                right_text_block['_image_caption'] = True
+                continue
+
+    return image_bboxes
+
+
+def combine_images(image_bboxes:list):
+    """
+    Merge image boxes that overlap.
+
+    Whenever ``_is_in_or_part_overlap`` reports that a later box overlaps an
+    earlier one, the earlier box is widened in place to the union of both and
+    the later box is dropped.
+
+    :param image_bboxes: list of mutable ``[x0, y0, x1, y1]`` boxes
+    :return: a new list holding the boxes that were not dropped
+    """
+    absorbed = []
+    total = len(image_bboxes)
+
+    for keep_idx in range(total):
+        keeper = image_bboxes[keep_idx]
+        for other_idx in range(keep_idx + 1, total):
+            if other_idx in absorbed:
+                continue
+            other = image_bboxes[other_idx]
+            if not _is_in_or_part_overlap(keeper, other):
+                continue
+            # Compute the union first, then write it back into the kept box.
+            union = (
+                min(keeper[0], other[0]),
+                min(keeper[1], other[1]),
+                max(keeper[2], other[2]),
+                max(keeper[3], other[3]),
+            )
+            keeper[0], keeper[1], keeper[2], keeper[3] = union
+            absorbed.append(other_idx)
+
+    return [box for idx, box in enumerate(image_bboxes) if idx not in absorbed]
\ No newline at end of file
--- a/magic_pdf/pre_proc/fix_table.py
+++ b/magic_pdf/pre_proc/fix_table.py
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+import re
+
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox             # json
+
+
+## version 2
+def get_merged_line(page):
+    """
+    Pick the horizontal rules out of a page's vector drawings and stitch
+    broken segments of the same rule back together.
+
+    :param page: current page as read by fitz; only ``page.get_drawings()`` is used
+    :return: list of ``(L, U, R, D)`` tuples, one per merged horizontal line.
+             ``U`` and ``D`` are taken from the left-most segment of the
+             height group the line belongs to.
+    """
+    # Integer bounding rectangle (L, U, R, D) of every vector drawing.
+    drawings_bbox = [p["rect"].irect for p in page.get_drawings()]
+
+    # A drawing at most 3 units tall is treated as a horizontal rule.
+    lines = [(L, U, R, D) for L, U, R, D in drawings_bbox if abs(D - U) <= 3]
+
+    # Put rules of (nearly) the same height into one group.
+    U_groups = []
+    visited = [False for _ in range(len(lines))]
+    for i, (L1, U1, R1, D1) in enumerate(lines):
+        if visited[i]:
+            continue
+        tmp_g = [(L1, U1, R1, D1)]
+        for j, (L2, U2, R2, D2) in enumerate(lines):
+            if i == j or visited[j]:
+                continue
+            if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5:
+                tmp_g.append((L2, U2, R2, D2))
+                visited[j] = True
+        U_groups.append(tmp_g)
+
+    res = []
+    for group in U_groups:
+        group.sort(key = lambda LURD: (LURD[0], LURD[2]))
+        LL, UU, RR, DD = group[0]
+        for L1, U1, R1, D1 in group[1:]:
+            if (L1 - RR) >= 5:
+                # Gap of 5+ units: close the current line and start a new one.
+                # Both ends must be reset, otherwise the new line keeps the
+                # previous right edge and ends up with R < L.
+                res.append((LL, UU, RR, DD))
+                LL, RR = L1, R1
+            else:
+                RR = max(RR, R1)
+        res.append((LL, UU, RR, DD))
+    return res
+
+def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
+    """
+    Snap table boxes to nearby horizontal rules and optionally pull in the
+    table title found above the table.
+
+    :param page: current page as read by fitz
+    :param table_bboxes: list whose elements are ``(L, U, R, D)`` tuples
+    :param include_table_title: whether the table title should be enclosed too
+    :param scan_line_num: how many of the text blocks nearest above the table
+                          are scanned for a title
+    :return: list of ``[L-2, U-2, R+2, D+2]`` boxes, one per input table
+    """
+
+    drawings_lines = get_merged_line(page)
+    fix_table_bboxes = []
+    # The page text does not depend on the table, so extract it once instead
+    # of once per table.
+    text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] if include_table_title else []
+
+    for table in table_bboxes:
+        (L, U, R, D) = table
+        fix_table_L = []
+        fix_table_U = []
+        fix_table_R = []
+        fix_table_D = []
+        width = R - L
+        width_range = width * 0.1 # only lines within 10% of the table width count
+        height = D - U
+        height_range = height * 0.1 # only lines within 10% of the table height count
+        for line in drawings_lines:
+            if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # similar width
+                if (U - height_range) < line[1] < (U + height_range): # close to the top border
+                    fix_table_U.append(line[1])
+                    fix_table_L.append(line[0])
+                    fix_table_R.append(line[2])
+                elif (D - height_range) < line[1] < (D + height_range): # close to the bottom border
+                    fix_table_D.append(line[1])
+                    fix_table_L.append(line[0])
+                    fix_table_R.append(line[2])
+
+        if fix_table_U:
+            U = min(fix_table_U)
+        if fix_table_D:
+            D = max(fix_table_D)
+        if fix_table_L:
+            L = min(fix_table_L)
+        if fix_table_R:
+            R = max(fix_table_R)
+
+        if include_table_title:   # the table title has to be enclosed
+            # Drop text that lies entirely left or right of the table (e.g. the other column).
+            incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))]
+            # Keep the blocks whose bottom edge is above the table's top edge.
+            upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0]
+            # Nearest to the table first; at equal distance, left before right.
+            sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0]))
+
+            for candidate in sorted_filtered_text_blocks[:max(scan_line_num, 0)]:
+                line_temp = candidate['lines']
+                # Skip blocks without a first span instead of raising IndexError.
+                if not line_temp or not line_temp[0]['spans']:
+                    continue
+                text = line_temp[0]['spans'][0]['text'] # text of the very first span
+                check_en = re.match('Table', text) # English title: starts with "Table"
+                check_ch = re.match('表', text) # Chinese title: starts with the Chinese word for "table"
+                if check_en or check_ch:
+                    if candidate['bbox'][1] < D: # guards against a negative-height bbox
+                        U = candidate['bbox'][1]
+
+        fix_table_bboxes.append([L-2, U-2, R+2, D+2])
+
+    return fix_table_bboxes
+
+def __check_table_title_pattern(text):
+    """
+    Tell whether a piece of text looks like a table title.
+
+    :param text: text to test; patterns are matched case-insensitively from its start
+    :return: True when any title pattern matches, otherwise False
+    """
+    patterns = [r'^table\s\d+']
+
+    # Every pattern gets a chance: returning False from inside the loop would
+    # stop after the first pattern as soon as a second one is added.
+    return any(re.match(pattern, text, re.IGNORECASE) for pattern in patterns)
+         
+         
+def fix_table_text_block(pymu_blocks, table_bboxes: list):
+    """
+    Adjust tables against the text blocks they touch.
+
+    A non-title text block that overlaps a table is swallowed: the table box
+    grows to cover it and the block is flagged with ``'_table'``.
+    A table title that only partially overlaps the table is instead trimmed so
+    that it no longer overlaps.
+    Example: tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
+
+    :param pymu_blocks: text blocks with 'bbox' and 'lines' entries
+    :param table_bboxes: list of mutable ``[L, U, R, D]`` boxes, adjusted in place
+    :return: the same ``table_bboxes`` list
+    """
+    for tb in table_bboxes:
+        (L, U, R, D) = tb
+        for block in pymu_blocks:
+            if not _is_in_or_part_overlap((L, U, R, D), block['bbox']):
+                continue
+            txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
+            is_title = __check_table_title_pattern(txt)
+            # Titles are not merged here. A later step handles all titles at
+            # once, and merging now could make that step attach this table to
+            # the title of another table when two tables follow each other.
+            if not is_title and block.get("_table", False) is False:
+                tb[0] = min(tb[0], block['bbox'][0])
+                tb[1] = min(tb[1], block['bbox'][1])
+                tb[2] = max(tb[2], block['bbox'][2])
+                tb[3] = max(tb[3], block['bbox'][3])
+                block['_table'] = True # placeholder so no other table claims it
+
+            # A title that partially overlaps the table is trimmed so the two no longer overlap.
+            if is_title and _is_part_overlap(tb, block['bbox']):
+                block['bbox'] = list(block['bbox'])
+                # Trim only the edge that reaches into the table. Adjusting
+                # both edges unconditionally produces an inverted box
+                # (top = D+1, bottom = U-1).
+                if block['bbox'][1] < U:
+                    block['bbox'][3] = U-1   # title sticks out above: pull its bottom up
+                elif block['bbox'][3] > D:
+                    block['bbox'][1] = D+1   # title sticks out below: push its top down
+
+    return table_bboxes
+
+
+def __get_table_caption_text(text_block):
+    """
+    Flatten a text block into one string.
+
+    :param text_block: dict with a 'lines' list; every line has 'spans', every span a 'text'
+    :return: ``(text, line_count)`` where text is all span texts joined by
+             single spaces with each "Ž . " sequence removed, and line_count
+             is the number of lines in the block
+    """
+    block_lines = text_block['lines']
+    pieces = []
+    for line in block_lines:
+        for span in line['spans']:
+            pieces.append(span['text'])
+    return " ".join(pieces).replace("Ž . ", ''), len(block_lines)
+
+
+def include_table_title(pymu_blocks, table_bboxes: list):
+    """
+    Extend every table bbox so that it also covers the table's title.
+
+    For each table the nearest text block above and below is probed, at most
+    three steps per direction. Blocks without text, and blocks with fewer than
+    three lines that do not look like a title, are stepped over. A block counts
+    as a title when ``__check_table_title_pattern`` accepts its text. If a
+    title exists on both sides, the closer one is used. If neither side yields
+    a title, the nearest block to the left and then to the right is tried once
+    each.
+
+    :param pymu_blocks: text blocks handed to the ``find_*_nearest_text_bbox`` helpers
+    :param table_bboxes: list of mutable ``[x0, y0, x1, y1]`` boxes, widened in place
+    :return: the same ``table_bboxes`` list
+
+    Side effect: a block merged from above or below is flagged with
+    ``'_table_caption'``; one merged from the left or right is flagged with
+    ``'_image_caption'``.
+    """
+
+    def _absorb(box, block):
+        # Widen ``box`` in place to the union of itself and the block's bbox.
+        box[0] = min(box[0], block['bbox'][0])
+        box[1] = min(box[1], block['bbox'][1])
+        box[2] = max(box[2], block['bbox'][2])
+        box[3] = max(box[3], block['bbox'][3])
+
+    for tb in table_bboxes:
+        max_find_cnt = 3 # probe upwards at most 3 times
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
+            if not text_block_top:
+                break
+            txt, line_cnt = __get_table_caption_text(text_block_top)
+            # Stop on a title or on a block with 3+ lines; step over the rest.
+            if len(txt.strip())>0 and (__check_table_title_pattern(txt) or line_cnt>=3):
+                break
+            temp_box[1] = text_block_top['bbox'][1] # same width, extend upwards
+            max_find_cnt = max_find_cnt - 1
+
+        max_find_cnt = 3 # probe downwards at most 3 times
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
+            if not text_block_bottom:
+                break
+            txt, line_cnt = __get_table_caption_text(text_block_bottom)
+            if len(txt.strip())>0 and (__check_table_title_pattern(txt) or line_cnt>=3):
+                break
+            temp_box[3] = text_block_bottom['bbox'][3] # same width, extend downwards
+            max_find_cnt = max_find_cnt - 1
+
+        if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
+            btn_text, _ = __get_table_caption_text(text_block_bottom)
+            top_text, _ = __get_table_caption_text(text_block_top)
+            if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # a table caption on both sides
+                # Take the closer one.
+                btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
+                top_text_distance = tb[1] - text_block_top['bbox'][3]
+                text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
+                _absorb(tb, text_block)
+                # Flag the block that was actually merged, so the other
+                # caption stays available for a neighbouring table.
+                text_block['_table_caption'] = True
+                continue
+
+        # Otherwise try the block above on its own, then the block below.
+        text_block = text_block_top
+        if text_block and text_block.get("_table_caption", False) is False:
+            first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
+            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
+                _absorb(tb, text_block)
+                text_block['_table_caption'] = True
+                continue
+
+        text_block = text_block_bottom
+        if text_block and text_block.get("_table_caption", False) is False:
+            first_text_line, _ = __get_table_caption_text(text_block)
+            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
+                _absorb(tb, text_block)
+                text_block['_table_caption'] = True
+                continue
+
+        # Look left and right; for now only a single step each.
+        # NOTE(review): these two branches read and write '_image_caption',
+        # not '_table_caption' - looks like a copy from the image code;
+        # confirm before changing, other stages may rely on the flag.
+        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
+        if left_text_block and left_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_table_caption_text(left_text_block)
+            if __check_table_title_pattern(first_text_line):
+                _absorb(tb, left_text_block)
+                left_text_block['_image_caption'] = True
+                continue
+
+        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
+        if right_text_block and right_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_table_caption_text(right_text_block)
+            if __check_table_title_pattern(first_text_line):
+                _absorb(tb, right_text_block)
+                right_text_block['_image_caption'] = True
+                continue
+
+    return table_bboxes
\ No newline at end of file
--- a/magic_pdf/pre_proc/main_text_font.py
+++ b/magic_pdf/pre_proc/main_text_font.py
+import collections
+
+
+def get_main_text_font(pdf_docs):
+    """
+    Find the font that carries the most characters in a document.
+
+    :param pdf_docs: iterable of pages; each page must offer ``get_text('dict')``
+                     returning a dict with a 'blocks' entry
+    :return: name of the font with the highest total character count, or
+             ``None`` when no span with a font and non-empty text exists
+    """
+    font_names = collections.Counter()
+    for page in pdf_docs:
+        blocks = page.get_text('dict')['blocks']
+        if blocks is None:
+            continue
+        for block in blocks:
+            lines = block.get('lines')
+            if lines is None:
+                continue
+            for line in lines:
+                for span in line['spans']:
+                    # The main font is the one with the most characters, so
+                    # weight by text length rather than counting spans.
+                    if 'font' in span and len(span['text']) > 0:
+                        font_names[span['font']] += len(span['text'])
+    # A document without any text span used to raise IndexError here.
+    if not font_names:
+        return None
+    return font_names.most_common(1)[0][0]
+
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+from loguru import logger
+
+from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
+    calculate_iou, calculate_vertical_projection_overlap_ratio
+from magic_pdf.libs.drop_tag import DropTag
+from magic_pdf.libs.ocr_content_type import BlockType
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
+
+
+def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
+                                        title_blocks, interline_equation_blocks, page_w, page_h):
+    """
+    Turn the detected blocks of a page into flat bbox rows and resolve the
+    conflicts between them before the layout is split.
+
+    Every row is a 13-item list: ``x0, y0, x1, y1``, three ``None``, the block
+    type, four ``None``, and the block's score.
+
+    :return: ``(all_bboxes, all_discarded_blocks, drop_reasons)``
+    """
+
+    def _row(src, kind):
+        x0, y0, x1, y1 = src['bbox']
+        return [x0, y0, x1, y1, None, None, None, kind, None, None, None, None, src["score"]]
+
+    typed_sources = (
+        (img_blocks, BlockType.Image),
+        (table_blocks, BlockType.Table),
+        (text_blocks, BlockType.Text),
+        (title_blocks, BlockType.Title),
+        (interline_equation_blocks, BlockType.InterlineEquation),
+    )
+    all_bboxes = [_row(item, kind) for items, kind in typed_sources for item in items]
+
+    # --- nested / conflicting blocks ---
+    # Text box overlapping a title box: trust the text box.
+    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
+    # Any box overlapping a discarded box: trust the discarded box.
+    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
+
+    # An interline equation can clash with a title/text box in two ways.
+    # 1) IoU with a text box close to 1: trust the equation box.
+    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
+    # 2) Equation inside a much larger text box: trust the text box; the
+    #    equation box is removed by the big-box-contains-small-box step below.
+
+    # Every discarded block is recorded. Those wider than 1/3 of the page,
+    # taller than 10 and starting in the lower half of the page are also
+    # added to all_bboxes as footnotes so that they take part in the layout.
+    all_discarded_blocks = []
+    for dropped in discarded_blocks:
+        all_discarded_blocks.append(_row(dropped, BlockType.Discarded))
+        x0, y0, x1, y1 = dropped['bbox']
+        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
+            all_bboxes.append(_row(dropped, BlockType.Footnote))
+
+    # Boxes still nested inside bigger ones after the steps above: remove the small ones.
+    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
+    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
+    # Separate the remaining boxes so the later layout split does not fail.
+    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+
+    return all_bboxes, all_discarded_blocks, drop_reasons
+
+
+def add_bboxes(blocks, block_type, bboxes):
+    """
+    Append one flat bbox row per block to ``bboxes``.
+
+    A row holds ``x0, y0, x1, y1``, three ``None``, ``block_type``, four
+    ``None`` and the block's score. For image/table body, caption and
+    footnote types the block's ``group_id`` is appended as a 14th item.
+
+    :param blocks: iterable of dicts with 'bbox' and 'score' (plus 'group_id' for grouped types)
+    :param block_type: type tag stored at index 7 of every row
+    :param bboxes: list that receives the rows (modified in place)
+    """
+    for item in blocks:
+        x0, y0, x1, y1 = item['bbox']
+        row = [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, item["score"]]
+        if block_type in (
+            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
+            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote,
+        ):
+            row.append(item["group_id"])
+        bboxes.append(row)
+
+
+def ocr_prepare_bboxes_for_layout_split_v2(
+        img_body_blocks, img_caption_blocks, img_footnote_blocks,
+        table_body_blocks, table_caption_blocks, table_footnote_blocks,
+        discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
+):
+    """
+    Build the flat bbox rows of a page (image/table parts kept as separate
+    types) and resolve conflicts between them before the layout is split.
+
+    :return: ``(all_bboxes, all_discarded_blocks)``
+    """
+    all_bboxes = []
+    for items, kind in (
+        (img_body_blocks, BlockType.ImageBody),
+        (img_caption_blocks, BlockType.ImageCaption),
+        (img_footnote_blocks, BlockType.ImageFootnote),
+        (table_body_blocks, BlockType.TableBody),
+        (table_caption_blocks, BlockType.TableCaption),
+        (table_footnote_blocks, BlockType.TableFootnote),
+        (text_blocks, BlockType.Text),
+        (title_blocks, BlockType.Title),
+        (interline_equation_blocks, BlockType.InterlineEquation),
+    ):
+        add_bboxes(items, kind, all_bboxes)
+
+    # --- nested / conflicting blocks ---
+    # Text box overlapping a title box: trust the text box.
+    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
+    # Any box overlapping a discarded box: trust the discarded box.
+    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
+
+    # An interline equation can clash with a title/text box in two ways.
+    # 1) IoU with a text box close to 1: trust the equation box.
+    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
+    # 2) Equation inside a much larger text box: trust the text box; the
+    #    equation box is removed by the big-box-contains-small-box step below.
+
+    # Discarded blocks.
+    all_discarded_blocks = []
+    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
+
+    # Footnotes: discarded blocks wider than 1/3 of the page, taller than 10
+    # and starting in the lower half of the page.
+    footnote_blocks = []
+    for dropped in discarded_blocks:
+        x0, y0, x1, y1 = dropped['bbox']
+        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
+            footnote_blocks.append([x0, y0, x1, y1])
+
+    # Anything that sits below a footnote moves to the discarded list.
+    for under in find_blocks_under_footnote(all_bboxes, footnote_blocks):
+        all_bboxes.remove(under)
+        all_discarded_blocks.append(under)
+
+    # Boxes still nested inside bigger ones after the steps above: remove the small ones.
+    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
+    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
+    # Separate the remaining boxes so the later layout split does not fail.
+    # The drop reasons are computed but not returned by this variant.
+    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+
+    return all_bboxes, all_discarded_blocks
+
+
+def find_blocks_under_footnote(all_bboxes, footnote_blocks):
+    """
+    Collect the blocks that lie underneath a footnote.
+
+    A block qualifies when its top edge is at or below the footnote's bottom
+    edge and ``calculate_vertical_projection_overlap_ratio`` of the block
+    against the footnote is at least 0.8.
+
+    :param all_bboxes: rows whose first four items are ``x0, y0, x1, y1``
+    :param footnote_blocks: list of ``[x0, y0, x1, y1]`` footnote boxes
+    :return: list of the qualifying rows (no equal row is listed twice)
+    """
+    flagged = []
+    for candidate in all_bboxes:
+        left, top, right, bottom = candidate[:4]
+        for footnote_bbox in footnote_blocks:
+            _foot_x0, _foot_y0, _foot_x1, foot_bottom = footnote_bbox
+            if not (top >= foot_bottom):
+                continue
+            ratio = calculate_vertical_projection_overlap_ratio((left, top, right, bottom), footnote_bbox)
+            if not (ratio >= 0.8):
+                continue
+            if candidate not in flagged:
+                flagged.append(candidate)
+                break
+    return flagged
+
+
+def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
+    """
+    Remove text blocks that nearly coincide with an interline equation block.
+
+    A text row is removed from ``all_bboxes`` when ``calculate_iou`` between
+    its bbox and the bbox of any interline-equation row exceeds 0.8.
+
+    :param all_bboxes: rows with the bbox in items 0-3 and the block type at index 7
+    :return: the same list, modified in place
+    """
+    text_rows = [row for row in all_bboxes if row[7] == BlockType.Text]
+    equation_rows = [row for row in all_bboxes if row[7] == BlockType.InterlineEquation]
+
+    doomed = []
+    for equation in equation_rows:
+        for text in text_rows:
+            if calculate_iou(equation[:4], text[:4]) > 0.8 and text not in doomed:
+                doomed.append(text)
+
+    for text in doomed:
+        all_bboxes.remove(text)
+
+    return all_bboxes
+
+
+def fix_text_overlap_title_blocks(all_bboxes):
+    """
+    Remove title blocks that nearly coincide with a text block.
+
+    A title row is removed from ``all_bboxes`` when ``calculate_iou`` between
+    the bbox of any text row and its own bbox exceeds 0.8.
+
+    :param all_bboxes: rows with the bbox in items 0-3 and the block type at index 7
+    :return: the same list, modified in place
+    """
+    text_rows = [row for row in all_bboxes if row[7] == BlockType.Text]
+    title_rows = [row for row in all_bboxes if row[7] == BlockType.Title]
+
+    doomed = []
+    for text in text_rows:
+        for title in title_rows:
+            if calculate_iou(text[:4], title[:4]) > 0.8 and title not in doomed:
+                doomed.append(title)
+
+    for title in doomed:
+        all_bboxes.remove(title)
+
+    return all_bboxes
+
+
+def remove_need_drop_blocks(all_bboxes, discarded_blocks):
+    """
+    Remove every block that is mostly covered by a discarded block.
+
+    A row is removed when ``calculate_overlap_area_in_bbox1_area_ratio`` of
+    its bbox against the bbox of some discarded block exceeds 0.6.
+
+    :param all_bboxes: rows whose first four items are the bbox
+    :param discarded_blocks: dicts with a 'bbox' entry
+    :return: the same ``all_bboxes`` list, modified in place
+    """
+    doomed = []
+    for row in all_bboxes:
+        row_bbox = row[:4]
+        for dropped in discarded_blocks:
+            if not calculate_overlap_area_in_bbox1_area_ratio(row_bbox, dropped['bbox']) > 0.6:
+                continue
+            if row not in doomed:
+                doomed.append(row)
+                break
+
+    for row in doomed:
+        all_bboxes.remove(row)
+    return all_bboxes
+
+
+def remove_overlaps_min_blocks(all_bboxes):
+    """
+    Merge heavily overlapping blocks: the smaller block is removed and the
+    larger one is widened in place to the union of both.
+
+    :param all_bboxes: rows whose first four items are ``x0, y0, x1, y1``
+    :return: the same list, modified in place
+    """
+    #  For overlapping blocks the small one must not simply be deleted; it has
+    #  to be merged with the large one into an even larger block.
+    #  Delete the smaller ones among the overlapping blocks.
+    need_remove = []
+    for block1 in all_bboxes:
+        for block2 in all_bboxes:
+            # Compared by value, so two rows with identical content are never paired.
+            if block1 != block2:
+                block1_bbox = block1[:4]
+                block2_bbox = block2[:4]
+                # NOTE(review): presumably returns the bbox of the smaller box when the
+                # overlap ratio exceeds 0.8 and None otherwise - confirm in boxbase.
+                overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
+                if overlap_box is not None:
+                    # First row whose bbox equals the returned box.
+                    block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
+                    if block_to_remove is not None and block_to_remove not in need_remove:
+                        large_block = block1 if block1 != block_to_remove else block2
+                        x1, y1, x2, y2 = large_block[:4]
+                        sx1, sy1, sx2, sy2 = block_to_remove[:4]
+                        x1 = min(x1, sx1)
+                        y1 = min(y1, sy1)
+                        x2 = max(x2, sx2)
+                        y2 = max(y2, sy2)
+                        # Written back in place, so later iterations see the grown box.
+                        large_block[:4] = [x1, y1, x2, y2]
+                        need_remove.append(block_to_remove)
+
+    if len(need_remove) > 0:
+        for block in need_remove:
+            all_bboxes.remove(block)
+
+    return all_bboxes
--- a/magic_pdf/pre_proc/ocr_detect_layout.py
+++ b/magic_pdf/pre_proc/ocr_detect_layout.py
+import fitz
+
+from magic_pdf.layout.layout_sort import get_bboxes_layout
+from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+
+
+def get_center_point(bbox):
+    """
+    Compute the centre of a bounding box.
+    Args:
+        bbox (list): four coordinates - top-left x, top-left y, bottom-right x, bottom-right y.
+    Returns:
+        list: ``[x, y]`` of the centre point.
+    """
+    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
+    center_x = (left + right) / 2
+    center_y = (top + bottom) / 2
+    return [center_x, center_y]
+
+
+def get_area(bbox):
+    """
+    Compute the area of a bounding box.
+    Args:
+        bbox (list): four coordinates - top-left x, top-left y, bottom-right x, bottom-right y.
+    Returns:
+        float: width times height of the box.
+    """
+    width = bbox[2] - bbox[0]
+    height = bbox[3] - bbox[1]
+    return width * height
+
+
+def adjust_layouts(layout_bboxes, page_boundry, page_id):
+    """
+    Separate partially overlapping layout boxes, then sort them.
+
+    For each overlapping pair the larger box (by area) is shrunk in place so
+    that it ends one unit before the smaller one. The direction is chosen from
+    the offset between the two centres: horizontal when the x offset
+    dominates, vertical otherwise.
+
+    :param layout_bboxes: list of mutable ``[x0, y0, x1, y1]`` boxes
+    :param page_boundry: passed through to ``get_bboxes_layout``
+    :param page_id: passed through to ``get_bboxes_layout``
+    :return: ``(layout_bboxes, layout_tree)`` as produced by ``get_bboxes_layout``
+    """
+    count = len(layout_bboxes)
+    for first in range(count):
+        for second in range(first + 1, count):
+            box_a, box_b = layout_bboxes[first], layout_bboxes[second]
+            if not _is_part_overlap(box_a, box_b):
+                continue
+
+            # Decide which of the two is the larger box.
+            if get_area(box_a) > get_area(box_b):
+                big, small = box_a, box_b
+            else:
+                big, small = box_b, box_a
+
+            # Offset of the big box's centre relative to the small one's.
+            big_cx, big_cy = get_center_point(big)
+            small_cx, small_cy = get_center_point(small)
+            dx = big_cx - small_cx
+            dy = big_cy - small_cy
+
+            if abs(dx) > abs(dy):
+                # Side-by-side overlap: trim the big box's left or right edge.
+                if dx > 0 and big[0] < small[2]:
+                    big[0] = small[2]+1
+                if dx < 0 and big[2] > small[0]:
+                    big[2] = small[0]-1
+            else:
+                # Stacked overlap: trim the big box's top or bottom edge.
+                if dy > 0 and big[1] < small[3]:
+                    big[1] = small[3]+1
+                if dy < 0 and big[3] > small[1]:
+                    big[3] = small[1]-1
+
+    # Expand every box to the 13-item row format expected by the sorter.
+    new_bboxes = [[box[0], box[1], box[2], box[3]] + [None] * 9 for box in layout_bboxes]
+
+    layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
+
+    # Sorted and adjusted layout boxes plus the layout tree.
+    return layout_bboxes, layout_tree
+
+
+def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
+    """Extract layout boxes from model output and return the sorted layout.
+
+    Each sub-layout's ``'poly'`` is unpacked as eight numbers, of which items
+    0/1 and 4/5 are used as the two box corners. The corners are divided by
+    the ratios from ``get_scale_ratio`` and truncated to ``int``. Boxes for
+    which ``_is_in`` reports containment in another box are discarded, and the
+    rest are passed to ``adjust_layouts``.
+
+    Args:
+        layout_info (list): Sub-layout dicts, each carrying a ``'poly'`` entry.
+        page (fitz.Page): Page whose ``rect`` supplies the page boundary.
+        ocr_page_info (dict): Model page info; ``['page_info']['page_no']`` is
+            read and reduced by one to form the page id.
+
+    Returns:
+        tuple: ``(layout_bboxes, layout_tree)`` as returned by
+        ``adjust_layouts``.
+    """
+    page_id = ocr_page_info['page_info']['page_no'] - 1
+    h_ratio, v_ratio = get_scale_ratio(ocr_page_info, page)
+
+    # Convert every polygon into an integer box in page coordinates.
+    scaled_boxes = []
+    for sub_layout in layout_info:
+        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
+        scaled_boxes.append([
+            int(x0 / h_ratio),
+            int(y0 / v_ratio),
+            int(x1 / h_ratio),
+            int(y1 / v_ratio),
+        ])
+
+    # Keep only the boxes that are not contained in any other box.
+    outer_boxes = [
+        box
+        for idx, box in enumerate(scaled_boxes)
+        if not any(
+            _is_in(box, other)
+            for pos, other in enumerate(scaled_boxes)
+            if pos != idx
+        )
+    ]
+
+    page_boundry = [0, 0, page.rect.width, page.rect.height]
+    sorted_bboxes, layout_tree = adjust_layouts(outer_boxes, page_boundry, page_id)
+    return sorted_bboxes, layout_tree
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
+from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
+                                    _is_in_or_part_overlap_with_area_ratio,
+                                    calculate_overlap_area_in_bbox1_area_ratio)
+from magic_pdf.libs.drop_tag import DropTag
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+
+
+# Sort the spans inside every line from left to right
+def line_sort_spans_by_left_to_right(lines):
+    """Order each line's spans by x0 and wrap the line with its bounding box.
+
+    Args:
+        lines (list): Lines, each a list of span dicts with a ``'bbox'``.
+            Every inner list is sorted in place.
+
+    Returns:
+        list: One ``{'bbox': [x0, y0, x1, y1], 'spans': line}`` dict per line,
+        where the bbox is the union of the span boxes.
+    """
+    def left_edge(span):
+        return span['bbox'][0]
+
+    wrapped = []
+    for spans in lines:
+        spans.sort(key=left_edge)
+        boxes = [span['bbox'] for span in spans]
+        union_bbox = [
+            min(box[0] for box in boxes),  # x0
+            min(box[1] for box in boxes),  # y0
+            max(box[2] for box in boxes),  # x1
+            max(box[3] for box in boxes),  # y1
+        ]
+        wrapped.append({'bbox': union_bbox, 'spans': spans})
+    return wrapped
+
+
+def merge_spans_to_line(spans):
+    """Group spans into lines based on their vertical overlap.
+
+    ``spans`` is sorted in place by y0. A span joins the current line only
+    when neither it nor any span already in that line is an interline
+    equation, image or table, and it overlaps the line's last span on the
+    y axis according to ``__is_overlaps_y_exceeds_threshold`` (threshold 0.5).
+    Otherwise it opens a new line.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'`` and ``'type'``.
+
+    Returns:
+        list: Lines, each a list of spans; empty when ``spans`` is empty.
+    """
+    if len(spans) == 0:
+        return []
+
+    standalone_types = [
+        ContentType.InterlineEquation, ContentType.Image, ContentType.Table
+    ]
+
+    # Process spans from top to bottom.
+    spans.sort(key=lambda item: item['bbox'][1])
+
+    merged = []
+    pending = [spans[0]]
+    for span in spans[1:]:
+        starts_new_line = (
+            span['type'] in standalone_types
+            or any(member['type'] in standalone_types for member in pending)
+            or not __is_overlaps_y_exceeds_threshold(
+                span['bbox'], pending[-1]['bbox'], 0.5)
+        )
+        if starts_new_line:
+            merged.append(pending)
+            pending = [span]
+        else:
+            pending.append(span)
+
+    # The line still being built is never empty, so it is always flushed.
+    merged.append(pending)
+    return merged
+
+
+def merge_spans_to_line_by_layout(spans, layout_bboxes):
+    """Assign spans to layout regions and merge them into lines per region.
+
+    A span belongs to the first layout whose ``'layout_bbox'`` covers more
+    than 60% of the span's area (per
+    ``calculate_overlap_area_in_bbox1_area_ratio``). Claimed spans are removed
+    from ``spans``; whatever remains is tagged ``DropTag.NOT_IN_LAYOUT``.
+
+    Args:
+        spans (list): Span dicts; mutated in place (claimed spans removed).
+        layout_bboxes (list): Dicts carrying a ``'layout_bbox'`` entry.
+
+    Returns:
+        tuple: ``(lines, dropped_spans)`` where ``lines`` comes from
+        ``line_sort_spans_by_left_to_right``.
+    """
+    spans_per_layout = []
+    for item in layout_bboxes:
+        region = item['layout_bbox']
+        claimed = [
+            span for span in spans
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], region) > 0.6
+        ]
+        if len(claimed) > 0:
+            spans_per_layout.append(claimed)
+            # A span can only be claimed by one layout.
+            for span in claimed:
+                spans.remove(span)
+
+    raw_lines = []
+    for claimed in spans_per_layout:
+        raw_lines.extend(merge_spans_to_line(claimed))
+
+    # Order the spans inside every line from left to right.
+    lines = line_sort_spans_by_left_to_right(raw_lines)
+
+    # Everything left over did not fall into any layout region.
+    for span in spans:
+        span['tag'] = DropTag.NOT_IN_LAYOUT
+    dropped_spans = list(spans)
+
+    return lines, dropped_spans
+
+
+def merge_lines_to_block(lines):
+    """Wrap every line in its own block.
+
+    No real block merging happens yet: each block holds exactly one line and
+    reuses that line's bbox.
+
+    Args:
+        lines (list): Line dicts carrying a ``'bbox'`` entry.
+
+    Returns:
+        list: ``{'bbox': line['bbox'], 'lines': [line]}`` for every line.
+    """
+    return [{'bbox': line['bbox'], 'lines': [line]} for line in lines]
+
+
+def sort_blocks_by_layout(all_bboxes, layout_bboxes):
+    """Order blocks by layout region, then top to bottom inside each region.
+
+    A block (a sequence whose items 0..3 are the bbox and item 7 the block
+    type) is assigned to the first layout whose ``'layout_bbox'`` covers more
+    than 80% of the block's area. Footnote blocks are never assigned.
+    Assigned blocks are removed from ``all_bboxes``.
+
+    Args:
+        all_bboxes (list): Block rows; mutated in place.
+        layout_bboxes (list): Dicts carrying a ``'layout_bbox'`` entry.
+
+    Returns:
+        list: The assigned blocks, in layout order and sorted by y0 within
+        each layout.
+    """
+    blocks_per_layout = []
+    for item in layout_bboxes:
+        region = item['layout_bbox']
+        members = [
+            block for block in all_bboxes
+            if not (block[7] == BlockType.Footnote)
+            and calculate_overlap_area_in_bbox1_area_ratio(block[:4], region) > 0.8
+        ]
+        if len(members) > 0:
+            blocks_per_layout.append(members)
+            # A block can only be placed into one layout.
+            for block in members:
+                all_bboxes.remove(block)
+
+    ordered = []
+    for members in blocks_per_layout:
+        # Inside one layout, read the boxes from top to bottom.
+        members.sort(key=lambda row: row[1])
+        ordered.extend(members)
+
+    return ordered
+
+
+def fill_spans_in_blocks(blocks, spans, radio):
+    """Distribute spans into blocks according to their position.
+
+    For each block row (items 0..3 are the bbox, item 7 the type) a dict with
+    ``'type'``, ``'bbox'`` and ``'spans'`` is created. Image and table
+    body/caption/footnote blocks additionally get ``'group_id'`` from the
+    row's last item. A span is placed in the first block that covers more
+    than ``radio`` of the span's area and is then removed from ``spans``.
+
+    Args:
+        blocks (list): Block rows.
+        spans (list): Span dicts; mutated in place.
+        radio (float): Minimum share of a span's area that must lie inside
+            the block.
+
+    Returns:
+        tuple: ``(block_with_spans, spans)`` where ``spans`` holds the spans
+        that were not placed in any block.
+    """
+    grouped_types = [
+        BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
+        BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
+    ]
+
+    filled = []
+    for block in blocks:
+        kind = block[7]
+        bbox = block[0:4]
+        entry = {'type': kind, 'bbox': bbox}
+        if kind in grouped_types:
+            entry["group_id"] = block[-1]
+
+        owned = [
+            span for span in spans
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox) > radio
+        ]
+
+        # Inline-equation height adjustment, interline-to-inline conversion
+        # and bbox de-overlapping are intentionally not applied here:
+        # de-overlapping changes span bboxes and breaks the later fill step.
+
+        entry['spans'] = owned
+        filled.append(entry)
+
+        # A span that has been placed must not be offered to later blocks.
+        for span in owned:
+            spans.remove(span)
+
+    return filled, spans
+
+
+def fix_block_spans(block_with_spans, img_blocks, table_blocks):
+    """Turn each block's flat ``'spans'`` into its final structure.
+
+    Image and table blocks are nested (body, caption, footnote), so their
+    spans are routed into sub-blocks by ``fix_image_block`` and
+    ``fix_table_block``. Text, title and interline-equation blocks get their
+    spans merged into lines. Blocks of any other type are left out of the
+    result.
+
+    Args:
+        block_with_spans (list): Block dicts with ``'type'`` and ``'spans'``.
+        img_blocks (list): Image block descriptions for ``fix_image_block``.
+        table_blocks (list): Table block descriptions for ``fix_table_block``.
+
+    Returns:
+        list: The converted blocks, in input order.
+    """
+    fixed = []
+    for block in block_with_spans:
+        kind = block['type']
+        if kind == BlockType.Image:
+            fixed.append(fix_image_block(block, img_blocks))
+        elif kind == BlockType.Table:
+            fixed.append(fix_table_block(block, table_blocks))
+        elif kind in [BlockType.Text, BlockType.Title]:
+            fixed.append(fix_text_block(block))
+        elif kind == BlockType.InterlineEquation:
+            fixed.append(fix_interline_block(block))
+        # Any other block type is silently skipped.
+    return fixed
+
+
+def fix_block_spans_v2(block_with_spans):
+    """Convert each block's ``'spans'`` into ``'lines'`` based on block type.
+
+    Text, title and image/table caption or footnote blocks go through
+    ``fix_text_block``. Interline-equation, image-body and table-body blocks
+    go through ``fix_interline_block``. Blocks of any other type are left out
+    of the result.
+
+    Args:
+        block_with_spans (list): Block dicts with ``'type'`` and ``'spans'``.
+
+    Returns:
+        list: The converted blocks, in input order.
+    """
+    text_like_types = [
+        BlockType.Text, BlockType.Title,
+        BlockType.ImageCaption, BlockType.ImageFootnote,
+        BlockType.TableCaption, BlockType.TableFootnote
+    ]
+    standalone_types = [
+        BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody
+    ]
+
+    fixed = []
+    for block in block_with_spans:
+        kind = block['type']
+        if kind in text_like_types:
+            fixed.append(fix_text_block(block))
+        elif kind in standalone_types:
+            fixed.append(fix_interline_block(block))
+        # Any other block type is silently skipped.
+    return fixed
+
+
+def fix_discarded_block(discarded_block_with_spans):
+    """Run ``fix_text_block`` over every discarded block.
+
+    Args:
+        discarded_block_with_spans (list): Block dicts carrying ``'spans'``.
+
+    Returns:
+        list: The results of ``fix_text_block``, in input order.
+    """
+    return [fix_text_block(block) for block in discarded_block_with_spans]
+
+
+def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
+    """Build a block from the spans that lie inside ``block_bbox``.
+
+    A span is selected when more than 60% of its area is inside
+    ``block_bbox``. The selected spans are merged into lines and each line is
+    ordered from left to right. ``spans`` itself is not modified.
+
+    Args:
+        spans (list): Candidate span dicts.
+        block_bbox (list): Bounding box of the block being built.
+        block_type (str): Value stored under ``'type'``.
+
+    Returns:
+        tuple: ``(block, block_spans)`` - the block dict and the list of
+        spans that were selected for it.
+    """
+    selected = [
+        span for span in spans
+        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6
+    ]
+    ordered_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(selected))
+    return {'bbox': block_bbox, 'type': block_type, 'lines': ordered_lines}, selected
+
+
+def make_body_block(span: dict, block_bbox: list, block_type: str):
+    """Create a body block that holds a single line with a single span.
+
+    Args:
+        span (dict): The span to wrap.
+        block_bbox (list): Bounding box used for both the block and its line.
+        block_type (str): Value stored under ``'type'``.
+
+    Returns:
+        dict: ``{'bbox', 'type', 'lines'}`` with exactly one line.
+    """
+    only_line = {'bbox': block_bbox, 'spans': [span]}
+    return {
+        'bbox': block_bbox,
+        'type': block_type,
+        'lines': [only_line],
+    }
+
+
+def fix_image_block(block, img_blocks):
+    """Split an image block's spans into body, caption and footnote blocks.
+
+    The first entry of ``img_blocks`` matching ``block['bbox']`` (per
+    ``_is_in_or_part_overlap_with_area_ratio`` with ratio 0.95) drives the
+    split. The image span whose bbox equals ``'img_body_bbox'`` becomes the
+    body block and is removed from the block's spans. Caption and footnote
+    blocks are built from the remaining spans when the respective bbox is not
+    ``None``. Finally ``block['spans']`` is deleted.
+
+    Args:
+        block (dict): Block with ``'bbox'`` and ``'spans'``; mutated in place.
+        img_blocks (list): Dicts with ``'bbox'``, ``'img_body_bbox'``,
+            ``'img_caption_bbox'`` and ``'img_footnote_bbox'``.
+
+    Returns:
+        dict: The same ``block`` with a ``'blocks'`` list and no ``'spans'``.
+    """
+    sub_blocks = []
+    block['blocks'] = sub_blocks
+    remaining = block['spans']
+
+    for candidate in img_blocks:
+        if not _is_in_or_part_overlap_with_area_ratio(
+                block['bbox'], candidate['bbox'], 0.95):
+            continue
+
+        # Body: the image span whose bbox is exactly the image body bbox.
+        for span in remaining:
+            if (span['type'] == ContentType.Image
+                    and candidate['img_body_bbox'] == span['bbox']):
+                sub_blocks.append(make_body_block(
+                    span, candidate['img_body_bbox'], BlockType.ImageBody))
+                # Safe to mutate: the loop is left right after the removal.
+                remaining.remove(span)
+                break
+
+        # Caption, if the image block declares one.
+        if candidate['img_caption_bbox'] is not None:
+            caption_block, _ = merge_spans_to_block(
+                remaining, candidate['img_caption_bbox'],
+                BlockType.ImageCaption)
+            sub_blocks.append(caption_block)
+
+        # Footnote, if the image block declares one.
+        if candidate['img_footnote_bbox'] is not None:
+            footnote_block, _ = merge_spans_to_block(
+                remaining, candidate['img_footnote_bbox'],
+                BlockType.ImageFootnote)
+            sub_blocks.append(footnote_block)
+
+        # Only the first matching image block is used.
+        break
+
+    del block['spans']
+    return block
+
+
+def fix_table_block(block, table_blocks):
+    """Split a table block's spans into body, caption and footnote blocks.
+
+    The first entry of ``table_blocks`` matching ``block['bbox']`` (per
+    ``_is_in_or_part_overlap_with_area_ratio`` with ratio 0.95) drives the
+    split. The table span whose bbox equals ``'table_body_bbox'`` becomes the
+    body block. Spans used for the caption are removed before the footnote is
+    built. Finally ``block['spans']`` is deleted.
+
+    Args:
+        block (dict): Block with ``'bbox'`` and ``'spans'``; mutated in place.
+        table_blocks (list): Dicts with ``'bbox'``, ``'table_body_bbox'``,
+            ``'table_caption_bbox'`` and ``'table_footnote_bbox'``.
+
+    Returns:
+        dict: The same ``block`` with a ``'blocks'`` list and no ``'spans'``.
+    """
+    sub_blocks = []
+    block['blocks'] = sub_blocks
+    remaining = block['spans']
+
+    for candidate in table_blocks:
+        if not _is_in_or_part_overlap_with_area_ratio(
+                block['bbox'], candidate['bbox'], 0.95):
+            continue
+
+        # Body: the table span whose bbox is exactly the table body bbox.
+        for span in remaining:
+            if (span['type'] == ContentType.Table
+                    and candidate['table_body_bbox'] == span['bbox']):
+                sub_blocks.append(make_body_block(
+                    span, candidate['table_body_bbox'], BlockType.TableBody))
+                # Safe to mutate: the loop is left right after the removal.
+                remaining.remove(span)
+                break
+
+        # Caption, if the table block declares one.
+        if candidate['table_caption_bbox'] is not None:
+            caption_block, caption_spans = merge_spans_to_block(
+                remaining, candidate['table_caption_bbox'],
+                BlockType.TableCaption)
+            sub_blocks.append(caption_block)
+            # Caption spans must not be reused by the footnote below.
+            for span in caption_spans:
+                remaining.remove(span)
+
+        # Footnote, if the table block declares one.
+        if candidate['table_footnote_bbox'] is not None:
+            footnote_block, _ = merge_spans_to_block(
+                remaining, candidate['table_footnote_bbox'],
+                BlockType.TableFootnote)
+            sub_blocks.append(footnote_block)
+
+        # Only the first matching table block is used.
+        break
+
+    del block['spans']
+    return block
+
+
+def fix_text_block(block):
+    """Replace a text block's ``'spans'`` with sorted ``'lines'``.
+
+    Equation spans inside a text block are treated as inline: every span of
+    type ``ContentType.InterlineEquation`` is retyped to
+    ``ContentType.InlineEquation`` before the spans are merged into lines.
+
+    Args:
+        block (dict): Block carrying ``'spans'``; mutated in place.
+
+    Returns:
+        dict: The same ``block`` with ``'lines'`` set and ``'spans'`` removed.
+    """
+    spans = block['spans']
+    for span in spans:
+        if span['type'] == ContentType.InterlineEquation:
+            span['type'] = ContentType.InlineEquation
+    block['lines'] = line_sort_spans_by_left_to_right(merge_spans_to_line(spans))
+    del block['spans']
+    return block
+
+
+def fix_interline_block(block):
+    """Replace a block's ``'spans'`` with sorted ``'lines'``.
+
+    Unlike ``fix_text_block`` the span types are left untouched.
+
+    Args:
+        block (dict): Block carrying ``'spans'``; mutated in place.
+
+    Returns:
+        dict: The same ``block`` with ``'lines'`` set and ``'spans'`` removed.
+    """
+    merged_lines = merge_spans_to_line(block['spans'])
+    block['lines'] = line_sort_spans_by_left_to_right(merged_lines)
+    del block['spans']
+    return block
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
+from loguru import logger
+
+from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
+    __is_overlaps_y_exceeds_threshold, calculate_iou
+from magic_pdf.libs.drop_tag import DropTag
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+
+
+def remove_overlaps_low_confidence_spans(spans):
+    """Drop the lower-scored span of every heavily overlapping pair.
+
+    Two spans are considered duplicates when ``calculate_iou`` of their
+    bboxes exceeds 0.9. The one with the strictly lower ``'score'`` is
+    dropped; on equal scores the second span of the pair is dropped. Spans
+    already marked for dropping no longer take part in comparisons.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'`` and ``'score'``; mutated in
+            place.
+
+    Returns:
+        tuple: ``(spans, dropped_spans)``; dropped spans carry
+        ``'tag' = DropTag.SPAN_OVERLAP``.
+    """
+    losers = []
+    for left in spans:
+        for right in spans:
+            if left == right:
+                continue
+            # Neither side may already be scheduled for removal.
+            if left in losers or right in losers:
+                continue
+            if calculate_iou(left['bbox'], right['bbox']) > 0.9:
+                loser = left if left['score'] < right['score'] else right
+                if loser not in losers:
+                    losers.append(loser)
+
+    for loser in losers:
+        spans.remove(loser)
+        loser['tag'] = DropTag.SPAN_OVERLAP
+
+    return spans, losers
+
+
+def remove_overlaps_min_spans(spans):
+    """Drop the smaller span of every overlapping pair.
+
+    For each pair, ``get_minbox_if_overlap_by_ratio`` (ratio 0.65) returns
+    either ``None`` or a bbox. In the latter case the first span in ``spans``
+    whose bbox equals that bbox is scheduled for removal.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'``; mutated in place.
+
+    Returns:
+        tuple: ``(spans, dropped_spans)``; dropped spans carry
+        ``'tag' = DropTag.SPAN_OVERLAP``.
+    """
+    victims = []
+    for first in spans:
+        for second in spans:
+            if first == second:
+                continue
+            smaller_bbox = get_minbox_if_overlap_by_ratio(
+                first['bbox'], second['bbox'], 0.65)
+            if smaller_bbox is None:
+                continue
+            victim = next(
+                (item for item in spans if item['bbox'] == smaller_bbox), None)
+            if victim is not None and victim not in victims:
+                victims.append(victim)
+
+    for victim in victims:
+        spans.remove(victim)
+        victim['tag'] = DropTag.SPAN_OVERLAP
+
+    return spans, victims
+
+
+def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
+    """Remove every span that mostly lies inside one of the given bboxes.
+
+    A span is removed when more than 50% of its area is inside any bbox of
+    ``need_remove_spans_bboxes``.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'``; mutated in place.
+        need_remove_spans_bboxes (list): Bboxes marking regions to clear.
+
+    Returns:
+        list: The same ``spans`` list without the removed spans.
+    """
+    doomed = []
+    for span in spans:
+        inside_removed_region = any(
+            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], region) > 0.5
+            for region in need_remove_spans_bboxes
+        )
+        if inside_removed_region and span not in doomed:
+            doomed.append(span)
+
+    for span in doomed:
+        spans.remove(span)
+
+    return spans
+
+
+def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
+    """Remove spans covered by tagged bboxes and tag the removed spans.
+
+    For every ``drop_tag -> bboxes`` entry a span is removed when more than
+    50% of its area lies inside one of the bboxes. For
+    ``DropTag.FOOTNOTE`` a span is also removed when its center is below the
+    bbox's bottom edge and horizontally between the bbox's left and right
+    edges.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'``; mutated in place.
+        need_remove_spans_bboxes_dict (dict): Maps a drop tag to a list of
+            bboxes.
+
+    Returns:
+        tuple: ``(spans, dropped_spans)``; each dropped span carries the drop
+        tag under ``'tag'``.
+    """
+    dropped_spans = []
+    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
+        matched = []
+        for span in spans:
+            box = span['bbox']
+            for region in removed_bboxes:
+                overlaps = calculate_overlap_area_in_bbox1_area_ratio(box, region) > 0.5
+                # Footnote regions also swallow whatever sits underneath them.
+                below_footnote = (
+                    not overlaps
+                    and drop_tag == DropTag.FOOTNOTE
+                    and (box[1] + box[3]) / 2 > region[3]
+                    and region[0] < (box[0] + box[2]) / 2 < region[2]
+                )
+                if overlaps or below_footnote:
+                    matched.append(span)
+                    break
+
+        for span in matched:
+            spans.remove(span)
+            span['tag'] = drop_tag
+            dropped_spans.append(span)
+
+    return spans, dropped_spans
+
+
+def adjust_bbox_for_standalone_block(spans):
+    """Lower the top edge of standalone spans that have text to their left.
+
+    For every interline-equation, image or table span: whenever a text or
+    inline-equation span lies strictly within its vertical extent and starts
+    further left, the standalone span's y0 is set to that text span's y0.
+    The update is applied immediately, so later comparisons see the new y0.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'`` and ``'type'``; bboxes are
+            modified in place.
+
+    Returns:
+        list: The same ``spans`` list.
+    """
+    standalone_types = [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
+    inline_types = [ContentType.Text, ContentType.InlineEquation]
+
+    for candidate in spans:
+        if candidate['type'] not in standalone_types:
+            continue
+        for neighbour in spans:
+            if neighbour['type'] not in inline_types:
+                continue
+            cand_box = candidate['bbox']
+            text_box = neighbour['bbox']
+            # The text span must sit vertically inside the standalone span...
+            covers_text = cand_box[1] < text_box[1] and cand_box[3] > text_box[3]
+            # ...and start to the left of it.
+            if covers_text and text_box[0] < cand_box[0]:
+                cand_box[1] = text_box[1]
+    return spans
+
+
+def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
+    """Group spans into lines and unify the y-range of spans in text lines.
+
+    ``spans`` is sorted in place by y0 and walked once. Interline-equation,
+    image and table spans are appended to ``displayed_list``. Text lines are
+    appended to ``text_inline_lines`` as ``(line_spans, (y0, y1))`` tuples.
+    Afterwards every recorded line is sorted by x0 and each of its spans gets
+    ``bbox[1]``/``bbox[3]`` overwritten with the recorded ``(y0, y1)``.
+
+    The function returns nothing; results are delivered through the two
+    output lists and through in-place mutation of the span bboxes.
+
+    Args:
+        spans: Span dicts with ``'bbox'`` and ``'type'``.
+        displayed_list: Output list receiving the standalone spans.
+        text_inline_lines: Output list receiving ``(line, (y0, y1))`` tuples.
+    """
+    # displayed_list = []
+    # Nothing to do for an empty span list
+    if len(spans) == 0:
+        pass
+    else:
+        spans.sort(key=lambda span: span['bbox'][1])
+
+        lines = []
+        current_line = [spans[0]]
+        if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            displayed_list.append(spans[0])
+
+        # y-range recorded for the line currently being built
+        line_first_y0 = spans[0]["bbox"][1]
+        line_first_y = spans[0]["bbox"][3]
+        # text_inline_lines is later searched when handling interline equations
+        # text_inline_lines = []
+        for span in spans[1:]:
+            # if span.get("content","") == "78.":
+            #     print("debug")
+            # If this span is an interline equation, image or table, or the
+            # current line already contains one, the span cannot join the line
+            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
+                    current_line):
+                # Hand standalone spans over to the caller
+                if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+                    displayed_list.append(span)
+                # Start a new line
+                lines.append(current_line)
+                # Record the finished line only if it has several spans or
+                # starts with a text / inline-equation span
+                if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+                    text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+                current_line = [span]
+                line_first_y0 = span["bbox"][1]
+                line_first_y = span["bbox"][3]
+                continue
+
+            # If the span overlaps the last span of the current line on the
+            # y axis, add it to the current line
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+                # The recorded y-range follows the most recently appended span
+                # whose type equals the literal "text"
+                if span["type"] == "text":
+                    line_first_y0 = span["bbox"][1]
+                    line_first_y = span["bbox"][3]
+                current_line.append(span)
+
+            else:
+                # Otherwise start a new line
+                lines.append(current_line)
+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+                current_line = [span]
+                line_first_y0 = span["bbox"][1]
+                line_first_y = span["bbox"][3]
+
+            # Flush the last line
+        if current_line:
+            lines.append(current_line)
+            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+        for line in text_inline_lines:
+            # Sort the spans of the line by x0
+            current_line = line[0]
+            current_line.sort(key=lambda span: span['bbox'][0])
+
+        # Give every span of a text line the same y-range
+        for line in text_inline_lines:
+            current_line, (line_first_y0, line_first_y) = line
+            for span in current_line:
+                span["bbox"][1] = line_first_y0
+                span["bbox"][3] = line_first_y
+
+        # return spans, displayed_list, text_inline_lines
+
+
+def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
+    """Retype wrongly detected interline equations as inline equations.
+
+    Each span of ``displayed_list`` is compared against the text lines in
+    ``text_inline_lines`` (``(line, (y0, y1))`` tuples). The cursor ``j`` into
+    the text lines is shared across spans and never moves backwards. When an
+    interline-equation span overlaps a text line vertically, it is converted
+    to ``ContentType.InlineEquation`` and its y-range is set to the line's,
+    provided that either this is the last text line, or the span does not
+    also overlap the next text line and is less than three times as tall as
+    the current line.
+
+    Args:
+        spans: Returned unchanged; the function does not read it otherwise.
+        displayed_list: Standalone spans to inspect; mutated in place.
+        text_inline_lines: ``(line, (y0, y1))`` tuples of text lines.
+
+    Returns:
+        The ``spans`` argument.
+    """
+    # Convert wrongly detected interline equations to inline equations
+    j = 0
+    for i in range(len(displayed_list)):
+        # if i == 8:
+        #     print("debug")
+        span = displayed_list[i]
+        span_y0, span_y = span["bbox"][1], span["bbox"][3]
+
+        while j < len(text_inline_lines):
+            text_line = text_inline_lines[j]
+            y0, y1 = text_line[1]
+            # The line's top or bottom lies inside the span, or the span
+            # encloses the line, and the y overlap passes the threshold check
+            if (
+                    span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
+            ) and __is_overlaps_y_exceeds_threshold(
+                span['bbox'], (0, y0, 0, y1)
+            ):
+                # Adjust the equation type
+                if span["type"] == ContentType.InterlineEquation:
+                    # There is no following text line to check against
+                    if j + 1 >= len(text_inline_lines):
+                        span["type"] = ContentType.InlineEquation
+                        span["bbox"][1] = y0
+                        span["bbox"][3] = y1
+                    else:
+                        # Do not convert when the equation also overlaps the
+                        # next text line or is at least 3x as tall as the text
+                        y0_next, y1_next = text_inline_lines[j + 1][1]
+                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
+                                y1 - y0) > span_y - span_y0:
+                            span["type"] = ContentType.InlineEquation
+                            span["bbox"][1] = y0
+                            span["bbox"][3] = y1
+                break
+            # Stop when the text line starts below the span, or starts inside
+            # it without passing the overlap check (`and` binds tighter than `or`)
+            elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
+                                                                                                (0, y0, 0, y1)):
+                break
+            else:
+                j += 1
+
+    return spans
+
+
+def get_qa_need_list(blocks):
+    """Collect image, table and equation spans from all blocks.
+
+    Args:
+        blocks (list): Block dicts with ``'lines'``; each line carries
+            ``'spans'`` and each span a ``'type'``.
+
+    Returns:
+        tuple: ``(images, tables, interline_equations, inline_equations)``,
+        each a list of the matching span dicts (the same objects, not copies).
+    """
+    images, tables = [], []
+    interline_equations, inline_equations = [], []
+
+    every_span = (
+        span
+        for block in blocks
+        for line in block["lines"]
+        for span in line["spans"]
+    )
+    for span in every_span:
+        kind = span["type"]
+        if kind == ContentType.Image:
+            images.append(span)
+        elif kind == ContentType.Table:
+            tables.append(span)
+        elif kind == ContentType.InlineEquation:
+            inline_equations.append(span)
+        elif kind == ContentType.InterlineEquation:
+            interline_equations.append(span)
+        # Spans of any other type are ignored.
+
+    return images, tables, interline_equations, inline_equations
+
+
+def get_qa_need_list_v2(blocks):
+    """Collect image, table and interline-equation blocks.
+
+    Args:
+        blocks (list): Block dicts carrying a ``'type'`` entry.
+
+    Returns:
+        tuple: ``(images, tables, interline_equations)``, each a list of the
+        matching block dicts (the same objects, not copies).
+    """
+    images, tables, interline_equations = [], [], []
+
+    for block in blocks:
+        kind = block["type"]
+        if kind == BlockType.Image:
+            images.append(block)
+        elif kind == BlockType.Table:
+            tables.append(block)
+        elif kind == BlockType.InterlineEquation:
+            interline_equations.append(block)
+        # Blocks of any other type are ignored.
+
+    return images, tables, interline_equations
--- a/magic_pdf/pre_proc/pdf_pre_filter.py
+++ b/magic_pdf/pre_proc/pdf_pre_filter.py
+from magic_pdf.libs.commons import fitz
+from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
+from magic_pdf.libs.drop_reason import DropReason
+
+
+def __area(box):
+    """Return width times height of ``box`` (``[x0, y0, x1, y1]``)."""
+    width = box[2] - box[0]
+    height = box[3] - box[1]
+    return width * height
+
+def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
+    """Check whether the page has text on top of a large colored rectangle.
+
+    All filled drawings of the page are collected, skipping white fills,
+    rectangles smaller than 10x10 and rectangles touching any image bbox
+    (these may be color patches of an SVG image). If the largest remaining
+    rectangle is wider than 20% of the page and taller than 10% of the page,
+    and at least one text block lies inside it, the page is considered to
+    contain a colored-background text box.
+
+    Args:
+        page (fitz.Page): Page to inspect; ``rect`` and ``get_cdrawings()``
+            are used.
+        text_blocks (list): Dicts carrying a ``'bbox'`` entry.
+        image_bboxes (list): Bboxes of the images on the page.
+
+    Returns:
+        bool: ``True`` if such a colored background rectangle exists.
+    """
+    p_width, p_height = page.rect.width, page.rect.height
+
+    # Collect the candidate background rectangles
+    color_bg_rect = []
+    for drawing in page.get_cdrawings():
+        fill = drawing.get('fill')
+        if not fill:  # no fill at all, i.e. transparent
+            continue
+        # Skip white fills. The components are truncated to int as before;
+        # the comparison is done tuple-to-tuple because a list never equals
+        # a tuple, which made the former white check a no-op.
+        if tuple(int(channel) for channel in fill[:3]) == (1, 1, 1):
+            continue
+        rect = drawing['rect']
+        # Skip very small rectangles
+        if __area(rect) < 10*10:
+            continue
+        # Skip color patches that belong to (SVG) images
+        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
+            continue
+        color_bg_rect.append(rect)
+
+    if not color_bg_rect:
+        return False
+
+    # Only the largest background rectangle is examined
+    max_rect = max(color_bg_rect, key=__area)
+    # TODO: require more than 3 lines / 50 characters inside the rectangle
+    is_large_enough = (max_rect[2]-max_rect[0] > 0.2*p_width
+                       and max_rect[3]-max_rect[1] > 0.1*p_height)
+    if not is_large_enough:
+        return False
+
+    max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
+    # Look for a text block that falls into the rectangle
+    for text_block in text_blocks:
+        box = text_block['bbox']
+        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
+        if _is_in(box_int, max_rect_int):
+            return True
+
+    return False
+
+
+def __is_table_overlap_text_block(text_blocks, table_bbox):
+    """Check whether ``table_bbox`` covers any of the text blocks.
+
+    Returns ``True`` as soon as ``_is_in_or_part_overlap`` reports a hit
+    between ``table_bbox`` and a text block's ``'bbox'``.
+    TODO
+    """
+    return any(
+        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
+        for text_block in text_blocks
+    )
+
+
+def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
+    """Decide whether a page is acceptable for further processing.
+
+    Currently the only check is for text on a colored background rectangle;
+    ``table_bboxes`` is accepted but not used.
+
+    Returns:
+        tuple: ``(True, None)`` when the page is acceptable, otherwise
+        ``(False, info)`` where ``info`` holds ``"_need_drop"`` and
+        ``"_drop_reason"``.
+    """
+    has_colored_background = __is_contain_color_background_rect(page, text_blocks, image_bboxes)
+    if not has_colored_background:
+        return True, None
+
+    drop_info = {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
+    return False, drop_info
\ No newline at end of file
--- a/magic_pdf/pre_proc/post_layout_split.py
+++ b/magic_pdf/pre_proc/post_layout_split.py
--- a/magic_pdf/pre_proc/remove_bbox_overlap.py
+++ b/magic_pdf/pre_proc/remove_bbox_overlap.py
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
+from magic_pdf.libs.drop_reason import DropReason
+
+def _remove_overlap_between_bbox(bbox1, bbox2):
+    """Separate two partially overlapping boxes by splitting the overlap.
+
+    When ``_is_part_overlap`` reports no partial overlap the inputs are
+    returned untouched. Otherwise the overlap is cut along the axis on which
+    it is narrower: both boxes are pulled back to 0.25 on either side of the
+    floored midpoint of the overlapping edges.
+
+    Args:
+        bbox1: First box as ``x0, y0, x1, y1``.
+        bbox2: Second box as ``x0, y0, x1, y1``.
+
+    Returns:
+        tuple: ``(bbox1, bbox2, drop_reason)``. On success the boxes are new
+        lists and ``drop_reason`` is ``None``. If the split would leave a box
+        without positive width or height, the original boxes are returned
+        with ``DropReason.NEGATIVE_BBOX_AREA``.
+    """
+    if not _is_part_overlap(bbox1, bbox2):
+        return bbox1, bbox2, None
+
+    ix0, iy0, ix1, iy1 = bbox1
+    x0, y0, x1, y1 = bbox2
+
+    overlap_w = min(x1, ix1) - max(x0, ix0)
+    overlap_h = min(y1, iy1) - max(y0, iy0)
+
+    if overlap_h > overlap_w:
+        # The overlap is taller than wide: separate the boxes horizontally.
+        if x1 >= ix1:
+            mid = (x0 + ix1) // 2
+            ix1 = min(mid - 0.25, ix1)
+            x0 = max(mid + 0.25, x0)
+        else:
+            mid = (ix0 + x1) // 2
+            ix0 = max(mid + 0.25, ix0)
+            x1 = min(mid - 0.25, x1)
+    else:
+        # Otherwise separate the boxes vertically.
+        if y1 >= iy1:
+            mid = (y0 + iy1) // 2
+            y0 = max(mid + 0.25, y0)
+            iy1 = min(iy1, mid-0.25)
+        else:
+            mid = (iy0 + y1) // 2
+            y1 = min(y1, mid-0.25)
+            iy0 = max(mid + 0.25, iy0)
+
+    both_valid = ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0
+    if not both_valid:
+        return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
+    return [ix0, iy0, ix1, iy1], [x0, y0, x1, y1], None
+
+
+def _remove_overlap_between_bboxes(arr):
+    """Remove contained boxes and resolve partial overlaps between the rest.
+
+    Boxes that ``_is_in`` reports as lying inside another box are dropped
+    first. Each remaining box is then de-overlapped against every box kept so
+    far via ``_remove_overlap_between_bbox``. When that fails, the box with
+    the lower ``"score"`` is dropped (the new box loses on a tie) and the
+    failure reason is recorded.
+
+    Args:
+        arr (list): Dicts with ``"bbox"`` and ``"score"``; bboxes are updated
+            in place.
+
+    Returns:
+        tuple: ``(res, drop_reasons)`` where ``res`` has the same length as
+        ``arr`` with ``None`` at the positions of dropped boxes, and
+        ``drop_reasons`` lists the reason of every failed separation.
+    """
+    drop_reasons = []
+    N = len(arr)
+    res = [None] * N
+    # A box contained in any other box is not kept.
+    keeps = [
+        not any(_is_in(arr[i]["bbox"], arr[j]["bbox"]) for j in range(N) if j != i)
+        for i in range(N)
+    ]
+
+    for idx, v in enumerate(arr):
+        if not keeps[idx]:
+            continue
+        for i in range(N):
+            if res[i] is None:
+                continue
+
+            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
+            if drop_reason is None:
+                v["bbox"] = bbox1
+                res[i]["bbox"] = bbox2
+            else:
+                if v["score"] > res[i]["score"]:
+                    keeps[i] = False
+                    res[i] = None
+                else:
+                    # NOTE(review): v keeps being compared against the
+                    # remaining kept boxes even though it is dropped here.
+                    keeps[idx] = False
+                # Record the reason itself (the list used to be appended
+                # to itself here).
+                drop_reasons.append(drop_reason)
+        if keeps[idx]:
+            res[idx] = v
+    return res, drop_reasons
+
+
+def remove_overlap_between_bbox_for_span(spans):
+    """
+    Remove overlaps between span boxes.
+
+    Args:
+        spans (list[dict]): Spans with a "bbox" entry and an optional "score"
+            entry (0.1 is used when "score" is missing). The "bbox" of every
+            kept span is overwritten with its adjusted box.
+
+    Returns:
+        tuple: (kept_spans, drop_reasons) where ``kept_spans`` are the original
+        span dicts that were not dropped, in their original order.
+    """
+    candidates = []
+    for span in spans:
+        candidates.append({"bbox": span["bbox"], "score": span.get("score", 0.1)})
+
+    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
+
+    kept_spans = []
+    for span, item in zip(spans, resolved):
+        if item is not None:
+            span["bbox"] = item["bbox"]
+            kept_spans.append(span)
+    return kept_spans, drop_reasons
+
+
+def remove_overlap_between_bbox_for_block(all_bboxes):
+    """
+    Remove overlaps between block boxes.
+
+    Args:
+        all_bboxes (list[list]): Each entry starts with the four box
+            coordinates and ends with its score. The first four elements of
+            every kept entry are overwritten with the adjusted box.
+
+    Returns:
+        tuple: (kept_bboxes, drop_reasons) where ``kept_bboxes`` are the
+        original entries that were not dropped, in their original order.
+    """
+    candidates = []
+    for bbox in all_bboxes:
+        candidates.append({"bbox": bbox[:4], "score": bbox[-1]})
+
+    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
+
+    kept_bboxes = []
+    for bbox, item in zip(all_bboxes, resolved):
+        if item is not None:
+            bbox[:4] = item["bbox"]
+            kept_bboxes.append(bbox)
+    return kept_bboxes, drop_reasons
+
--- a/magic_pdf/pre_proc/remove_colored_strip_bbox.py
+++ b/magic_pdf/pre_proc/remove_colored_strip_bbox.py
+from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
+from loguru import logger
+
+from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
+
+
+def __area(box):
+    """Return the area of a box given as (x0, y0, x1, y1)."""
+    width = box[2] - box[0]
+    height = box[3] - box[1]
+    return width * height
+
+
+def rectangle_position_determination(rect, p_width):
+    """
+    Decide whether a rectangle lies near the vertical centre line of the page.
+
+    Args:
+        rect (list): Rectangle coordinates in the form [x1, y1, x2, y2].
+        p_width (int): Page width.
+
+    Returns:
+        bool: True when the rectangle straddles the centre line, or when its
+        nearer vertical edge is less than 20% of the page width away from it;
+        False otherwise.
+    """
+    center_x = p_width / 2
+    left, right = rect[0], rect[2]
+
+    # A rectangle that crosses the centre line is always accepted.
+    if left < center_x and right > center_x:
+        return True
+
+    # Otherwise measure the gap from the centre line to the nearer edge.
+    if left > center_x:
+        gap = left - center_x
+    else:
+        gap = center_x - right
+    return gap < p_width * 0.2
+
+def remove_colored_strip_textblock(remain_text_blocks, page):
+    """
+    Move text blocks that sit on a coloured strip near the top of the page out of
+    ``remain_text_blocks``.
+
+    A drawing returned by ``page.get_cdrawings()`` counts as a coloured strip when
+    it has a fill other than (1.0, 1.0, 1.0), an area above 100, is near the page
+    centre line (see ``rectangle_position_determination``), ends within the top
+    30% of the page height and is more than four times as wide as it is tall.
+    A text block is moved when it is inside such a strip, or overlaps it with an
+    overlap-to-smaller-box ratio above 0.6. Moved blocks are tagged with
+    ``COLOR_BG_HEADER_TXT_BLOCK``.
+
+    Args:
+        remain_text_blocks (list): Remaining text blocks; modified in place.
+        page (Page): Page object.
+
+    Returns:
+        tuple: The remaining text blocks and the list of removed text blocks.
+    """
+    colored_strip_textblocks = []  # returned as-is when nothing matches
+    if len(remain_text_blocks) == 0:
+        return remain_text_blocks, colored_strip_textblocks
+
+    p_width, p_height = page.rect.width, page.rect.height
+
+    # Collect the background rectangles that look like a coloured header strip.
+    colored_strip_bg_rect = []
+    for drawing in page.get_cdrawings():
+        has_fill = 'fill' in drawing and drawing['fill'] and drawing['fill'] != (1.0, 1.0, 1.0)
+        rect = drawing['rect']
+        criteria = (
+            has_fill,
+            __area(rect) > 100,  # ignore tiny rectangles
+            rectangle_position_determination(rect, p_width),
+            rect[3] < p_height * 0.3,  # bottom edge within the top 30% of the page
+            (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4,  # width more than 4x height
+        )
+        if all(criteria):
+            colored_strip_bg_rect.append(rect)
+
+    for colored_strip_block_bbox in colored_strip_bg_rect:
+        for text_block in remain_text_blocks:
+            text_bbox = text_block['bbox']
+            on_strip = _is_in(text_bbox, colored_strip_block_bbox)
+            if not on_strip and _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox):
+                on_strip = calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6
+            if on_strip:
+                logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
+                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
+                colored_strip_textblocks.append(text_block)
+
+        # Remove only after the scan above so the list is not mutated while iterating.
+        for matched_block in colored_strip_textblocks:
+            if matched_block in remain_text_blocks:
+                remain_text_blocks.remove(matched_block)
+
+    return remain_text_blocks, colored_strip_textblocks
+
--- a/magic_pdf/pre_proc/remove_footer_header.py
+++ b/magic_pdf/pre_proc/remove_footer_header.py
+import re
+
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap
+from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
+
+
+def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
+                                   page_no_bboxs, page_w, page_h):
+    """
+    Remove header, footer and page-number content from one page.
+
+    Spans lying in the header or footer band are deleted from their lines, lines
+    left without spans are deleted from their blocks, and blocks left without
+    lines are tagged ``CONTENT_IN_FOOT_OR_HEADER`` and moved to the removed list.
+    Page numbers are handled at span granularity and tagged ``PAGE_NO``.
+
+    Args:
+        text_raw_blocks (list[dict]): Text blocks with "bbox" and "lines"; each
+            line has "spans", each span has "bbox" and "text". Modified in place.
+        image_bboxes (list): Image boxes (x0, y0, x1, y1).
+        table_bboxes (list): Table boxes (x0, y0, x1, y1).
+        header_bboxs (list): Detected header boxes, may be empty or None.
+        footer_bboxs (list): Detected footer boxes, may be empty or None.
+        page_no_bboxs (list): Detected page-number boxes, may be empty or None.
+        page_w: Page width.
+        page_h: Page height.
+
+    Returns:
+        tuple: (image_bbox_remain, table_bbox_remain, text_block_remain,
+        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove).
+    """
+    # The header band ends at the lowest header edge, the footer band starts at
+    # the highest footer edge; without detections the bands are empty.
+    header_y0 = max(b[3] for b in header_bboxs) if header_bboxs else 0
+    footer_y0 = min(b[1] for b in footer_bboxs) if footer_bboxs else page_h
+    if page_no_bboxs:
+        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
+        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
+
+        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
+        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h
+
+        # Page numbers in the top/bottom half extend the header/footer bands.
+        header_y0 = max(header_y0, top_max_y0)
+        footer_y0 = min(footer_y0, btn_min_y1)
+
+    content_boundry = [0, header_y0, page_w, footer_y0]
+
+    header = [0, 0, page_w, header_y0]
+    footer = [0, footer_y0, page_w, page_h]
+
+    # The bands are known; now prune the text blocks.
+    text_block_to_remove = []
+    for blk in text_raw_blocks:
+        for line in blk['lines']:
+            # Slice assignment keeps the same list object, matching the previous
+            # in-place removal. Every matching span is dropped (the old code reset
+            # its delete list on each iteration and so only handled the last one).
+            line['spans'][:] = [
+                span for span in line['spans']
+                if not (span['bbox'][3] < header_y0
+                        or _is_in_or_part_overlap(span['bbox'], header)
+                        or _is_in_or_part_overlap(span['bbox'], footer))
+            ]
+        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
+
+        if not blk['lines']:
+            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
+            text_block_to_remove.append(blk)
+
+    # A small page number often overlaps content_boundry slightly and ends up in
+    # the body text, so page numbers are additionally handled span by span.
+    page_no_block_2_remove = []
+    if page_no_bboxs:
+        for pagenobox in page_no_bboxs:
+            for block in text_raw_blocks:
+                if _is_in_or_part_overlap(pagenobox, block['bbox']):  # tag page numbers at span level
+                    for line in block['lines']:
+                        for span in line['spans']:
+                            if _is_in_or_part_overlap(pagenobox, span['bbox']):
+                                span['tag'] = PAGE_NO
+                                # Drop the whole block when this span is its only content.
+                                if len(line['spans']) == 1 and len(block['lines']) == 1:
+                                    page_no_block_2_remove.append(block)
+    else:
+        # No detected page number: test whether the bottom-most block is one. It
+        # must have exactly one line with one span whose text is non-blank,
+        # contains a digit and contains no ASCII letter.
+        if len(text_raw_blocks) > 0:
+            # NOTE: this sort reorders the caller's list (kept for compatibility).
+            text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
+            last_block = text_raw_blocks[0]
+            if len(last_block['lines']) == 1:
+                last_line = last_block['lines'][0]
+                if len(last_line['spans']) == 1:
+                    last_span = last_line['spans'][0]
+                    last_text = last_span['text']
+                    if last_text.strip() and not re.search('[a-zA-Z]', last_text) and re.search('[0-9]', last_text):
+                        last_span['tag'] = PAGE_NO
+                        page_no_block_2_remove.append(last_block)
+
+    text_block_to_remove.extend(page_no_block_2_remove)
+
+    for blk in text_block_to_remove:
+        if blk in text_raw_blocks:
+            text_raw_blocks.remove(blk)
+
+    text_block_remain = text_raw_blocks
+
+    # Images and tables are kept only when they touch the content area.
+    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
+    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
+    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
+    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
+
+    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
--- a/magic_pdf/pre_proc/remove_rotate_bbox.py
+++ b/magic_pdf/pre_proc/remove_rotate_bbox.py
+import math
+
+from magic_pdf.libs.boxbase import is_vbox_on_side
+from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
+
+
+def detect_non_horizontal_texts(result_dict):
+    """
+    Flag watermarks and vertical margin notes in the document.
+
+    A block is keyed by its (bbox, text) pair. Blocks with a "dir" entry whose
+    absolute angle lies strictly between 5 and 85 degrees are watermark
+    candidates; those strictly between 85 and 105 degrees are margin-note
+    candidates. A candidate key is confirmed when it occurs more than
+    ``len(result_dict) // 2`` times across the document. Every block then gets
+    ``is_watermark`` and ``is_vertical_margin_note`` set to 1 or 0.
+
+    Only keys starting with "page_" are treated as pages and only keys starting
+    with "block_" as blocks; everything else is left untouched.
+
+    Parameters
+    ----------
+    result_dict : dict
+        The result dictionary. Blocks need "bbox" and "text" entries; "dir" is
+        optional and holds the (x, y) writing direction.
+
+    Returns
+    -------
+    result_dict : dict
+        The same dictionary, updated in place.
+    """
+    potential_watermarks = {}
+    potential_margin_notes = {}
+
+    for page_id, page_content in result_dict.items():
+        if not page_id.startswith("page_"):
+            continue
+        for block_id, block_data in page_content.items():
+            if not block_id.startswith("block_") or "dir" not in block_data:
+                continue
+
+            # tuple() makes the key hashable even when the bbox is stored as a list.
+            coordinates_text = (tuple(block_data["bbox"]), block_data["text"])
+
+            angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))
+
+            if 5 < angle < 85:  # slanted text: watermark candidate
+                potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
+
+            if 85 < angle < 105:  # vertical text: margin-note candidate
+                potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1
+
+    # Confirm candidates that occur more often than the threshold.
+    # NOTE(review): the threshold counts every key of result_dict, not only the
+    # "page_" ones - confirm that this is intended.
+    watermark_threshold = len(result_dict) // 2
+    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
+
+    margin_note_threshold = len(result_dict) // 2
+    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
+
+    # Write the flags back onto every block.
+    for page_id, blocks in result_dict.items():
+        if not page_id.startswith("page_"):
+            continue
+        for block_id, block_data in blocks.items():
+            # Skip non-block entries, consistent with the counting pass above.
+            if not block_id.startswith("block_"):
+                continue
+            coordinates_text = (tuple(block_data["bbox"]), block_data["text"])
+            block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
+            block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0
+
+    return result_dict
+
+
+"""
+1. 当一个block里全部文字都不是dir=(1,0)，这个block整体去掉
+2. 当一个block里全部文字都是dir=(1,0)，但是每行只有一个字，这个block整体去掉。这个block必须出现在页面的四周，否则不去掉
+"""
+import re
+
+def __is_a_word(sentence):
+    """
+    Return True when ``sentence`` is a single CJK character (U+4E00..U+9FA5), or
+    consists of one or two ASCII letters/digits; False otherwise.
+    """
+    # Exactly one Chinese character.
+    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence) is not None:
+        return True
+    # Only ASCII letters/digits, at most two of them.
+    is_alnum = re.fullmatch(r'[a-zA-Z0-9]+', sentence) is not None
+    return is_alnum and len(sentence) <= 2
+
+
+def __get_text_color(num):
+    """Split a packed integer colour into its (red, green, blue) byte components."""
+    # Red is bits 16-23, green bits 8-15, blue bits 0-7.
+    return (num >> 16) & 255, (num >> 8) & 255, num & 255
+
+
+def __is_empty_side_box(text_block):
+    """
+    Tell whether a text block carries no visible content.
+
+    Returns False as soon as one span has non-blank text whose colour is not
+    pure white (255, 255, 255); returns True when no such span exists.
+    """
+    all_spans = (span for line in text_block['lines'] for span in line['spans'])
+    for span in all_spans:
+        rgb = __get_text_color(span['color'])
+        has_text = len(span['text'].strip()) > 0
+        if has_text and rgb != (255, 255, 255):
+            return False
+
+    return True
+
+
+def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
+    """
+    Remove vertical and rotated text blocks located at the sides of the page.
+
+    Only blocks accepted by ``is_vbox_on_side(bbox, page_width, page_height, 0.2)``
+    are considered. Such a block is removed and tagged
+
+    * ``VERTICAL_TEXT`` when it has more than one line, every line holds exactly
+      one span that ``__is_a_word`` accepts, and the integer x0 of all those
+      spans is identical (one character per line, stacked vertically);
+    * ``ROTATE_TEXT`` when at least one of its lines has a "dir" other than (1, 0).
+
+    Args:
+        pymu_text_block (list[dict]): Text blocks; modified in place.
+            Format reference: test/assets/papre/pymu_textblocks.json
+        page_width: Page width.
+        page_height: Page height.
+
+    Returns:
+        tuple: (remaining blocks, removed blocks carrying their tag).
+    """
+    removed_text_block = []
+
+    for block in pymu_text_block:
+        lines = block['lines']
+        # Only boxes at the sides of the page are candidates.
+        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
+            continue
+
+        first_spans_are_words = all([__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans']) > 0])
+        if first_spans_are_words and len(lines) > 1 and all([len(line['spans']) == 1 for line in lines]):
+            # Stacked vertically means: at least two spans sharing the same integer x0.
+            first_x0s = [int(line['spans'][0]['bbox'][0]) for line in lines if len(line['spans']) > 0]
+            if len(set(first_x0s)) == 1 and len(first_x0s) > 1:
+                block['tag'] = VERTICAL_TEXT
+                removed_text_block.append(block)
+                continue
+
+        # A single non-horizontal line is enough to drop the whole block.
+        if any(line['dir'] != (1, 0) for line in lines):
+            block['tag'] = ROTATE_TEXT
+            removed_text_block.append(block)
+
+    for block in removed_text_block:
+        pymu_text_block.remove(block)
+
+    return pymu_text_block, removed_text_block
+
+def get_side_boundry(rotate_bbox, page_width, page_height):
+    """
+    Derive the left and right body-text boundaries of the page from ``rotate_bbox``.
+
+    Boxes whose right edge lies left of the page centre push the left boundary
+    to their right edge; all other boxes pull the right boundary to their left
+    edge. The result is moved inwards by 1 on both sides. ``page_height`` is
+    not used.
+
+    Returns:
+        tuple: (left_x + 1, right_x - 1); (1, page_width - 1) when there are no boxes.
+    """
+    half_width = page_width / 2
+    boxes = [item['bbox'] for item in rotate_bbox]
+
+    left_candidates = [box[2] for box in boxes if box[2] < half_width]
+    right_candidates = [box[0] for box in boxes if not box[2] < half_width]
+
+    left_x = max([0] + left_candidates)
+    right_x = min([page_width] + right_candidates)
+
+    return left_x + 1, right_x - 1
+
+
+def remove_side_blank_block(pymu_text_block, page_width, page_height):
+    """
+    Remove blank blocks located at the sides of the page.
+
+    A block is removed and tagged ``EMPTY_SIDE_BLOCK`` when
+    ``is_vbox_on_side(bbox, page_width, page_height, 0.2)`` accepts it and
+    ``__is_empty_side_box`` reports that it has no visible content.
+
+    Args:
+        pymu_text_block (list[dict]): Text blocks; modified in place.
+            Format reference: test/assets/papre/pymu_textblocks.json
+        page_width: Page width.
+        page_height: Page height.
+
+    Returns:
+        tuple: (remaining blocks, removed blocks carrying their tag).
+    """
+    removed_text_block = []
+
+    for block in pymu_text_block:
+        # Only boxes at the sides of the page are candidates.
+        on_side = is_vbox_on_side(block['bbox'], page_width, page_height, 0.2)
+        if on_side and __is_empty_side_box(block):
+            block['tag'] = EMPTY_SIDE_BLOCK
+            removed_text_block.append(block)
+
+    for block in removed_text_block:
+        pymu_text_block.remove(block)
+
+    return pymu_text_block, removed_text_block
\ No newline at end of file