Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...
Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
2df265c8 · zhougaofeng · 826086d2 · 2df265c8 · 2df265c8 · 2df265c8
Commit 2df265c8 authored Nov 12, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/libs/Constants.py
+++ b/magic_pdf/libs/Constants.py
+"""
+Custom fields at the span level.
+"""
+# Key marking whether a span was merged across pages
+CROSS_PAGE = "cross_page"
+
+# The bare string below is a section marker; it reads
+# "custom fields at the block level".
+"""
+block维度自定义字段
+"""
+# Key marking whether the lines of a block have been deleted
+LINES_DELETED = "lines_deleted"
+
+# table recognition max time default value
+# NOTE(review): the time unit is not visible in this file — confirm with callers.
+TABLE_MAX_TIME_VALUE = 400
+
+# pp_table_result_max_length
+TABLE_MAX_LEN = 480
+
+# table master structure dict (file name)
+TABLE_MASTER_DICT = "table_master_structure_dict.txt"
+
+# table master dir
+TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
+
+# pp detect model dir
+DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
+
+# pp rec model dir
+REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
+
+# pp rec char dict path
+REC_CHAR_DICT = "ppocr_keys_v1.txt"
+
+# pp rec copy rec directory
+PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
+
+# pp rec copy det directory
+PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
+
+class MODEL_NAME:
+    """Namespace of the string identifiers used to select models by name."""
+
+    # Table-structure models
+    TABLE_MASTER = "tablemaster"  # pp table structure algorithm
+    STRUCT_EQTABLE = "struct_eqtable"  # struct eqtable
+
+    # Remaining model identifiers
+    DocLayout_YOLO = "doclayout_yolo"
+    LAYOUTLMv3 = "layoutlmv3"
+    YOLO_V8_MFD = "yolo_v8_mfd"
+    UniMerNet_v2_Small = "unimernet_small"
\ No newline at end of file
--- a/magic_pdf/libs/MakeContentConfig.py
+++ b/magic_pdf/libs/MakeContentConfig.py
+class MakeMode:
+    """String constants naming the available content-making modes."""
+
+    MM_MD = "mm_markdown"
+    NLP_MD = "nlp_markdown"
+    STANDARD_FORMAT = "standard_format"
+
+
+class DropMode:
+    """String constants naming the available drop modes."""
+
+    WHOLE_PDF = "whole_pdf"
+    SINGLE_PAGE = "single_page"
+    NONE = "none"
+    NONE_WITH_REASON = "none_with_reason"
--- a/magic_pdf/libs/ModelBlockTypeEnum.py
+++ b/magic_pdf/libs/ModelBlockTypeEnum.py
+from enum import Enum
+
+class ModelBlockTypeEnum(Enum):
+    """Integer codes for the block categories handled by this package.
+
+    The numbering is sparse (0, 1, 2, 8, 13, 14); the values are kept exactly
+    as defined because callers may compare against the raw integers.
+    """
+
+    TITLE = 0
+    PLAIN_TEXT = 1
+    ABANDON = 2
+    ISOLATE_FORMULA = 8
+    EMBEDDING = 13
+    ISOLATED = 14
\ No newline at end of file
--- a/magic_pdf/libs/__init__.py
+++ b/magic_pdf/libs/__init__.py
--- a/magic_pdf/libs/boxbase.py
+++ b/magic_pdf/libs/boxbase.py
+import math
+
+
+def _is_in_or_part_overlap(box1, box2) -> bool:
+    """Return True when two bboxes overlap partially or one contains the other.
+
+    Both boxes are ``(x0, y0, x1, y1)`` sequences. Boxes that only share an
+    edge or a corner count as overlapping, because only a strict gap along
+    one axis is rejected. ``None`` for either box yields ``False``.
+    """
+    if box1 is None or box2 is None:
+        return False
+
+    ax0, ay0, ax1, ay1 = box1
+    bx0, by0, bx1, by1 = box2
+
+    # A gap along either axis rules out any intersection.
+    separated_x = ax1 < bx0 or ax0 > bx1  # box1 left of / right of box2
+    separated_y = ay1 < by0 or ay0 > by1  # box1 above / below box2
+    return not (separated_x or separated_y)
+
+
+def _is_in_or_part_overlap_with_area_ratio(box1,
+                                           box2,
+                                           area_ratio_threshold=0.6):
+    """Check whether box2 covers more than a given share of box1's area.
+
+    Args:
+        box1: ``(x0, y0, x1, y1)`` box whose area is the denominator, or None.
+        box2: ``(x0, y0, x1, y1)`` box tested against ``box1``, or None.
+        area_ratio_threshold: The overlap area divided by box1's area must be
+            strictly greater than this value.
+
+    Returns:
+        bool: True when the boxes overlap (or one contains the other) and the
+        overlap ratio exceeds the threshold. False when either box is None,
+        when they do not overlap, or when box1 has zero area.
+    """
+    if box1 is None or box2 is None:
+        return False
+
+    x0_1, y0_1, x1_1, y1_1 = box1
+    x0_2, y0_2, x1_2, y1_2 = box2
+
+    if not _is_in_or_part_overlap(box1, box2):
+        return False
+
+    box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
+    if box1_area == 0:
+        # A degenerate box1 (zero width or height) has no defined ratio;
+        # report "no" instead of raising ZeroDivisionError.
+        return False
+
+    # Area of the intersection rectangle.
+    x_left = max(x0_1, x0_2)
+    y_top = max(y0_1, y0_2)
+    x_right = min(x1_1, x1_2)
+    y_bottom = min(y1_1, y1_2)
+    overlap_area = (x_right - x_left) * (y_bottom - y_top)
+
+    return overlap_area / box1_area > area_ratio_threshold
+
+
+def _is_in(box1, box2) -> bool:
+    """Return True when box1 lies entirely inside box2 (shared edges allowed).
+
+    Both arguments are ``(x0, y0, x1, y1)`` sequences. There is no ``None``
+    handling here: passing ``None`` raises when the box is unpacked.
+    """
+    inner_x0, inner_y0, inner_x1, inner_y1 = box1
+    outer_x0, outer_y0, outer_x1, outer_y1 = box2
+
+    within_x = outer_x0 <= inner_x0 and inner_x1 <= outer_x1
+    within_y = outer_y0 <= inner_y0 and inner_y1 <= outer_y1
+    return within_x and within_y
+
+
+def _is_part_overlap(box1, box2) -> bool:
+    """Return True when the boxes overlap but box1 is not fully inside box2.
+
+    Only containment of box1 within box2 is excluded; the case where box2
+    sits entirely inside box1 still returns True. ``None`` for either box
+    yields ``False``.
+    """
+    if box1 is None or box2 is None:
+        return False
+
+    if not _is_in_or_part_overlap(box1, box2):
+        return False
+    return not _is_in(box1, box2)
+
+
+def _left_intersect(left_box, right_box):
+    """Check whether right_box's left edge cuts into left_box.
+
+    True when right_box's ``x0`` lies strictly between left_box's ``x0`` and
+    ``x1`` and at least one of right_box's ``y0`` / ``y1`` falls within
+    left_box's vertical range (inclusive). ``None`` for either box yields
+    ``False``.
+    """
+    if left_box is None or right_box is None:
+        return False
+
+    lx0, ly0, lx1, ly1 = left_box
+    rx0, ry0, _, ry1 = right_box
+
+    edge_inside_x = lx0 < rx0 < lx1
+    top_inside_y = ly0 <= ry0 <= ly1
+    bottom_inside_y = ly0 <= ry1 <= ly1
+    return edge_inside_x and (top_inside_y or bottom_inside_y)
+
+
+def _right_intersect(left_box, right_box):
+    """Check whether right_box's right edge cuts into left_box.
+
+    True when right_box's ``x1`` lies strictly between left_box's ``x0`` and
+    ``x1`` and at least one of right_box's ``y0`` / ``y1`` falls within
+    left_box's vertical range (inclusive). ``None`` for either box yields
+    ``False``.
+    """
+    if left_box is None or right_box is None:
+        return False
+
+    lx0, ly0, lx1, ly1 = left_box
+    _, ry0, rx1, ry1 = right_box
+
+    edge_inside_x = lx0 < rx1 < lx1
+    top_inside_y = ly0 <= ry0 <= ly1
+    bottom_inside_y = ly0 <= ry1 <= ly1
+    return edge_inside_x and (top_inside_y or bottom_inside_y)
+
+
+def _is_vertical_full_overlap(box1, box2, x_torlence=2):
+    """Check for full horizontal containment plus any vertical overlap.
+
+    Along x, one box must span the other (in either direction), with each
+    side allowed to fall short by ``x_torlence``. Along y the two boxes only
+    need to intersect; touching counts. Boxes are ``(x0, y0, x1, y1)``.
+    """
+    ax0, ay0, ax1, ay1 = box1
+    bx0, by0, bx1, by1 = box2
+
+    # Horizontal containment, tested in both directions.
+    box1_spans_box2 = ax0 - x_torlence <= bx0 and ax1 + x_torlence >= bx1
+    box2_spans_box1 = bx0 - x_torlence <= ax0 and bx1 + x_torlence >= ax1
+
+    # Vertical ranges intersect unless one ends before the other begins.
+    y_ranges_meet = not (ay1 < by0 or ay0 > by1)
+
+    return (box1_spans_box2 or box2_spans_box1) and y_ranges_meet
+
+
+def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
+    """Check whether box1's bottom slightly overlaps box2's top.
+
+    The vertical overlap ``box1.y1 - box2.y0`` must be strictly between 0 and
+    ``y_tolerance``. Along x, one box must contain both left and right edges
+    of the other, with a fixed slack of 2 on each side. ``None`` for either
+    box yields ``False``.
+    """
+    if box1 is None or box2 is None:
+        return False
+
+    ax0, _, ax1, a_bottom = box1
+    bx0, b_top, bx1, _ = box2
+    margin = 2  # fixed horizontal slack, independent of y_tolerance
+
+    def _within(value, low, high):
+        # True when value lies in [low - margin, high + margin].
+        return low - margin <= value <= high + margin
+
+    box2_inside_box1_x = _within(bx0, ax0, ax1) and _within(bx1, ax0, ax1)
+    box1_inside_box2_x = _within(ax0, bx0, bx1) and _within(ax1, bx0, bx1)
+
+    overlap_height = a_bottom - b_top
+    slight_overlap = b_top < a_bottom and 0 < overlap_height < y_tolerance
+    return slight_overlap and (box2_inside_box1_x or box1_inside_box2_x)
+
+
+def _is_left_overlap(
+    box1,
+    box2,
+):
+    """Check whether box2's left edge falls inside box1 with enough y overlap.
+
+    True when box2's ``x0`` lies within box1's horizontal range (inclusive)
+    and the shared vertical extent covers at least half the height of either
+    box. The vertical order of the two boxes does not matter. ``None`` for
+    either box yields ``False``.
+    """
+    if box1 is None or box2 is None:
+        return False
+
+    ax0, ay0, ax1, ay1 = box1
+    bx0, by0, _, by1 = box2
+
+    # Length of the shared vertical extent, clamped at zero.
+    shared_height = max(0, min(ay1, by1) - max(ay0, by0))
+    height1 = ay1 - ay0
+    height2 = by1 - by0
+
+    # A zero-height box contributes a ratio of 0 rather than dividing by zero.
+    share_of_box1 = 1.0 * shared_height / height1 if height1 != 0 else 0
+    share_of_box2 = 1.0 * shared_height / height2 if height2 != 0 else 0
+    enough_y_overlap = share_of_box1 >= 0.5 or share_of_box2 >= 0.5
+
+    return ax0 <= bx0 <= ax1 and enough_y_overlap
+
+
+def __is_overlaps_y_exceeds_threshold(bbox1,
+                                      bbox2,
+                                      overlap_ratio_threshold=0.8):
+    """Check whether the y-axis overlap dominates the shorter of two boxes.
+
+    Args:
+        bbox1: ``(x0, y0, x1, y1)`` box; only the y values are used.
+        bbox2: ``(x0, y0, x1, y1)`` box; only the y values are used.
+        overlap_ratio_threshold: The overlap height divided by the smaller
+            box height must be strictly greater than this value.
+
+    Returns:
+        bool: True when the ratio exceeds the threshold. False otherwise,
+        including when the shorter box has zero height.
+    """
+    _, y0_1, _, y1_1 = bbox1
+    _, y0_2, _, y1_2 = bbox2
+
+    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
+    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
+
+    if min_height == 0:
+        # A zero-height box has no meaningful overlap ratio; report "no"
+        # instead of raising ZeroDivisionError.
+        return False
+
+    return (overlap / min_height) > overlap_ratio_threshold
+
+
+def calculate_iou(bbox1, bbox2):
+    """Compute the intersection-over-union (IoU) of two bounding boxes.
+
+    Args:
+        bbox1 (list[float]): First box as ``[x1, y1, x2, y2]``, where
+            ``(x1, y1)`` is the top-left and ``(x2, y2)`` the bottom-right corner.
+        bbox2 (list[float]): Second box, same format as ``bbox1``.
+
+    Returns:
+        float: The IoU in ``[0, 1]``. ``0.0`` is returned when the boxes do
+        not intersect and also when the union has zero area (both boxes
+        degenerate).
+    """
+    # Corners of the intersection rectangle.
+    x_left = max(bbox1[0], bbox2[0])
+    y_top = max(bbox1[1], bbox2[1])
+    x_right = min(bbox1[2], bbox2[2])
+    y_bottom = min(bbox1[3], bbox2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    intersection_area = (x_right - x_left) * (y_bottom - y_top)
+
+    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+
+    # Union = sum of both areas minus the part counted twice.
+    union_area = float(bbox1_area + bbox2_area - intersection_area)
+    if union_area == 0:
+        # Two coincident zero-area boxes pass the check above but have no
+        # area to compare; avoid ZeroDivisionError.
+        return 0.0
+
+    return intersection_area / union_area
+
+
+def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
+    """Return the overlap area divided by the area of the smaller box.
+
+    Boxes are indexed as ``[x0, y0, x1, y1]``. The result is ``0.0`` when the
+    boxes do not intersect and ``0`` when the smaller box has zero area.
+    """
+    left, top = max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1])
+    right, bottom = min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
+
+    # An inverted intersection rectangle means the boxes are disjoint.
+    if right < left or bottom < top:
+        return 0.0
+
+    overlap = (right - left) * (bottom - top)
+    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    area2 = (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])
+    smaller_area = min([area1, area2])
+
+    if smaller_area == 0:
+        return 0
+    return overlap / smaller_area
+
+
+def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
+    """Return the overlap area divided by the area of ``bbox1``.
+
+    Boxes are indexed as ``[x0, y0, x1, y1]``. The result is ``0.0`` when the
+    boxes do not intersect and ``0`` when ``bbox1`` has zero area.
+    """
+    left, top = max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1])
+    right, bottom = min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
+
+    # An inverted intersection rectangle means the boxes are disjoint.
+    if right < left or bottom < top:
+        return 0.0
+
+    overlap = (right - left) * (bottom - top)
+    reference_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+
+    if reference_area == 0:
+        return 0
+    return overlap / reference_area
+
+
+def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
+    """Return the smaller box when the overlap ratio exceeds ``ratio``.
+
+    The ratio comes from ``calculate_overlap_area_2_minbox_area_ratio`` (the
+    overlap area over the smaller box's area). When it is strictly greater
+    than ``ratio`` the box with the smaller area is returned (``bbox1`` on a
+    tie); otherwise ``None``.
+    """
+    ax0, ay0, ax1, ay1 = bbox1
+    bx0, by0, bx1, by1 = bbox2
+    first_area = (ax1 - ax0) * (ay1 - ay0)
+    second_area = (bx1 - bx0) * (by1 - by0)
+
+    exceeds = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) > ratio
+    if not exceeds:
+        return None
+    return bbox1 if first_area <= second_area else bbox2
+
+
+def get_bbox_in_boundary(bboxes: list, boundary: tuple) -> list:
+    """Return the boxes from ``bboxes`` that lie entirely inside ``boundary``.
+
+    ``boundary`` is ``(x0, y0, x1, y1)``; boxes touching the boundary edges
+    are kept. The input order is preserved and a new list is returned.
+    """
+    x0, y0, x1, y1 = boundary
+
+    def _fits(box):
+        return box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1
+
+    return list(filter(_fits, bboxes))
+
+
+def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
+    """Tell whether a bbox sits in the left or right margin band of a page.
+
+    The box counts as "on the side" when its right edge is within the leftmost
+    ``side_threshold`` fraction of ``width``, or its left edge is within the
+    rightmost ``side_threshold`` fraction. ``height`` is accepted but not used.
+    """
+    left_edge, right_edge = bbox[0], bbox[2]
+    in_left_band = right_edge <= width * side_threshold
+    in_right_band = left_edge >= width * (1 - side_threshold)
+    return in_left_band or in_right_band
+
+
+def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """Find the nearest block located above ``obj_bbox``.
+
+    Each item of ``pymu_blocks`` is read through its ``'bbox'`` entry, indexed
+    as ``[x0, y0, x1, y1]``. A block qualifies when its bottom edge is no more
+    than 4 units below ``obj_bbox``'s top edge, it is not fully inside
+    ``obj_bbox``, and the two boxes are horizontally related (an x edge of one
+    falls within the other's x range, with 4 units of slack).
+
+    Returns:
+        The qualifying block with the largest ``y1`` (the earliest one on a
+        tie), or ``None`` when nothing qualifies.
+    """
+    margin = 4
+
+    def _x_related(block_bbox):
+        bx0, bx1 = block_bbox[0], block_bbox[2]
+        return any((
+            obj_bbox[0] - margin <= bx0 <= obj_bbox[2] + margin,
+            obj_bbox[0] - margin <= bx1 <= obj_bbox[2] + margin,
+            bx0 - margin <= obj_bbox[0] <= bx1 + margin,
+            bx0 - margin <= obj_bbox[2] <= bx1 + margin,
+        ))
+
+    # Blocks above the object (within the slack) that it does not contain.
+    above = [
+        blk for blk in pymu_blocks
+        if obj_bbox[1] - blk['bbox'][3] >= -margin
+        and not _is_in(blk['bbox'], obj_bbox)
+    ]
+    candidates = [blk for blk in above if _x_related(blk['bbox'])]
+
+    if not candidates:
+        return None
+    # The closest block above is the one whose bottom edge is lowest.
+    return max(candidates, key=lambda blk: blk['bbox'][3])
+
+
+def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """Find the nearest block located below ``obj_bbox``.
+
+    Each item of ``pymu_blocks`` is read through its ``'bbox'`` entry, indexed
+    as ``[x0, y0, x1, y1]``. A block qualifies when its top edge is no more
+    than 2 units above ``obj_bbox``'s bottom edge, it is not fully inside
+    ``obj_bbox``, and the two boxes are horizontally related (an x edge of one
+    falls within the other's x range, with 2 units of slack).
+
+    Returns:
+        The qualifying block with the smallest ``y0`` (the earliest one on a
+        tie), or ``None`` when nothing qualifies.
+    """
+    margin = 2
+
+    def _x_related(block_bbox):
+        bx0, bx1 = block_bbox[0], block_bbox[2]
+        return any((
+            obj_bbox[0] - margin <= bx0 <= obj_bbox[2] + margin,
+            obj_bbox[0] - margin <= bx1 <= obj_bbox[2] + margin,
+            bx0 - margin <= obj_bbox[0] <= bx1 + margin,
+            bx0 - margin <= obj_bbox[2] <= bx1 + margin,
+        ))
+
+    # Blocks below the object (within the slack) that it does not contain.
+    below = [
+        blk for blk in pymu_blocks
+        if blk['bbox'][1] - obj_bbox[3] >= -margin
+        and not _is_in(blk['bbox'], obj_bbox)
+    ]
+    candidates = [blk for blk in below if _x_related(blk['bbox'])]
+
+    if not candidates:
+        return None
+    # The closest block below is the one whose top edge is highest.
+    return min(candidates, key=lambda blk: blk['bbox'][1])
+
+
+def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """Find the nearest block located to the left of ``obj_bbox``.
+
+    Each item of ``pymu_blocks`` is read through its ``'bbox'`` entry, indexed
+    as ``[x0, y0, x1, y1]``. A block qualifies when its right edge is no more
+    than 2 units past ``obj_bbox``'s left edge, it is not fully inside
+    ``obj_bbox``, and the two boxes are vertically related (a y edge of one
+    falls within the other's y range, with 2 units of slack).
+
+    Returns:
+        The qualifying block with the largest ``x1`` (the earliest one on a
+        tie), or ``None`` when nothing qualifies.
+    """
+    margin = 2
+
+    def _y_related(block_bbox):
+        by0, by1 = block_bbox[1], block_bbox[3]
+        return any((
+            obj_bbox[1] - margin <= by0 <= obj_bbox[3] + margin,
+            obj_bbox[1] - margin <= by1 <= obj_bbox[3] + margin,
+            by0 - margin <= obj_bbox[1] <= by1 + margin,
+            by0 - margin <= obj_bbox[3] <= by1 + margin,
+        ))
+
+    # Blocks left of the object (within the slack) that it does not contain.
+    leftward = [
+        blk for blk in pymu_blocks
+        if obj_bbox[0] - blk['bbox'][2] >= -margin
+        and not _is_in(blk['bbox'], obj_bbox)
+    ]
+    candidates = [blk for blk in leftward if _y_related(blk['bbox'])]
+
+    if not candidates:
+        return None
+    # The closest block on the left has the right-most right edge.
+    return max(candidates, key=lambda blk: blk['bbox'][2])
+
+
+def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """Find the nearest block located to the right of ``obj_bbox``.
+
+    Each item of ``pymu_blocks`` is read through its ``'bbox'`` entry, indexed
+    as ``[x0, y0, x1, y1]``. A block qualifies when its left edge is no more
+    than 2 units before ``obj_bbox``'s right edge, it is not fully inside
+    ``obj_bbox``, and the two boxes are vertically related (a y edge of one
+    falls within the other's y range, with 2 units of slack).
+
+    Returns:
+        The qualifying block with the smallest ``x0`` (the earliest one on a
+        tie), or ``None`` when nothing qualifies.
+    """
+    margin = 2
+
+    def _y_related(block_bbox):
+        by0, by1 = block_bbox[1], block_bbox[3]
+        return any((
+            obj_bbox[1] - margin <= by0 <= obj_bbox[3] + margin,
+            obj_bbox[1] - margin <= by1 <= obj_bbox[3] + margin,
+            by0 - margin <= obj_bbox[1] <= by1 + margin,
+            by0 - margin <= obj_bbox[3] <= by1 + margin,
+        ))
+
+    # Blocks right of the object (within the slack) that it does not contain.
+    rightward = [
+        blk for blk in pymu_blocks
+        if blk['bbox'][0] - obj_bbox[2] >= -margin
+        and not _is_in(blk['bbox'], obj_bbox)
+    ]
+    candidates = [blk for blk in rightward if _y_related(blk['bbox'])]
+
+    if not candidates:
+        return None
+    # The closest block on the right has the left-most left edge.
+    return min(candidates, key=lambda blk: blk['bbox'][0])
+
+
+def bbox_relative_pos(bbox1, bbox2):
+    """Describe on which sides the two rectangles are strictly separated.
+
+    Args:
+        bbox1: 4-tuple ``(x1, y1, x1b, y1b)`` giving the top-left and
+            bottom-right corners of the first rectangle.
+        bbox2: 4-tuple ``(x2, y2, x2b, y2b)`` for the second rectangle.
+
+    Returns:
+        A 4-tuple of booleans ``(left, right, bottom, top)``:
+        ``left`` is True when bbox2 ends before bbox1 starts on x,
+        ``right`` when bbox1 ends before bbox2 starts on x,
+        ``bottom`` when bbox2 ends before bbox1 starts on y, and
+        ``top`` when bbox1 ends before bbox2 starts on y.
+    """
+    ax0, ay0, ax1, ay1 = bbox1
+    bx0, by0, bx1, by1 = bbox2
+
+    return (bx1 < ax0, ax1 < bx0, by1 < ay0, ay1 < by0)
+
+
+def bbox_distance(bbox1, bbox2):
+    """Compute the gap between two rectangles.
+
+    Args:
+        bbox1 (tuple): First rectangle as ``(x1, y1, x2, y2)``, where
+            ``(x1, y1)`` is the top-left and ``(x2, y2)`` the bottom-right corner.
+        bbox2 (tuple): Second rectangle, same format.
+
+    Returns:
+        float: The distance between the nearest corners when the rectangles
+        are separated on both axes, the gap along the separating axis when
+        they are separated on one axis only, and ``0.0`` when they overlap
+        or touch.
+    """
+
+    def _corner_gap(dx, dy):
+        return math.sqrt(dx**2 + dy**2)
+
+    x1, y1, x1b, y1b = bbox1
+    x2, y2, x2b, y2b = bbox2
+
+    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
+
+    # Separated on both axes: measure between the two facing corners.
+    if top and left:
+        return _corner_gap(x1 - x2b, y1b - y2)
+    if left and bottom:
+        return _corner_gap(x1 - x2b, y1 - y2b)
+    if bottom and right:
+        return _corner_gap(x1b - x2, y1 - y2b)
+    if right and top:
+        return _corner_gap(x1b - x2, y1b - y2)
+
+    # Separated on one axis only: the gap along that axis.
+    if left:
+        return x1 - x2b
+    if right:
+        return x2 - x1b
+    if bottom:
+        return y1 - y2b
+    if top:
+        return y2 - y1b
+
+    # No separation on either axis.
+    return 0.0
+
+
+def box_area(bbox):
+    """Return the area of a box indexed as ``[x0, y0, x1, y1]``."""
+    width = bbox[2] - bbox[0]
+    height = bbox[3] - bbox[1]
+    return width * height
+
+
+def get_overlap_area(bbox1, bbox2):
+    """Return the area of the intersection of two boxes.
+
+    Boxes are indexed as ``[x0, y0, x1, y1]``. The result is an absolute area
+    (not a ratio); ``0.0`` is returned when the boxes do not intersect.
+    """
+    left, top = max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1])
+    right, bottom = min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
+
+    # An inverted intersection rectangle means the boxes are disjoint.
+    if right < left or bottom < top:
+        return 0.0
+
+    return (right - left) * (bottom - top)
+
+
+def calculate_vertical_projection_overlap_ratio(block1, block2):
+    """
+    Measure how much of block1's x-extent is shared with block2.
+
+    Args:
+        block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
+        block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).
+
+    Returns:
+        float: Length of the shared x-range divided by block1's x-length;
+        0.0 when the x-ranges are disjoint or block1 has zero x-length.
+    """
+    left1, _, right1, _ = block1
+    left2, _, right2, _ = block2
+
+    # Shared range along x.
+    shared_left = max(left1, left2)
+    shared_right = min(right1, right2)
+    if shared_right < shared_left:
+        return 0.0
+
+    # block1's own x-length is the denominator.
+    span1 = right1 - left1
+    if span1 == 0:
+        return 0.0
+
+    return (shared_right - shared_left) / span1
--- a/magic_pdf/libs/calc_span_stats.py
+++ b/magic_pdf/libs/calc_span_stats.py
+import os
+import csv
+import json
+import pandas as pd
+from pandas import DataFrame as df
+from matplotlib import pyplot as plt
+from termcolor import cprint
+
+"""
+Execute this script in the following way:
+
+1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
+
+    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
+    
+2. Under the directory code-clean, execute the following command:
+
+    $ python -m libs.calc_span_stats
+    
+"""
+
+
+def print_green_on_red(text):
+    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
+
+
+def print_green(text):
+    print()
+    cprint(text, "green", attrs=["bold"], end="\n\n")
+
+
+def print_red(text):
+    print()
+    cprint(text, "red", attrs=["bold"], end="\n\n")
+
+
+def safe_get(dict_obj, key, default):
+    val = dict_obj.get(key)
+    if val is None:
+        return default
+    else:
+        return val
+
+
+class SpanStatsCalc:
+    """Calculate statistics of span."""
+
+    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
+        """Draw multiple figures in one figure."""
+        # make a canvas
+        fig = plt.figure(fig_num, figsize=(20, 20))
+
+        pass
+
+    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
+        """Calculate statistics per pdf_dict."""
+        span_stats = pd.DataFrame()
+
+        span_stats = []
+        span_id = 0
+        for page_id, blocks in pdf_dict.items():
+            if page_id.startswith("page_"):
+                if "para_blocks" in blocks.keys():
+                    for para_block in blocks["para_blocks"]:
+                        for line in para_block["lines"]:
+                            for span in line["spans"]:
+                                span_text = safe_get(span, "text", "")
+                                span_font_name = safe_get(span, "font", "")
+                                span_font_size = safe_get(span, "size", 0)
+                                span_font_color = safe_get(span, "color", "")
+                                span_font_flags = safe_get(span, "flags", 0)
+
+                                span_font_flags_decoded = safe_get(span, "decomposed_flags", {})
+                                span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False)
+                                span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False)
+                                span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False)
+                                span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False)
+                                span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False)
+                                span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False)
+                                span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False)
+
+                                span_stats.append(
+                                    {
+                                        "span_id": span_id,  # id of span
+                                        "page_id": page_id,  # page number of pdf
+                                        "span_text": span_text,  # text of span
+                                        "span_font_name": span_font_name,  # font name of span
+                                        "span_font_size": span_font_size,  # font size of span
+                                        "span_font_color": span_font_color,  # font color of span
+                                        "span_font_flags": span_font_flags,  # font flags of span
+                                        "span_is_superscript": int(
+                                            span_is_super_script
+                                        ),  # indicate whether the span is super script or not
+                                        "span_is_italic": int(span_is_italic),  # indicate whether the span is italic or not
+                                        "span_is_serifed": int(span_is_serifed),  # indicate whether the span is serifed or not
+                                        "span_is_sans_serifed": int(
+                                            span_is_sans_serifed
+                                        ),  # indicate whether the span is sans serifed or not
+                                        "span_is_monospaced": int(
+                                            span_is_monospaced
+                                        ),  # indicate whether the span is monospaced or not
+                                        "span_is_proportional": int(
+                                            span_is_proportional
+                                        ),  # indicate whether the span is proportional or not
+                                        "span_is_bold": int(span_is_bold),  # indicate whether the span is bold or not
+                                    }
+                                )
+
+                                span_id += 1
+
+        span_stats = pd.DataFrame(span_stats)
+        # print(span_stats)
+
+        return span_stats
+
+
+def __find_pdf_dic_files(
+    jf_name="pdf_dic.json",
+    base_code_name="code-clean",
+    tgt_base_dir_name="tmp",
+    unittest_dir_name="unittest",
+    md_dir_name="md",
+    book_names=[
+        "scihub",
+    ],  # other possible values: "zlib", "arxiv" and so on
+):
+    pdf_dict_files = []
+
+    curr_dir = os.path.dirname(__file__)
+
+    for i in range(len(curr_dir)):
+        if curr_dir[i : i + len(base_code_name)] == base_code_name:
+            base_code_dir_name = curr_dir[: i + len(base_code_name)]
+            for book_name in book_names:
+                search_dir_relative_name = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
+                if os.path.exists(base_code_dir_name):
+                    search_dir_name = os.path.join(base_code_dir_name, search_dir_relative_name)
+                    for root, dirs, files in os.walk(search_dir_name):
+                        for file in files:
+                            if file == jf_name:
+                                pdf_dict_files.append(os.path.join(root, file))
+                break
+
+    return pdf_dict_files
+
+
+def combine_span_texts(group_df, span_stats):
+    combined_span_texts = []
+    for _, row in group_df.iterrows():
+        curr_span_id = row.name
+        curr_span_text = row["span_text"]
+
+        pre_span_id = curr_span_id - 1
+        pre_span_text = span_stats.at[pre_span_id, "span_text"] if pre_span_id in span_stats.index else ""
+
+        next_span_id = curr_span_id + 1
+        next_span_text = span_stats.at[next_span_id, "span_text"] if next_span_id in span_stats.index else ""
+
+        # pointer_sign is a right arrow if the span is superscript, otherwise it is a down arrow
+        pointer_sign = "→ → → "
+        combined_text = "\n".join([pointer_sign + pre_span_text, pointer_sign + curr_span_text, pointer_sign + next_span_text])
+        combined_span_texts.append(combined_text)
+
+    return "\n\n".join(combined_span_texts)
+
+
+# pd.set_option("display.max_colwidth", None)  # set to None to show the full text
+pd.set_option("display.max_rows", None)  # set to None to show more rows
+
+
+def main():
+    pdf_dict_files = __find_pdf_dic_files()
+    # print(pdf_dict_files)
+
+    span_stats_calc = SpanStatsCalc()
+
+    for pdf_dict_file in pdf_dict_files:
+        print("-" * 100)
+        print_green_on_red(f"Processing {pdf_dict_file}")
+
+        with open(pdf_dict_file, "r", encoding="utf-8") as f:
+            pdf_dict = json.load(f)
+
+            raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
+            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
+            raw_df.to_csv(save_path, index=False)
+
+            filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
+            if filtered_df.empty:
+                print("No superscript span found!")
+                continue
+
+            filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
+
+            combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore
+
+            final_df = filtered_grouped_df.size().reset_index(name="count")
+            final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
+
+            print(final_df)
+
+            final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
+
+            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
+            # 使用 UTF-8 编码并添加 BOM，确保所有字段被双引号包围
+            final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
+
+            # 创建一个 2x2 的图表布局
+            fig, axs = plt.subplots(2, 2, figsize=(15, 10))
+
+            # 按照 span_font_name 分类作图
+            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
+
+            # 按照 span_font_size 分类作图
+            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
+
+            # 按照 span_font_color 分类作图
+            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
+
+            # 按照 span_font_name、span_font_size 和 span_font_color 共同分类作图
+            grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
+            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
+
+            # 调整布局
+            plt.tight_layout()
+
+            # 显示图表
+            # plt.show()
+
+            # 保存图表到 PNG 文件
+            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
+            plt.savefig(save_path)
+
+            # 清除画布
+            plt.clf()
+
+
+# Run the span statistics only when this module is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/magic_pdf/libs/clean_memory.py
+++ b/magic_pdf/libs/clean_memory.py
+# Copyright (c) Opendatalab. All rights reserved.
+import torch
+import gc
+
+
+def clean_memory():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+    gc.collect()
\ No newline at end of file
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
+import datetime
+import json
+import os, re, configparser
+import subprocess
+import time
+
+import boto3
+from loguru import logger
+from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+
+import fitz # 1.23.9中已经切换到rebase
+# import fitz_old as fitz  # 使用1.23.9之前的pymupdf库
+
+
+def get_delta_time(input_time):
+    return round(time.time() - input_time, 2)
+
+
+def join_path(*args):
+    return '/'.join(str(s).rstrip('/') for s in args)
+
+
+# Global errlog_path, configured here so the demo can reference the same value
+error_log_path = "s3://llm-pdf-text/err_logs/"
+# json_dump_path = "s3://pdf_books_temp/json_dump/" # this path is only for temporary local testing and must not be committed to main
+json_dump_path = "s3://llm-pdf-text/json_dump/"
+
+# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # the base library should not contain such paths; they belong in business code
+
+
+def get_top_percent_list(num_list, percent):
+    """
+    获取列表中前百分之多少的元素
+    :param num_list:
+    :param percent:
+    :return:
+    """
+    if len(num_list) == 0:
+        top_percent_list = []
+    else:
+        # 对imgs_len_list排序
+        sorted_imgs_len_list = sorted(num_list, reverse=True)
+        # 计算 percent 的索引
+        top_percent_index = int(len(sorted_imgs_len_list) * percent)
+        # 取前80%的元素
+        top_percent_list = sorted_imgs_len_list[:top_percent_index]
+    return top_percent_list
+
+
+def formatted_time(time_stamp):
+    dt_object = datetime.datetime.fromtimestamp(time_stamp)
+    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
+    return output_time
+
+
+def mymax(alist: list):
+    if len(alist) == 0:
+        return 0  # 空是0， 0*0也是0大小q
+    else:
+        return max(alist)
+
+def parse_aws_param(profile):
+    if isinstance(profile, str):
+        # 解析配置文件
+        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
+        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
+        config = configparser.ConfigParser()
+        config.read(credentials_file)
+        config.read(config_file)
+        # 获取 AWS 账户相关信息
+        ak = config.get(profile, "aws_access_key_id")
+        sk = config.get(profile, "aws_secret_access_key")
+        if profile == "default":
+            s3_str = config.get(f"{profile}", "s3")
+        else:
+            s3_str = config.get(f"profile {profile}", "s3")
+        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+        if end_match:
+            endpoint = end_match.group(1)
+        else:
+            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
+        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+        if style_match:
+            addressing_style = style_match.group(1)
+        else:
+            addressing_style = "path"
+    elif isinstance(profile, dict):
+        ak = profile["ak"]
+        sk = profile["sk"]
+        endpoint = profile["endpoint"]
+        addressing_style = "auto"
+
+    return ak, sk, endpoint, addressing_style
+
+
+def parse_bucket_key(s3_full_path: str):
+    """
+    输入 s3://bucket/path/to/my/file.txt
+    输出 bucket, path/to/my/file.txt
+    """
+    s3_full_path = s3_full_path.strip()
+    if s3_full_path.startswith("s3://"):
+        s3_full_path = s3_full_path[5:]
+    if s3_full_path.startswith("/"):
+        s3_full_path = s3_full_path[1:]
+    bucket, key = s3_full_path.split("/", 1)
+    return bucket, key
+
+
+def read_file(pdf_path: str, s3_profile):
+    if pdf_path.startswith("s3://"):
+        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
+        bucket_name, bucket_key = parse_bucket_key(pdf_path)
+        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
+        file_content = res["Body"].read()
+        return file_content
+    else:
+        with open(pdf_path, "rb") as f:
+            return f.read()
+
+
+def get_docx_model_output(pdf_model_output, page_id):
+
+    model_output_json = pdf_model_output[page_id]
+
+    return model_output_json
+
+
+def list_dir(dir_path:str, s3_profile:str):
+    """
+    列出dir_path下的所有文件
+    """
+    ret = []
+    
+    if dir_path.startswith("s3"):
+        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
+        bucket, path = s3info[0][0], s3info[0][1]
+        try:
+            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                                            config=Config(s3={'addressing_style': addressing_style}))
+            def list_obj_scluster():
+                marker = None
+                while True:
+                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
+                    if marker:
+                        list_kwargs['Marker'] = marker
+                    response = cli.list_objects(**list_kwargs)
+                    contents = response.get("Contents", [])
+                    yield from contents
+                    if not response.get("IsTruncated") or len(contents)==0:
+                        break
+                    marker = contents[-1]['Key']
+
+
+            for info in list_obj_scluster():
+                file_path = info['Key']
+                #size = info['Size']
+
+                if path!="":
+                    afile = file_path[len(path):]
+                    if afile.endswith(".json"):
+                        ret.append(f"s3://{bucket}/{file_path}")
+                        
+            return ret
+
+        except Exception as e:
+            logger.exception(e)
+            exit(-1)
+    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
+        
+        for root, dirs, files in os.walk(dir_path):
+            for file in files:
+                if file.endswith(".json"):
+                    ret.append(join_path(root, file))
+        ret.sort()
+        return ret
+
+def get_img_s3_client(save_path:str, image_s3_config:str):
+    """
+    """
+    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
+        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
+        img_s3_client = boto3.client(
+            service_name="s3",
+            aws_access_key_id=ak,
+            aws_secret_access_key=sk,
+            endpoint_url=end_point,
+            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
+        )
+    else:
+        img_s3_client = None
+        
+    return img_s3_client
+
+# Manual check when run as a script: list the .json files under a fixed S3
+# prefix using the "langchao" profile and print the result.
+if __name__=="__main__":
+    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
+    s3_profile = "langchao"
+    ret = list_dir(s3_path, s3_profile)
+    print(ret)
+    
\ No newline at end of file
--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
+"""根据bucket的名字返回对应的s3 AK， SK，endpoint三元组."""
+
+import json
+import os
+
+from loguru import logger
+
+from magic_pdf.libs.Constants import MODEL_NAME
+from magic_pdf.libs.commons import parse_bucket_key
+
+# Config file name constant; taken from the MINERU_TOOLS_CONFIG_JSON
+# environment variable when set, otherwise 'magic-pdf.json'.
+CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'magic-pdf.json')
+
+
+def read_config():
+    if os.path.isabs(CONFIG_FILE_NAME):
+        config_file = CONFIG_FILE_NAME
+    else:
+        home_dir = os.path.expanduser('~')
+        config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
+
+    if not os.path.exists(config_file):
+        raise FileNotFoundError(f'{config_file} not found')
+
+    with open(config_file, 'r', encoding='utf-8') as f:
+        config = json.load(f)
+    return config
+
+
+def get_s3_config(bucket_name: str):
+    """~/magic-pdf.json 读出来."""
+    config = read_config()
+
+    bucket_info = config.get('bucket_info')
+    if bucket_name not in bucket_info:
+        access_key, secret_key, storage_endpoint = bucket_info['[default]']
+    else:
+        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
+
+    if access_key is None or secret_key is None or storage_endpoint is None:
+        raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}')
+
+    # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
+
+    return access_key, secret_key, storage_endpoint
+
+
+def get_s3_config_dict(path: str):
+    access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
+    return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint}
+
+
+def get_bucket_name(path):
+    bucket, key = parse_bucket_key(path)
+    return bucket
+
+
+def get_local_models_dir():
+    config = read_config()
+    models_dir = config.get('models-dir')
+    if models_dir is None:
+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
+        return '/tmp/models'
+    else:
+        return models_dir
+
+
+def get_local_layoutreader_model_dir():
+    config = read_config()
+    layoutreader_model_dir = config.get('layoutreader-model-dir')
+    if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir):
+        home_dir = os.path.expanduser('~')
+        layoutreader_at_modelscope_dir_path = os.path.join(home_dir, '.cache/modelscope/hub/ppaanngggg/layoutreader')
+        logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default")
+        return layoutreader_at_modelscope_dir_path
+    else:
+        return layoutreader_model_dir
+
+
+def get_device():
+    config = read_config()
+    device = config.get('device-mode')
+    if device is None:
+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
+        return 'cpu'
+    else:
+        return device
+
+
+def get_table_recog_config():
+    config = read_config()
+    table_config = config.get('table-config')
+    if table_config is None:
+        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
+        return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}')
+    else:
+        return table_config
+
+
+def get_layout_config():
+    config = read_config()
+    layout_config = config.get("layout-config")
+    if layout_config is None:
+        logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
+        return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
+    else:
+        return layout_config
+
+
+def get_formula_config():
+    config = read_config()
+    formula_config = config.get("formula-config")
+    if formula_config is None:
+        logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
+        return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
+    else:
+        return formula_config
+
+
+# Manual check when run as a script: resolve the S3 settings of the "llm-raw" bucket.
+if __name__ == "__main__":
+    ak, sk, endpoint = get_s3_config("llm-raw")
--- a/magic_pdf/libs/convert_utils.py
+++ b/magic_pdf/libs/convert_utils.py
+def dict_to_list(input_dict):
+    items_list = []
+    for _, item in input_dict.items():
+        items_list.append(item)
+    return items_list
--- a/magic_pdf/libs/coordinate_transform.py
+++ b/magic_pdf/libs/coordinate_transform.py
+def get_scale_ratio(model_page_info, page):
+    pix = page.get_pixmap(dpi=72)
+    pymu_width = int(pix.w)
+    pymu_height = int(pix.h)
+    width_from_json = model_page_info['page_info']['width']
+    height_from_json = model_page_info['page_info']['height']
+    horizontal_scale_ratio = width_from_json / pymu_width
+    vertical_scale_ratio = height_from_json / pymu_height
+    return horizontal_scale_ratio, vertical_scale_ratio
--- a/magic_pdf/libs/detect_language_from_model.py
+++ b/magic_pdf/libs/detect_language_from_model.py
+from collections import Counter
+
+from magic_pdf.libs.language import detect_lang
+
+def get_language_from_model(model_list: list):
+    language_lst = []
+    for ocr_page_info in model_list:
+        page_text = ""
+        layout_dets = ocr_page_info["layout_dets"]
+        for layout_det in layout_dets:
+            category_id = layout_det["category_id"]
+            allow_category_id_list = [15]
+            if category_id in allow_category_id_list:
+                page_text += layout_det["text"]
+        page_language = detect_lang(page_text)
+        language_lst.append(page_language)
+    # 统计text_language_list中每种语言的个数
+    count_dict = Counter(language_lst)
+    # 输出text_language_list中出现的次数最多的语言
+    language = max(count_dict, key=count_dict.get)
+    return language
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.libs.commons import fitz  # PyMuPDF
+from magic_pdf.libs.Constants import CROSS_PAGE
+from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
+from magic_pdf.model.magic_model import MagicModel
+
+
+def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
+    new_rgb = []
+    for item in rgb_config:
+        item = float(item) / 255
+        new_rgb.append(item)
+    page_data = bbox_list[i]
+    for bbox in page_data:
+        x0, y0, x1, y1 = bbox
+        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+        if fill_config:
+            page.draw_rect(
+                rect_coords,
+                color=None,
+                fill=new_rgb,
+                fill_opacity=0.3,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+        else:
+            page.draw_rect(
+                rect_coords,
+                color=new_rgb,
+                fill=None,
+                fill_opacity=1,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+
+
+def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
+    new_rgb = []
+    for item in rgb_config:
+        item = float(item) / 255
+        new_rgb.append(item)
+    page_data = bbox_list[i]
+    for j, bbox in enumerate(page_data):
+        x0, y0, x1, y1 = bbox
+        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+        if draw_bbox:
+            if fill_config:
+                page.draw_rect(
+                    rect_coords,
+                    color=None,
+                    fill=new_rgb,
+                    fill_opacity=0.3,
+                    width=0.5,
+                    overlay=True,
+                )  # Draw the rectangle
+            else:
+                page.draw_rect(
+                    rect_coords,
+                    color=new_rgb,
+                    fill=None,
+                    fill_opacity=1,
+                    width=0.5,
+                    overlay=True,
+                )  # Draw the rectangle
+        page.insert_text(
+            (x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
+        )  # Insert the index in the top left corner of the rectangle
+
+
+def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
+    """Draw the layout blocks of every page onto the PDF and save a copy.
+
+    The bboxes of discarded blocks and of each kind of paragraph block are
+    collected per page, drawn as filled rectangles in a colour per kind, and
+    the blocks are numbered in the order they appear in ``para_blocks``
+    (table sub-blocks ordered caption, body, footnote). The result is saved
+    as ``{out_path}/{filename}_layout.pdf``.
+
+    Args:
+        pdf_info: Iterable of page dicts with 'discarded_blocks' and
+            'para_blocks' lists; indexed in step with the pages of the PDF.
+        pdf_bytes: Content of the PDF to draw on.
+        out_path: Directory the annotated PDF is written to.
+        filename: Base name of the output file.
+    """
+    # One entry per page in each of these lists.
+    dropped_bbox_list = []
+    tables_list, tables_body_list = [], []
+    tables_caption_list, tables_footnote_list = [], []
+    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
+    imgs_footnote_list = []
+    titles_list = []
+    texts_list = []
+    interequations_list = []
+    lists_list = []
+    indexs_list = []
+    for page in pdf_info:
+
+        page_dropped_list = []
+        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
+        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
+        titles = []
+        texts = []
+        interequations = []
+        lists = []
+        indices = []
+
+        for dropped_bbox in page['discarded_blocks']:
+            page_dropped_list.append(dropped_bbox['bbox'])
+        dropped_bbox_list.append(page_dropped_list)
+        for block in page['para_blocks']:
+            bbox = block['bbox']
+            if block['type'] == BlockType.Table:
+                tables.append(bbox)
+                # Nested blocks are sorted into body / caption / footnote.
+                for nested_block in block['blocks']:
+                    bbox = nested_block['bbox']
+                    if nested_block['type'] == BlockType.TableBody:
+                        tables_body.append(bbox)
+                    elif nested_block['type'] == BlockType.TableCaption:
+                        tables_caption.append(bbox)
+                    elif nested_block['type'] == BlockType.TableFootnote:
+                        tables_footnote.append(bbox)
+            elif block['type'] == BlockType.Image:
+                imgs.append(bbox)
+                for nested_block in block['blocks']:
+                    bbox = nested_block['bbox']
+                    if nested_block['type'] == BlockType.ImageBody:
+                        imgs_body.append(bbox)
+                    elif nested_block['type'] == BlockType.ImageCaption:
+                        imgs_caption.append(bbox)
+                    elif nested_block['type'] == BlockType.ImageFootnote:
+                        imgs_footnote.append(bbox)
+            elif block['type'] == BlockType.Title:
+                titles.append(bbox)
+            elif block['type'] == BlockType.Text:
+                texts.append(bbox)
+            elif block['type'] == BlockType.InterlineEquation:
+                interequations.append(bbox)
+            elif block['type'] == BlockType.List:
+                lists.append(bbox)
+            elif block['type'] == BlockType.Index:
+                indices.append(bbox)
+
+        # NOTE: tables_list and imgs_list are only consumed by the
+        # commented-out draw calls further down.
+        tables_list.append(tables)
+        tables_body_list.append(tables_body)
+        tables_caption_list.append(tables_caption)
+        tables_footnote_list.append(tables_footnote)
+        imgs_list.append(imgs)
+        imgs_body_list.append(imgs_body)
+        imgs_caption_list.append(imgs_caption)
+        imgs_footnote_list.append(imgs_footnote)
+        titles_list.append(titles)
+        texts_list.append(texts)
+        interequations_list.append(interequations)
+        lists_list.append(lists)
+        indexs_list.append(indices)
+
+    # Per-page bboxes in numbering order.
+    layout_bbox_list = []
+
+    # Order in which a table's sub-blocks are numbered; a sub-block type
+    # missing from this mapping raises KeyError in the sort below.
+    table_type_order = {
+        'table_caption': 1,
+        'table_body': 2,
+        'table_footnote': 3
+    }
+    for page in pdf_info:
+        page_block_list = []
+        for block in page['para_blocks']:
+            if block['type'] in [
+                BlockType.Text,
+                BlockType.Title,
+                BlockType.InterlineEquation,
+                BlockType.List,
+                BlockType.Index,
+            ]:
+                bbox = block['bbox']
+                page_block_list.append(bbox)
+            elif block['type'] in [BlockType.Image]:
+                # Image sub-blocks keep their stored order.
+                for sub_block in block['blocks']:
+                    bbox = sub_block['bbox']
+                    page_block_list.append(bbox)
+            elif block['type'] in [BlockType.Table]:
+                sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
+                for sub_block in sorted_blocks:
+                    bbox = sub_block['bbox']
+                    page_block_list.append(bbox)
+
+        layout_bbox_list.append(page_block_list)
+
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+
+    for i, page in enumerate(pdf_docs):
+
+        # Filled rectangles, one colour per block kind.
+        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
+        # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
+        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
+        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
+        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
+        # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
+        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
+        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
+        # NOTE: the trailing comma below only wraps the call's result in a discarded 1-tuple.
+        draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
+        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
+        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
+        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
+        draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
+        draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
+
+        # Numbers only (draw_bbox=False), in red.
+        draw_bbox_with_number(
+            i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
+        )
+
+    # Save the PDF
+    pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
+
+
+def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
+    """Draw span bounding boxes per content type and save ``{filename}_spans.pdf``.
+
+    Span boxes are collected page by page into text, inline-equation,
+    interline-equation, image and table groups, plus the spans of discarded
+    blocks. Text and inline-equation spans flagged with ``CROSS_PAGE`` are
+    held back and added to the following page's group instead.
+
+    Args:
+        pdf_info: iterable of per-page dicts providing ``discarded_blocks``
+            and ``para_blocks``.
+        pdf_bytes: raw PDF content handed to ``fitz.open``.
+        out_path: directory the annotated PDF is written to.
+        filename: stem used for the output file name.
+    """
+    text_list, inline_equation_list, interline_equation_list = [], [], []
+    image_list, table_list, dropped_list = [], [], []
+    # Boxes flagged CROSS_PAGE wait here until the next page is processed.
+    next_page_text_list = []
+    next_page_inline_equation_list = []
+
+    flat_block_types = [
+        BlockType.Text,
+        BlockType.Title,
+        BlockType.InterlineEquation,
+        BlockType.List,
+        BlockType.Index,
+    ]
+    nested_block_types = [BlockType.Image, BlockType.Table]
+
+    def get_span_info(span, page_boxes):
+        """Append the span's bbox to the list matching its content type."""
+        if span['type'] == ContentType.Text:
+            if span.get(CROSS_PAGE, False):
+                target = next_page_text_list
+            else:
+                target = page_boxes['text']
+        elif span['type'] == ContentType.InlineEquation:
+            if span.get(CROSS_PAGE, False):
+                target = next_page_inline_equation_list
+            else:
+                target = page_boxes['inline_equation']
+        elif span['type'] == ContentType.InterlineEquation:
+            target = page_boxes['interline_equation']
+        elif span['type'] == ContentType.Image:
+            target = page_boxes['image']
+        elif span['type'] == ContentType.Table:
+            target = page_boxes['table']
+        else:
+            return
+        target.append(span['bbox'])
+
+    for page_info in pdf_info:
+        # Start the page with whatever the previous page deferred to it.
+        page_boxes = {
+            'text': list(next_page_text_list),
+            'inline_equation': list(next_page_inline_equation_list),
+            'interline_equation': [],
+            'image': [],
+            'table': [],
+        }
+        next_page_text_list.clear()
+        next_page_inline_equation_list.clear()
+
+        # Spans belonging to discarded blocks.
+        dropped_list.append([
+            span['bbox']
+            for block in page_info['discarded_blocks']
+            if block['type'] == BlockType.Discarded
+            for line in block['lines']
+            for span in line['spans']
+        ])
+
+        # Spans of the remaining (useful) blocks.
+        for block in page_info['para_blocks']:
+            if block['type'] in flat_block_types:
+                line_holders = [block]
+            elif block['type'] in nested_block_types:
+                line_holders = block['blocks']
+            else:
+                continue
+            for holder in line_holders:
+                for line in holder['lines']:
+                    for span in line['spans']:
+                        get_span_info(span, page_boxes)
+
+        text_list.append(page_boxes['text'])
+        inline_equation_list.append(page_boxes['inline_equation'])
+        interline_equation_list.append(page_boxes['interline_equation'])
+        image_list.append(page_boxes['image'])
+        table_list.append(page_boxes['table'])
+
+    # (per-page boxes, RGB colour), in drawing order.
+    layers = (
+        (text_list, (255, 0, 0)),
+        (inline_equation_list, (0, 255, 0)),
+        (interline_equation_list, (0, 0, 255)),
+        (image_list, (255, 204, 0)),
+        (table_list, (204, 0, 255)),
+        (dropped_list, (158, 158, 158)),
+    )
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    for i, page in enumerate(pdf_docs):
+        for boxes, rgb in layers:
+            # A fresh list per call, exactly as a literal would provide.
+            draw_bbox_without_number(i, boxes, page, list(rgb), False)
+
+    # Save the PDF
+    pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
+
+
+def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
+    """Draw numbered model-detection boxes and save ``{filename}_model.pdf``.
+
+    For every page index in ``model_list`` the ``layout_dets`` returned by
+    ``MagicModel.get_model_list`` are grouped by ``category_id``; each group
+    is then drawn with ``draw_bbox_with_number`` in its own colour.
+
+    Args:
+        model_list: per-page model output, passed to ``MagicModel``.
+        pdf_bytes: raw PDF content.
+        out_path: directory the annotated PDF is written to.
+        filename: stem used for the output file name.
+    """
+    dropped_bbox_list = []
+    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
+    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
+    titles_list, texts_list, interequations_list = [], [], []
+
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
+
+    # (category id, document-level list); checked top to bottom, first match wins.
+    routing = [
+        (CategoryId.Text, texts_list),
+        (CategoryId.Title, titles_list),
+        (CategoryId.TableBody, tables_body_list),
+        (CategoryId.TableCaption, tables_caption_list),
+        (CategoryId.TableFootnote, tables_footnote_list),
+        (CategoryId.ImageBody, imgs_body_list),
+        (CategoryId.ImageCaption, imgs_caption_list),
+        (CategoryId.InterlineEquation_YOLO, interequations_list),
+        (CategoryId.Abandon, dropped_bbox_list),
+        (CategoryId.ImageFootnote, imgs_footnote_list),
+    ]
+
+    for page_idx in range(len(model_list)):
+        page_buckets = [[] for _ in routing]
+        page_info = magic_model.get_model_list(page_idx)
+        for layout_det in page_info['layout_dets']:
+            bbox = layout_det['bbox']
+            category_id = layout_det['category_id']
+            for slot, (category, _) in enumerate(routing):
+                if category_id == category:
+                    page_buckets[slot].append(bbox)
+                    break
+        # Every document-level list gets exactly one entry per page.
+        for (_, document_list), bucket in zip(routing, page_buckets):
+            document_list.append(bucket)
+
+    # (per-page boxes, RGB colour), in drawing order.
+    layers = (
+        (dropped_bbox_list, (158, 158, 158)),
+        (tables_body_list, (204, 204, 0)),
+        (tables_caption_list, (255, 255, 102)),
+        (tables_footnote_list, (229, 255, 204)),
+        (imgs_body_list, (153, 255, 51)),
+        (imgs_caption_list, (102, 178, 255)),
+        (imgs_footnote_list, (255, 178, 102)),
+        (titles_list, (102, 102, 255)),
+        (texts_list, (153, 0, 76)),
+        (interequations_list, (0, 255, 0)),
+    )
+    for i, page in enumerate(pdf_docs):
+        for boxes, rgb in layers:
+            draw_bbox_with_number(i, boxes, page, list(rgb), True)
+
+    # Save the PDF
+    pdf_docs.save(f'{out_path}/{filename}_model.pdf')
+
+
+def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
+    """Number line boxes in sorted order and save ``{filename}_line_sort.pdf``.
+
+    For each page the lines of text/title/interline-equation blocks, the
+    ``virtual_lines`` of image/table bodies and the lines of their captions
+    and footnotes are gathered from ``preproc_blocks``, ordered by each
+    line's ``index`` and drawn with ``draw_bbox_with_number``.
+
+    Args:
+        pdf_info: iterable of per-page dicts providing ``preproc_blocks``.
+        pdf_bytes: raw PDF content handed to ``fitz.open``.
+        out_path: directory the annotated PDF is written to.
+        filename: stem used for the output file name.
+    """
+    layout_bbox_list = []
+
+    for page in pdf_info:
+        page_line_list = []
+        for block in page['preproc_blocks']:
+            if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
+                lines = list(block['lines'])
+            elif block['type'] in [BlockType.Image, BlockType.Table]:
+                lines = []
+                for sub_block in block['blocks']:
+                    if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
+                        lines.extend(sub_block['virtual_lines'])
+                    elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
+                        lines.extend(sub_block['lines'])
+            else:
+                continue
+            for line in lines:
+                page_line_list.append({'index': line['index'], 'bbox': line['bbox']})
+        sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
+        # Store a real list: a generator expression here would be exhausted
+        # after one pass and would support neither len() nor indexing.
+        layout_bbox_list.append([sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes])
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    for i, page in enumerate(pdf_docs):
+        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
+
+    pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
+
+
+def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
+    """Number the bbox of every ``para_blocks`` entry and save ``{filename}_layout_sort.pdf``.
+
+    Args:
+        pdf_info: iterable of per-page dicts providing ``para_blocks``.
+        pdf_bytes: raw PDF content handed to ``fitz.open``.
+        out_path: directory the annotated PDF is written to.
+        filename: stem used for the output file name.
+    """
+    # One list of block boxes per page, in the order the blocks are stored.
+    layout_bbox_list = [
+        [para_block['bbox'] for para_block in page_info['para_blocks']]
+        for page_info in pdf_info
+    ]
+
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    for page_no, pdf_page in enumerate(pdf_docs):
+        draw_bbox_with_number(page_no, layout_bbox_list, pdf_page, [255, 0, 0], False)
+
+    pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
--- a/magic_pdf/libs/drop_reason.py
+++ b/magic_pdf/libs/drop_reason.py
+
+class DropReason:
+    """String constants naming the reason a PDF was dropped."""
+
+    # Text blocks overlap horizontally, so the reading order cannot be determined reliably.
+    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'
+    # Blocks that must be kept overlap horizontally.
+    USEFUL_BLOCK_HOR_OVERLAP = 'useful_block_horizontal_overlap'
+    # Complicated layout; not supported for now.
+    COMPLICATED_LAYOUT = 'complicated_layout'
+    # Layouts with more than 2 columns are currently not supported.
+    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'
+    # The PDF contains coloured blocks, which change the reading order; PDFs with
+    # text blocks on a coloured background are currently not supported.
+    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'
+    # Contains special images that are too expensive to process, so it is dropped.
+    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = 'high_computational_load_by_imgs'
+    # Special SVG graphics that are too expensive to process, so it is dropped.
+    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = 'high_computational_load_by_svgs'
+    # Computation exceeds capacity; the current method consumes too much.
+    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'
+    # Layout analysis failed.
+    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'
+    # An exception occurred during parsing.
+    Exception = '_exception'
+    # The PDF is encrypted.
+    ENCRYPTED = 'encrypted'
+    # The PDF has a total page count of 0.
+    EMPTY_PDF = 'total_page=0'
+    # Not a text-based PDF; it cannot be parsed directly.
+    NOT_IS_TEXT_PDF = 'not_is_text_pdf'
+    # Paragraphs cannot be separated clearly.
+    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'
+    # Title detection failed.
+    TITLE_DETECTION_FAILED = 'title_detection_failed'
+    # Title level analysis failed (e.g. level 1, 2 or 3 headings).
+    TITLE_LEVEL_FAILED = 'title_level_failed'
+    # Paragraph recognition failed.
+    PARA_SPLIT_FAILED = 'para_split_failed'
+    # Paragraph merging failed.
+    PARA_MERGE_FAILED = 'para_merge_failed'
+    # Unsupported language.
+    NOT_ALLOW_LANGUAGE = 'not_allow_language'
+    SPECIAL_PDF = 'special_pdf'
+    # Text columns cannot be determined precisely.
+    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'
+    # The page layout cannot be analysed.
+    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'
+    # Scaling produced a bbox with negative area.
+    NEGATIVE_BBOX_AREA = 'negative_bbox_area'
+    # Overlapping blocks cannot be separated.
+    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = 'overlap_blocks_can_t_separation'
+    
\ No newline at end of file
--- a/magic_pdf/libs/drop_tag.py
+++ b/magic_pdf/libs/drop_tag.py
+
+COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
+# Page number.
+PAGE_NO = 'page-no'
+# Text inside the header/footer area.
+CONTENT_IN_FOOT_OR_HEADER = "in-foot-header-area"
+# Vertical text.
+VERTICAL_TEXT = "vertical-text"
+# Rotated text.
+ROTATE_TEXT = "rotate-text"
+# Blank block at the page edge with no content at all.
+EMPTY_SIDE_BLOCK = "empty-side-block"
+# Text lying on top of an image.
+ON_IMAGE_TEXT = "on-image-text"
+# Text lying on top of a table.
+ON_TABLE_TEXT = "on-table-text"
+
+
+class DropTag:
+    """String tags attached to dropped content."""
+
+    # Page furniture.
+    PAGE_NUMBER = 'page_no'
+    HEADER = 'header'
+    FOOTER = 'footer'
+    FOOTNOTE = 'footnote'
+    # Layout / overlap related tags.
+    NOT_IN_LAYOUT = 'not_in_layout'
+    SPAN_OVERLAP = 'span_overlap'
+    BLOCK_OVERLAP = 'block_overlap'
--- a/magic_pdf/libs/hash_utils.py
+++ b/magic_pdf/libs/hash_utils.py
+import hashlib
+
+
+def compute_md5(file_bytes):
+    """Return the MD5 digest of *file_bytes* as an upper-case hex string."""
+    digest_hex = hashlib.md5(file_bytes).hexdigest()
+    return digest_hex.upper()
+
+
+def compute_sha256(input_string):
+    """Return the SHA-256 digest of *input_string* as a lower-case hex string.
+
+    The string is encoded as UTF-8 first because hash functions operate on
+    bytes, not on text.
+    """
+    return hashlib.sha256(input_string.encode('utf-8')).hexdigest()
--- a/magic_pdf/libs/json_compressor.py
+++ b/magic_pdf/libs/json_compressor.py
+import json
+import brotli
+import base64
+
+class JsonCompressor:
+    """Round-trip JSON-serialisable data through Brotli compression and Base64."""
+
+    @staticmethod
+    def compress_json(data):
+        """Serialise *data* to JSON, Brotli-compress it and return Base64 text."""
+        raw_bytes = json.dumps(data).encode('utf-8')
+        packed = brotli.compress(raw_bytes, quality=6)
+        # Base64 yields bytes; decode so callers receive a plain string.
+        return base64.b64encode(packed).decode('utf-8')
+
+    @staticmethod
+    def decompress_json(compressed_str):
+        """Decode the Base64 string, decompress it and return the parsed JSON object."""
+        # Base64 decoding works on bytes, so turn the string back into bytes first.
+        packed = base64.b64decode(compressed_str.encode('utf-8'))
+        raw_bytes = brotli.decompress(packed)
+        return json.loads(raw_bytes.decode('utf-8'))
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
+import os
+import unicodedata
+
+# Default the FTLANG_CACHE environment variable to
+# <package root>/resources/fasttext-langdetect when the caller has not set it.
+# This runs before the fast_langdetect import below — presumably that
+# package reads the variable to locate its model cache (TODO confirm).
+if not os.getenv("FTLANG_CACHE"):
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)
+    # Parent of this file's directory, i.e. one level above libs/.
+    root_dir = os.path.dirname(current_dir)
+    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
+    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
+    # print(os.getenv("FTLANG_CACHE"))
+
+from fast_langdetect import detect_language
+
+
+def detect_lang(text: str) -> str:
+    """Return the lower-cased language code detected for *text*.
+
+    Empty input yields ``""``. If detection raises, it is retried once on a
+    copy of the text with every character whose Unicode general category
+    starts with ``C`` (control, format, ...) removed. ``""`` is also
+    returned when the detector's result has no ``lower()`` method, for
+    example ``None``.
+
+    Args:
+        text: the text to classify.
+
+    Returns:
+        The detected language code in lower case, or ``""``.
+    """
+    if not text:
+        return ""
+    try:
+        lang_upper = detect_language(text)
+    except Exception:
+        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
+        # SystemExit still propagate.
+        text_no_ctrl_chars = ''.join(
+            ch for ch in text if unicodedata.category(ch)[0] != 'C'
+        )
+        lang_upper = detect_language(text_no_ctrl_chars)
+    try:
+        return lang_upper.lower()
+    except AttributeError:
+        # The detector returned something that is not string-like.
+        return ""
+
+
+if __name__ == '__main__':
+    # Manual smoke test: show the cache location, then detect a few samples.
+    print(os.getenv("FTLANG_CACHE"))
+    for sample in (
+        "This is a test.",
+        "<html>This is a test</html>",
+        "这个是中文测试。",
+        "<html>这个是中文测试。</html>",
+    ):
+        print(detect_lang(sample))
--- a/magic_pdf/libs/local_math.py
+++ b/magic_pdf/libs/local_math.py
+def float_gt(a, b):
+    """Return True when *a* exceeds *b* by more than the 0.0001 tolerance."""
+    # Values within the tolerance of each other count as equal, not greater.
+    return False if abs(a - b) <= 0.0001 else a > b
+    
+def float_equal(a, b):
+    """Return True when *a* and *b* differ by at most the 0.0001 tolerance."""
+    return True if abs(a - b) <= 0.0001 else False
\ No newline at end of file
--- a/magic_pdf/libs/markdown_utils.py
+++ b/magic_pdf/libs/markdown_utils.py
+import re
+
+
+def escape_special_markdown_char(pymu_blocks):
+    """
+    Escape characters in body text that have special meaning in Markdown.
+
+    Each of ``*``, `````, ``~`` and ``$`` in a span's ``text`` gets a leading
+    backslash. Spans whose ``_type`` is ``inline-equation`` or
+    ``interline-equation`` and spans with empty text are left untouched.
+    The blocks are modified in place and the same object is returned.
+    """
+    special_chars = ["*", "`", "~", "$"]
+    equation_types = ['inline-equation', 'interline-equation']
+    for blk in pymu_blocks:
+        for line in blk['lines']:
+            for span in line['spans']:
+                escaped = span['text']
+                if span.get("_type", None) in equation_types or not escaped:
+                    continue
+                for char in special_chars:
+                    escaped = escaped.replace(char, "\\" + char)
+                span['text'] = escaped
+
+    return pymu_blocks
+
+
+def ocr_escape_special_markdown_char(content):
+    """
+    Escape characters in body text that have special meaning in Markdown.
+
+    Returns *content* with a backslash inserted before every ``*``, `````,
+    ``~`` and ``$``.
+    """
+    # A single pass: capture any special character and re-emit it escaped.
+    return re.sub(r"([*`~$])", r"\\\1", content)