Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...
Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
826086d2 · zhougaofeng · 57aaa1cf · 57aaa1cf · 57aaa1cf · 57aaa1cf
Commit 826086d2 authored Nov 12, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
-from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.Constants import CROSS_PAGE
-from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
-from magic_pdf.model.magic_model import MagicModel
-
-
-def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
-    new_rgb = []
-    for item in rgb_config:
-        item = float(item) / 255
-        new_rgb.append(item)
-    page_data = bbox_list[i]
-    for bbox in page_data:
-        x0, y0, x1, y1 = bbox
-        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
-        if fill_config:
-            page.draw_rect(
-                rect_coords,
-                color=None,
-                fill=new_rgb,
-                fill_opacity=0.3,
-                width=0.5,
-                overlay=True,
-            )  # Draw the rectangle
-        else:
-            page.draw_rect(
-                rect_coords,
-                color=new_rgb,
-                fill=None,
-                fill_opacity=1,
-                width=0.5,
-                overlay=True,
-            )  # Draw the rectangle
-
-
-def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
-    new_rgb = []
-    for item in rgb_config:
-        item = float(item) / 255
-        new_rgb.append(item)
-    page_data = bbox_list[i]
-    for j, bbox in enumerate(page_data):
-        x0, y0, x1, y1 = bbox
-        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
-        if fill_config:
-            page.draw_rect(
-                rect_coords,
-                color=None,
-                fill=new_rgb,
-                fill_opacity=0.3,
-                width=0.5,
-                overlay=True,
-            )  # Draw the rectangle
-        else:
-            page.draw_rect(
-                rect_coords,
-                color=new_rgb,
-                fill=None,
-                fill_opacity=1,
-                width=0.5,
-                overlay=True,
-            )  # Draw the rectangle
-        page.insert_text(
-            (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
-        )  # Insert the index in the top left corner of the rectangle
-
-
-def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
-    layout_bbox_list = []
-    dropped_bbox_list = []
-    tables_list, tables_body_list = [], []
-    tables_caption_list, tables_footnote_list = [], []
-    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
-    imgs_footnote_list = []
-    titles_list = []
-    texts_list = []
-    interequations_list = []
-    for page in pdf_info:
-        page_layout_list = []
-        page_dropped_list = []
-        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
-        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
-        titles = []
-        texts = []
-        interequations = []
-        for layout in page['layout_bboxes']:
-            page_layout_list.append(layout['layout_bbox'])
-        layout_bbox_list.append(page_layout_list)
-        for dropped_bbox in page['discarded_blocks']:
-            page_dropped_list.append(dropped_bbox['bbox'])
-        dropped_bbox_list.append(page_dropped_list)
-        for block in page['para_blocks']:
-            bbox = block['bbox']
-            if block['type'] == BlockType.Table:
-                tables.append(bbox)
-                for nested_block in block['blocks']:
-                    bbox = nested_block['bbox']
-                    if nested_block['type'] == BlockType.TableBody:
-                        tables_body.append(bbox)
-                    elif nested_block['type'] == BlockType.TableCaption:
-                        tables_caption.append(bbox)
-                    elif nested_block['type'] == BlockType.TableFootnote:
-                        tables_footnote.append(bbox)
-            elif block['type'] == BlockType.Image:
-                imgs.append(bbox)
-                for nested_block in block['blocks']:
-                    bbox = nested_block['bbox']
-                    if nested_block['type'] == BlockType.ImageBody:
-                        imgs_body.append(bbox)
-                    elif nested_block['type'] == BlockType.ImageCaption:
-                        imgs_caption.append(bbox)
-                    elif nested_block['type'] == BlockType.ImageFootnote:
-                        imgs_footnote.append(bbox)
-            elif block['type'] == BlockType.Title:
-                titles.append(bbox)
-            elif block['type'] == BlockType.Text:
-                texts.append(bbox)
-            elif block['type'] == BlockType.InterlineEquation:
-                interequations.append(bbox)
-        tables_list.append(tables)
-        tables_body_list.append(tables_body)
-        tables_caption_list.append(tables_caption)
-        tables_footnote_list.append(tables_footnote)
-        imgs_list.append(imgs)
-        imgs_body_list.append(imgs_body)
-        imgs_caption_list.append(imgs_caption)
-        imgs_footnote_list.append(imgs_footnote)
-        titles_list.append(titles)
-        texts_list.append(texts)
-        interequations_list.append(interequations)
-
-    pdf_docs = fitz.open('pdf', pdf_bytes)
-    for i, page in enumerate(pdf_docs):
-        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
-        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
-                                 True)
-        draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
-                                 True)  # color !
-        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
-                                 True)
-        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
-                                 True)
-        draw_bbox_without_number(i, tables_footnote_list, page,
-                                 [229, 255, 204], True)
-        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
-        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
-        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
-                                 True)
-        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
-                              True),
-        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
-        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
-        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
-                                 True)
-
-    # Save the PDF
-    pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
-
-
-def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
-    text_list = []
-    inline_equation_list = []
-    interline_equation_list = []
-    image_list = []
-    table_list = []
-    dropped_list = []
-    next_page_text_list = []
-    next_page_inline_equation_list = []
-
-    def get_span_info(span):
-        if span['type'] == ContentType.Text:
-            if span.get(CROSS_PAGE, False):
-                next_page_text_list.append(span['bbox'])
-            else:
-                page_text_list.append(span['bbox'])
-        elif span['type'] == ContentType.InlineEquation:
-            if span.get(CROSS_PAGE, False):
-                next_page_inline_equation_list.append(span['bbox'])
-            else:
-                page_inline_equation_list.append(span['bbox'])
-        elif span['type'] == ContentType.InterlineEquation:
-            page_interline_equation_list.append(span['bbox'])
-        elif span['type'] == ContentType.Image:
-            page_image_list.append(span['bbox'])
-        elif span['type'] == ContentType.Table:
-            page_table_list.append(span['bbox'])
-
-    for page in pdf_info:
-        page_text_list = []
-        page_inline_equation_list = []
-        page_interline_equation_list = []
-        page_image_list = []
-        page_table_list = []
-        page_dropped_list = []
-
-        # 将跨页的span放到移动到下一页的列表中
-        if len(next_page_text_list) > 0:
-            page_text_list.extend(next_page_text_list)
-            next_page_text_list.clear()
-        if len(next_page_inline_equation_list) > 0:
-            page_inline_equation_list.extend(next_page_inline_equation_list)
-            next_page_inline_equation_list.clear()
-
-        # 构造dropped_list
-        for block in page['discarded_blocks']:
-            if block['type'] == BlockType.Discarded:
-                for line in block['lines']:
-                    for span in line['spans']:
-                        page_dropped_list.append(span['bbox'])
-        dropped_list.append(page_dropped_list)
-        # 构造其余useful_list
-        for block in page['para_blocks']:
-            if block['type'] in [
-                    BlockType.Text,
-                    BlockType.Title,
-                    BlockType.InterlineEquation,
-            ]:
-                for line in block['lines']:
-                    for span in line['spans']:
-                        get_span_info(span)
-            elif block['type'] in [BlockType.Image, BlockType.Table]:
-                for sub_block in block['blocks']:
-                    for line in sub_block['lines']:
-                        for span in line['spans']:
-                            get_span_info(span)
-        text_list.append(page_text_list)
-        inline_equation_list.append(page_inline_equation_list)
-        interline_equation_list.append(page_interline_equation_list)
-        image_list.append(page_image_list)
-        table_list.append(page_table_list)
-    pdf_docs = fitz.open('pdf', pdf_bytes)
-    for i, page in enumerate(pdf_docs):
-        # 获取当前页面的数据
-        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
-        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
-                                 False)
-        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
-                                 False)
-        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
-        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
-        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
-
-    # Save the PDF
-    pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
-
-
-def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
-    dropped_bbox_list = []
-    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
-    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
-    titles_list = []
-    texts_list = []
-    interequations_list = []
-    pdf_docs = fitz.open('pdf', pdf_bytes)
-    magic_model = MagicModel(model_list, pdf_docs)
-    for i in range(len(model_list)):
-        page_dropped_list = []
-        tables_body, tables_caption, tables_footnote = [], [], []
-        imgs_body, imgs_caption, imgs_footnote = [], [], []
-        titles = []
-        texts = []
-        interequations = []
-        page_info = magic_model.get_model_list(i)
-        layout_dets = page_info['layout_dets']
-        for layout_det in layout_dets:
-            bbox = layout_det['bbox']
-            if layout_det['category_id'] == CategoryId.Text:
-                texts.append(bbox)
-            elif layout_det['category_id'] == CategoryId.Title:
-                titles.append(bbox)
-            elif layout_det['category_id'] == CategoryId.TableBody:
-                tables_body.append(bbox)
-            elif layout_det['category_id'] == CategoryId.TableCaption:
-                tables_caption.append(bbox)
-            elif layout_det['category_id'] == CategoryId.TableFootnote:
-                tables_footnote.append(bbox)
-            elif layout_det['category_id'] == CategoryId.ImageBody:
-                imgs_body.append(bbox)
-            elif layout_det['category_id'] == CategoryId.ImageCaption:
-                imgs_caption.append(bbox)
-            elif layout_det[
-                    'category_id'] == CategoryId.InterlineEquation_YOLO:
-                interequations.append(bbox)
-            elif layout_det['category_id'] == CategoryId.Abandon:
-                page_dropped_list.append(bbox)
-            elif layout_det['category_id'] == CategoryId.ImageFootnote:
-                imgs_footnote.append(bbox)
-
-        tables_body_list.append(tables_body)
-        tables_caption_list.append(tables_caption)
-        tables_footnote_list.append(tables_footnote)
-        imgs_body_list.append(imgs_body)
-        imgs_caption_list.append(imgs_caption)
-        titles_list.append(titles)
-        texts_list.append(texts)
-        interequations_list.append(interequations)
-        dropped_bbox_list.append(page_dropped_list)
-        imgs_footnote_list.append(imgs_footnote)
-
-    for i, page in enumerate(pdf_docs):
-        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
-                              True)  # color !
-        draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
-        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
-                              True)
-        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
-                              True)
-        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
-        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
-                              True)
-        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
-                              True)
-        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
-        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
-        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
-
-    # Save the PDF
-    pdf_docs.save(f'{out_path}/{filename}_model.pdf')
--- a/magic_pdf/libs/drop_reason.py
+++ b/magic_pdf/libs/drop_reason.py
-
-class DropReason:
-    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖，导致无法准确定位文字顺序
-    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
-    COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局，暂时不支持
-    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
-    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF，色块会改变阅读顺序，目前不支持带底色文字块的PDF。
-    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # 含特殊图片，计算量太大，从而丢弃
-    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图，计算量太大，从而丢弃
-    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷，当前方法下计算量消耗过大
-    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
-    Exception = "_exception" # 解析中发生异常
-    ENCRYPTED = "encrypted" # PDF是加密的
-    EMPTY_PDF = "total_page=0" # PDF页面总数为0
-    NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF，无法直接解析
-    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # 无法清晰的分段
-    TITLE_DETECTION_FAILED = "title_detection_failed" # 探测标题失败
-    TITLE_LEVEL_FAILED = "title_level_failed" # 分析标题级别失败（例如一级、二级、三级标题）
-    PARA_SPLIT_FAILED = "para_split_failed" # 识别段落失败
-    PARA_MERGE_FAILED = "para_merge_failed" # 段落合并失败
-    NOT_ALLOW_LANGUAGE = "not_allow_language" # 不支持的语种
-    SPECIAL_PDF = "special_pdf"
-    PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
-    CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
-    NEGATIVE_BBOX_AREA = "negative_bbox_area" # 缩放导致 bbox 面积为负
-    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # 无法分离重叠的block
-    
\ No newline at end of file
--- a/magic_pdf/libs/drop_tag.py
+++ b/magic_pdf/libs/drop_tag.py
-
-COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
-PAGE_NO = "page-no" # 页码
-CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
-VERTICAL_TEXT = 'vertical-text' # 垂直文本
-ROTATE_TEXT = 'rotate-text' # 旋转文本
-EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
-ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
-ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
-
-
-class DropTag:
-    PAGE_NUMBER = "page_no"
-    HEADER = "header"
-    FOOTER = "footer"
-    FOOTNOTE = "footnote"
-    NOT_IN_LAYOUT = "not_in_layout"
-    SPAN_OVERLAP = "span_overlap"
-    BLOCK_OVERLAP = "block_overlap"
--- a/magic_pdf/libs/hash_utils.py
+++ b/magic_pdf/libs/hash_utils.py
-import hashlib
-
-
-def compute_md5(file_bytes):
-    hasher = hashlib.md5()
-    hasher.update(file_bytes)
-    return hasher.hexdigest().upper()
-
-
-def compute_sha256(input_string):
-    hasher = hashlib.sha256()
-    # 在Python3中，需要将字符串转化为字节对象才能被哈希函数处理
-    input_bytes = input_string.encode('utf-8')
-    hasher.update(input_bytes)
-    return hasher.hexdigest()
--- a/magic_pdf/libs/json_compressor.py
+++ b/magic_pdf/libs/json_compressor.py
-import json
-import brotli
-import base64
-
-class JsonCompressor:
-
-    @staticmethod
-    def compress_json(data):
-        """
-        Compress a json object and encode it with base64
-        """
-        json_str = json.dumps(data)
-        json_bytes = json_str.encode('utf-8')
-        compressed = brotli.compress(json_bytes, quality=6)
-        compressed_str = base64.b64encode(compressed).decode('utf-8')  # convert bytes to string
-        return compressed_str
-
-    @staticmethod
-    def decompress_json(compressed_str):
-        """
-        Decode the base64 string and decompress the json object
-        """
-        compressed = base64.b64decode(compressed_str.encode('utf-8'))  # convert string to bytes
-        decompressed_bytes = brotli.decompress(compressed)
-        json_str = decompressed_bytes.decode('utf-8')
-        data = json.loads(json_str)
-        return data
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
-import os
-import unicodedata
-
-if not os.getenv("FTLANG_CACHE"):
-    current_file_path = os.path.abspath(__file__)
-    current_dir = os.path.dirname(current_file_path)
-    root_dir = os.path.dirname(current_dir)
-    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
-    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
-    # print(os.getenv("FTLANG_CACHE"))
-
-from fast_langdetect import detect_language
-
-
-def detect_lang(text: str) -> str:
-
-    if len(text) == 0:
-        return ""
-    try:
-        lang_upper = detect_language(text)
-    except:
-        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
-        lang_upper = detect_language(html_no_ctrl_chars)
-    try:
-        lang = lang_upper.lower()
-    except:
-        lang = ""
-    return lang
-
-
-if __name__ == '__main__':
-    print(os.getenv("FTLANG_CACHE"))
-    print(detect_lang("This is a test."))
-    print(detect_lang("<html>This is a test</html>"))
-    print(detect_lang("这个是中文测试。"))
-    print(detect_lang("<html>这个是中文测试。</html>"))
--- a/magic_pdf/libs/local_math.py
+++ b/magic_pdf/libs/local_math.py
-def float_gt(a, b):
-    if 0.0001 >= abs(a -b):
-        return False
-    return a > b
-    
-def float_equal(a, b):
-    if 0.0001 >= abs(a-b):
-        return True
-    return False
\ No newline at end of file
--- a/magic_pdf/libs/markdown_utils.py
+++ b/magic_pdf/libs/markdown_utils.py
-import re
-
-
-def escape_special_markdown_char(pymu_blocks):
-    """
-    转义正文里对markdown语法有特殊意义的字符
-    """
-    special_chars = ["*", "`", "~", "$"]
-    for blk in pymu_blocks:
-        for line in blk['lines']:
-            for span in line['spans']:
-                for char in special_chars:
-                    span_text = span['text']
-                    span_type = span.get("_type", None)
-                    if span_type in ['inline-equation', 'interline-equation']:
-                        continue
-                    elif span_text:
-                        span['text'] = span['text'].replace(char, "\\" + char)
-
-    return pymu_blocks
-
-
-def ocr_escape_special_markdown_char(content):
-    """
-    转义正文里对markdown语法有特殊意义的字符
-    """
-    special_chars = ["*", "`", "~", "$"]
-    for char in special_chars:
-        content = content.replace(char, "\\" + char)
-
-    return content
--- a/magic_pdf/libs/nlp_utils.py
+++ b/magic_pdf/libs/nlp_utils.py
-import re
-from os import path
-
-from collections import Counter
-
-from loguru import logger
-
-# from langdetect import detect
-import spacy
-import en_core_web_sm
-import zh_core_web_sm
-
-from magic_pdf.libs.language import detect_lang
-
-
-class NLPModels:
-    """
-    How to upload local models to s3:
-        - config aws cli:
-            doc\SETUP-CLI.md
-            doc\setup_cli.sh
-            app\config\__init__.py
-        - $ cd {local_dir_storing_models}
-        - $ ls models
-            en_core_web_sm-3.7.1/
-            zh_core_web_sm-3.7.0/
-        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
-        - $ aws s3 --profile=p_project_norm ls  s3://llm-infra/models/
-            PRE en_core_web_sm-3.7.1/
-            PRE zh_core_web_sm-3.7.0/
-    """
-
-    def __init__(self):
-        # if OS is windows, set "TMP_DIR" to "D:/tmp"
-
-        home_dir = path.expanduser("~")
-        self.default_local_path = path.join(home_dir, ".nlp_models")
-        self.default_shared_path = "/share/pdf_processor/nlp_models"
-        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
-        self.default_s3_path = "s3://llm-infra/models"
-        self.nlp_models = self.nlp_models = {
-            "en_core_web_sm": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "en_core_web_md": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "en_core_web_lg": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "zh_core_web_sm": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-            "zh_core_web_md": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-            "zh_core_web_lg": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-        }
-        self.en_core_web_sm_model = en_core_web_sm.load()
-        self.zh_core_web_sm_model = zh_core_web_sm.load()
-
-    def load_model(self, model_name, model_type, model_version):
-        if (
-            model_name in self.nlp_models
-            and self.nlp_models[model_name]["type"] == model_type
-            and self.nlp_models[model_name]["version"] == model_version
-        ):
-            return spacy.load(model_name) if spacy.util.is_package(model_name) else None
-
-        else:
-            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
-            return None
-
-    def detect_language(self, text, use_langdetect=False):
-        if len(text) == 0:
-            return None
-        if use_langdetect:
-            # print("use_langdetect")
-            # print(detect_lang(text))
-            # return detect_lang(text)
-            if detect_lang(text) == "zh":
-                return "zh"
-            else:
-                return "en"
-
-        if not use_langdetect:
-            en_count = len(re.findall(r"[a-zA-Z]", text))
-            cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
-
-            if en_count > cn_count:
-                return "en"
-
-            if cn_count > en_count:
-                return "zh"
-
-    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
-        """
-        Detect entity categories using NLP models and return the most frequent entity types.
-
-        Parameters
-        ----------
-        text : str
-            Text to be processed.
-
-        Returns
-        -------
-        str
-            The most frequent entity type.
-        """
-        lang = self.detect_language(text, use_langdetect=True)
-
-        if lang == "en":
-            nlp_model = self.en_core_web_sm_model
-        elif lang == "zh":
-            nlp_model = self.zh_core_web_sm_model
-        else:
-            # logger.error(f"Unsupported language: {lang}")
-            return {}
-
-        # Splitting text into smaller parts
-        text_parts = re.split(r"[,;，；、\s & |]+", text)
-
-        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
-        text_combined = " ".join(text_parts)
-
-        try:
-            doc = nlp_model(text_combined)
-            entity_counts = Counter([ent.label_ for ent in doc.ents])
-            word_counts_in_entities = Counter()
-
-            for ent in doc.ents:
-                word_counts_in_entities[ent.label_] += len(ent.text.split())
-
-            total_words_in_entities = sum(word_counts_in_entities.values())
-            total_words = len([token for token in doc if not token.is_punct])
-
-            if total_words_in_entities == 0 or total_words == 0:
-                return None
-
-            entity_percentage = total_words_in_entities / total_words
-            if entity_percentage < 0.5:
-                return None
-
-            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
-            entity_percentage = word_count / total_words_in_entities
-
-            if entity_percentage >= threshold:
-                return most_common_entity
-            else:
-                return None
-        except Exception as e:
-            logger.error(f"Error in entity detection: {e}")
-            return None
-
-
-def __main__():
-    nlpModel = NLPModels()
-
-    test_strings = [
-        "张三",
-        "张三, 李四，王五; 赵六",
-        "John Doe",
-        "Jane Smith",
-        "Lee, John",
-        "John Doe, Jane Smith; Alice Johnson，Bob Lee",
-        "孙七, Michael Jordan；赵八",
-        "David Smith  Michael O'Connor; Kevin ßáçøñ",
-        "李雷·韩梅梅, 张三·李四",
-        "Charles Robert Darwin, Isaac Newton",
-        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
-        "John Doe, Jane Smith; Alice Johnson",
-        "张三, 李四，王五; 赵六",
-        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
-        "Rachel Mills  &  William Barry  &  Susanne B. Haga",
-        "Claire Chabut* and Jean-François Bussières",
-        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
-        "Changchun",
-        "china",
-        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
-        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
-        "Synergistic Effect of Supported Nickel Catalyst with",
-        "Intumescent Flame-Retardants on Flame Retardancy",
-        "and Thermal Stability of Polypropylene",
-    ]
-
-    for test in test_strings:
-        print()
-        print(f"Original String: {test}")
-
-        result = nlpModel.detect_entity_catgr_using_nlp(test)
-        print(f"Detected entities: {result}")
-
-
-if __name__ == "__main__":
-    __main__()
--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
-class ContentType:
-    Image = 'image'
-    Table = 'table'
-    Text = 'text'
-    InlineEquation = 'inline_equation'
-    InterlineEquation = 'interline_equation'
-
-
-class BlockType:
-    Image = 'image'
-    ImageBody = 'image_body'
-    ImageCaption = 'image_caption'
-    ImageFootnote = 'image_footnote'
-    Table = 'table'
-    TableBody = 'table_body'
-    TableCaption = 'table_caption'
-    TableFootnote = 'table_footnote'
-    Text = 'text'
-    Title = 'title'
-    InterlineEquation = 'interline_equation'
-    Footnote = 'footnote'
-    Discarded = 'discarded'
-
-
-class CategoryId:
-    Title = 0
-    Text = 1
-    Abandon = 2
-    ImageBody = 3
-    ImageCaption = 4
-    TableBody = 5
-    TableCaption = 6
-    TableFootnote = 7
-    InterlineEquation_Layout = 8
-    InlineEquation = 13
-    InterlineEquation_YOLO = 14
-    OcrText = 15
-    ImageFootnote = 101
--- a/magic_pdf/libs/path_utils.py
+++ b/magic_pdf/libs/path_utils.py
-
-
-def remove_non_official_s3_args(s3path):
-    """
-    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
-    """
-    arr = s3path.split("?")
-    return arr[0]
-
-def parse_s3path(s3path: str):
-    # from s3pathlib import S3Path
-    # p = S3Path(remove_non_official_s3_args(s3path))
-    # return p.bucket, p.key
-    s3path = remove_non_official_s3_args(s3path).strip()
-    if s3path.startswith(('s3://', 's3a://')):
-        prefix, path = s3path.split('://', 1)
-        bucket_name, key = path.split('/', 1)
-        return bucket_name, key
-    elif s3path.startswith('/'):
-        raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
-    else:
-        raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
-
-
-def parse_s3_range_params(s3path: str):
-    """
-    example: s3://abc/xxxx.json?bytes=0,81350 ==> [0, 81350]
-    """
-    arr = s3path.split("?bytes=")
-    if len(arr) == 1:
-        return None
-    return arr[1].split(",")
--- a/magic_pdf/libs/pdf_check.py
+++ b/magic_pdf/libs/pdf_check.py
-from io import BytesIO
-import re
-import fitz
-import numpy as np
-from loguru import logger
-from pdfminer.high_level import extract_text
-
-
-def calculate_sample_count(total_page: int):
-    """
-    根据总页数和采样率计算采样页面的数量。
-    """
-    select_page_cnt = min(10, total_page)
-    return select_page_cnt
-
-
-def extract_pages(src_pdf_bytes: bytes):
-    pdf_docs = fitz.open("pdf", src_pdf_bytes)
-    total_page = len(pdf_docs)
-    if total_page == 0:
-        # 如果PDF没有页面，直接返回空文档
-        logger.warning("PDF is empty, return empty document")
-        return fitz.Document()
-    select_page_cnt = calculate_sample_count(total_page)
-
-    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
-    sample_docs = fitz.Document()
-    try:
-        for index in page_num:
-            sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
-    except Exception as e:
-        logger.exception(e)
-    return sample_docs
-
-
-def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
-    """"
-    检测PDF中是否包含非法字符
-    """
-    '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
-    sample_docs = extract_pages(src_pdf_bytes)
-    sample_pdf_bytes = sample_docs.tobytes()
-    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-    text = extract_text(sample_pdf_file_like_object)
-    text = text.replace("\n", "")
-    # logger.info(text)
-    '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
-    cid_pattern = re.compile(r'\(cid:\d+\)')
-    matches = cid_pattern.findall(text)
-    cid_count = len(matches)
-    cid_len = sum(len(match) for match in matches)
-    text_len = len(text)
-    if text_len == 0:
-        cid_chars_radio = 0
-    else:
-        cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
-    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
-    '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
-    if cid_chars_radio > 0.05:
-        return False  # 乱码文档
-    else:
-        return True   # 正常文档
--- a/magic_pdf/libs/pdf_image_tools.py
+++ b/magic_pdf/libs/pdf_image_tools.py
-
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.commons import fitz
-from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.hash_utils import compute_sha256
-
-
-def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
-    """
-    从第page_num页的page中，根据bbox进行裁剪出一张jpg图片，返回图片路径
-    save_path：需要同时支持s3和本地, 图片存放在save_path下，文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
-    """
-    # 拼接文件名
-    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
-
-    # 老版本返回不带bucket的路径
-    img_path = join_path(return_path, filename) if return_path is not None else None
-
-    # 新版本生成平铺路径
-    img_hash256_path = f"{compute_sha256(img_path)}.jpg"
-
-    # 将坐标转换为fitz.Rect对象
-    rect = fitz.Rect(*bbox)
-    # 配置缩放倍数为3倍
-    zoom = fitz.Matrix(3, 3)
-    # 截取图片
-    pix = page.get_pixmap(clip=rect, matrix=zoom)
-
-    byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
-
-    imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
-
-    return img_hash256_path
--- a/magic_pdf/libs/safe_filename.py
+++ b/magic_pdf/libs/safe_filename.py
-import os
-
-
-def sanitize_filename(filename, replacement="_"):
-    if os.name == 'nt':
-        invalid_chars = '<>:"|?*'
-
-        for char in invalid_chars:
-            filename = filename.replace(char, replacement)
-
-    return filename
--- a/magic_pdf/libs/textbase.py
+++ b/magic_pdf/libs/textbase.py
-import math
-
-
-def __inc_dict_val(mp, key, val_inc:int):
-    if mp.get(key):
-        mp[key] = mp[key] + val_inc
-    else:
-        mp[key] = val_inc
-        
-    
-
-def get_text_block_base_info(block):
-    """
-    获取这个文本块里的字体的颜色、字号、字体
-    按照正文字数最多的返回
-    """
-    
-    counter = {}
-    
-    for line in block['lines']:
-        for span in line['spans']:
-            color = span['color']
-            size = round(span['size'], 2)
-            font = span['font']
-            
-            txt_len = len(span['text'])
-            __inc_dict_val(counter, (color, size, font), txt_len)
-            
-    
-    c, s, ft = max(counter, key=counter.get)
-    
-    return c, s, ft
-    
\ No newline at end of file
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
-__version__ = "0.8.1"
--- a/magic_pdf/libs/vis_utils.py
+++ b/magic_pdf/libs/vis_utils.py
-from magic_pdf.libs.commons import fitz
-import os
-
-
-def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
-    """
-    在page上画出bbox，保存到save_path
-    """
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(save_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(save_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-
-    color_map = {
-        'image': fitz.pdfcolor["yellow"],
-        'text': fitz.pdfcolor['blue'],
-        "table": fitz.pdfcolor['green']
-    }
-    
-    for k, v in paras_dict.items():
-        page_idx = v['page_idx']
-        width = raw_pdf_doc[page_idx].rect.width
-        height = raw_pdf_doc[page_idx].rect.height
-        new_page = doc.new_page(width=width, height=height)
-
-        shape = new_page.new_shape()
-        for order, block in enumerate(v['preproc_blocks']):
-            rect = fitz.Rect(block['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=color_map['text'], fill_opacity=0.2)
-            shape.finish()
-            shape.commit()
-            
-        for img in v['images']:
-            # 原始box画上去
-            rect = fitz.Rect(img['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['yellow'])
-            shape.finish()
-            shape.commit()
-
-        for img in v['image_backup']:
-            # 原始box画上去
-            rect = fitz.Rect(img['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['yellow'],  fill=None)
-            shape.finish()
-            shape.commit()
-            
-        for tb in v['droped_text_block']:
-            # 原始box画上去
-            rect = fitz.Rect(tb['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)
-            shape.finish()
-            shape.commit()
-            
-        # TODO table
-        for tb in v['tables']:
-            rect = fitz.Rect(tb['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)
-            shape.finish()
-            shape.commit()
-
-
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-
-    if is_new_pdf:
-        doc.save(save_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-    
-
-def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list,  expect_drop_bboxes:list, save_path: str, expected_page_id:int):
-    """
-    以覆盖的方式写个临时的pdf，用于debug
-    """
-    if page_idx!=expected_page_id:
-        return
-        
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-
-    width = raw_pdf_doc[page_idx].rect.width
-    height = raw_pdf_doc[page_idx].rect.height
-    new_page = doc.new_page(width=width, height=height)
-
-    shape = new_page.new_shape()
-    for bbox in bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-        
-    for bbox in droped_bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-        
-    for bbox in expect_drop_bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-
-    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12,
-    #                      color=(0, 0, 0))
-    # shape.finish(color=fitz.pdfcolor['black'])
-    # shape.commit()
-
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-
-    doc.save(save_path)
-    doc.close()
-    
-
-def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
-    save_path = "./tmp/debug.pdf"
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-
-    width = page.rect.width
-    height = page.rect.height
-    new_page = doc.new_page(width=width, height=height)
-    
-    shape = new_page.new_shape()
-    for bbox in bboxes1:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-        
-    for bbox in bboxes2:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-        
-    for bbox in bboxes3:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-        
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-
-    doc.save(save_path)
-    doc.close() 
-    
-    
-    
-    
-def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
-    """
-    在page上画出bbox，保存到save_path
-    """
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(pdf_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(pdf_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-
-    for k, v in paras_dict.items():
-        page_idx = v['page_idx']
-        layouts = v['layout_bboxes']
-        page = doc[page_idx]
-        shape = page.new_shape()
-        for order, layout in enumerate(layouts):
-            border_offset = 1
-            rect_box = layout['layout_bbox']
-            layout_label = layout['layout_label']
-            fill_color = fitz.pdfcolor['pink'] if layout_label=='U' else None
-            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
-            rect = fitz.Rect(*rect_box)
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
-            """
-            draw order text on layout box
-            """
-            font_size = 10
-            shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))
-        
-        """画上footer header"""
-        if header:
-            shape.draw_rect(fitz.Rect(header))
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
-        if footer:
-            shape.draw_rect(fitz.Rect(footer))
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
-        
-        shape.commit()
-    
-    if is_new_pdf:
-        doc.save(pdf_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-        
-
-@DeprecationWarning
-def draw_layout_on_page(raw_pdf_doc: fitz.Document,  page_idx: int, page_layout: list, pdf_path: str):
-    """
-    把layout的box用红色边框花在pdf_path的page_idx上
-    """
-    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
-        border_offset = 1
-        rect_box = layout['layout_bbox']
-        layout_label = layout['layout_label']
-        sub_layout = layout['sub_layout']
-        if len(sub_layout)==0:
-            fill_color = fill_color if layout_label=='U' else None
-            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
-            rect = fitz.Rect(*rect_box)
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)
-            # if layout_label=='U':
-            #     bad_boxes = layout.get("bad_boxes", [])
-            #     for bad_box in bad_boxes:
-            #         rect = fitz.Rect(*bad_box)
-            #         shape.draw_rect(rect)
-            #         shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['red'], fill_opacity=0.2)
-        # else:
-        #     rect = fitz.Rect(*rect_box)
-        #     shape.draw_rect(rect)
-        #     shape.finish(color=fitz.pdfcolor['blue'])
-        
-        for sub_layout in sub_layout:
-            draw(shape, sub_layout)
-        shape.commit()
-        
-    
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(pdf_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(pdf_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-
-    page = doc[page_idx]
-    shape = page.new_shape()
-    for order, layout in enumerate(page_layout):
-        draw(shape, layout, fitz.pdfcolor['yellow'])
-
-    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(layout)}", fontname="helv", fontsize=12,
-    #                      color=(0, 0, 0))
-    # shape.finish(color=fitz.pdfcolor['black'])
-    # shape.commit()
-
-    parent_dir = os.path.dirname(pdf_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-
-    if is_new_pdf:
-        doc.save(pdf_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-    
--- a/magic_pdf/model/__init__.py
+++ b/magic_pdf/model/__init__.py
-__use_inside_model__ = True
-__model_mode__ = "full"
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
-import time
-
-import fitz
-import numpy as np
-from loguru import logger
-
-from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
-from magic_pdf.model.model_list import MODEL
-import magic_pdf.model as model_config
-
-
-def dict_compare(d1, d2):
-    return d1.items() == d2.items()
-
-
-def remove_duplicates_dicts(lst):
-    unique_dicts = []
-    for dict_item in lst:
-        if not any(
-                dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
-        ):
-            unique_dicts.append(dict_item)
-    return unique_dicts
-
-
-def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
-    try:
-        from PIL import Image
-    except ImportError:
-        logger.error("Pillow not installed, please install by pip.")
-        exit(1)
-
-    images = []
-    with fitz.open("pdf", pdf_bytes) as doc:
-        for index in range(0, doc.page_count):
-            page = doc[index]
-            mat = fitz.Matrix(dpi / 72, dpi / 72)
-            pm = page.get_pixmap(matrix=mat, alpha=False)
-
-            # If the width or height exceeds 9000 after scaling, do not scale further.
-            if pm.width > 9000 or pm.height > 9000:
-                pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
-
-            img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
-            img = np.array(img)
-            img_dict = {"img": img, "width": pm.width, "height": pm.height}
-            images.append(img_dict)
-    return images
-
-
-class ModelSingleton:
-    _instance = None
-    _models = {}
-
-    def __new__(cls, *args, **kwargs):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-
-    def get_model(self, ocr: bool, show_log: bool):
-        key = (ocr, show_log)
-        if key not in self._models:
-            self._models[key] = custom_model_init(ocr=ocr, show_log=show_log)
-        return self._models[key]
-
-
-def custom_model_init(ocr: bool = False, show_log: bool = False):
-    model = None
-
-    if model_config.__model_mode__ == "lite":
-        logger.warning("The Lite mode is provided for developers to conduct testing only, and the output quality is "
-                       "not guaranteed to be reliable.")
-        model = MODEL.Paddle
-    elif model_config.__model_mode__ == "full":
-        # 使用 pdf_extract_kit
-        model = MODEL.PEK
-
-    if model_config.__use_inside_model__:
-        model_init_start = time.time()
-        if model == MODEL.Paddle:
-            from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
-            custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
-        elif model == MODEL.PEK:
-            from magic_pdf.model.pdf_extract_kit import CustomPEKModel
-            # 从配置文件读取model-dir和device
-            local_models_dir = get_local_models_dir()
-            device = get_device()
-            table_config = get_table_recog_config()
-            model_input = {"ocr": ocr,
-                           "show_log": show_log,
-                           "models_dir": local_models_dir,
-                           "device": device,
-                           "table_config": table_config}
-            custom_model = CustomPEKModel(**model_input)
-        else:
-            logger.error("Not allow model_name!")
-            exit(1)
-        model_init_cost = round(time.time() - model_init_start,2)
-        logger.info(f"model init cost: {model_init_cost}")
-    else:
-        logger.error("use_inside_model is False, not allow to use inside model")
-        exit(1)
-
-    return custom_model
-
-
-def doc_analyze(model,pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
-                start_page_id=0, end_page_id=None):
-
-    # model_manager = ModelSingleton()
-    # custom_model = model_manager.get_model(ocr, show_log)
-    custom_model = model
-
-    images = load_images_from_pdf(pdf_bytes)
-
-    # end_page_id = end_page_id if end_page_id else len(images) - 1
-    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1
-
-    if end_page_id > len(images) - 1:
-        logger.warning("end_page_id is out of range, use images length")
-        end_page_id = len(images) - 1
-
-    model_json = []
-    doc_analyze_start = time.time()
-
-    for index, img_dict in enumerate(images):
-        img = img_dict["img"]
-        page_width = img_dict["width"]
-        page_height = img_dict["height"]
-        if start_page_id <= index <= end_page_id:
-            result = custom_model(img,index,end_page_id)
-        else:
-            result = []
-        page_info = {"page_no": index, "height": page_height, "width": page_width}
-        page_dict = {"layout_dets": result, "page_info": page_info}
-        model_json.append(page_dict)
-    doc_analyze_cost = round(time.time() - doc_analyze_start,2)
-    logger.info(f"文件分析提取截图共耗时: {doc_analyze_cost}")
-#    logger.info(f'model_json:\n{model_json}')
-    return model_json
-
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
-import json
-
-from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
-                                    bbox_relative_pos, box_area, calculate_iou,
-                                    calculate_overlap_area_in_bbox1_area_ratio,
-                                    get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
-from magic_pdf.libs.coordinate_transform import get_scale_ratio
-from magic_pdf.libs.local_math import float_gt
-from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
-from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-
-CAPATION_OVERLAP_AREA_RATIO = 0.6
-MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
-
-
-class MagicModel:
-    """每个函数没有得到元素的时候返回空list."""
-
-    def __fix_axis(self):
-        for model_page_info in self.__model_list:
-            need_remove_list = []
-            page_no = model_page_info['page_info']['page_no']
-            horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
-                model_page_info, self.__docs[page_no]
-            )
-            layout_dets = model_page_info['layout_dets']
-            for layout_det in layout_dets:
-
-                if layout_det.get('bbox') is not None:
-                    # 兼容直接输出bbox的模型数据,如paddle
-                    x0, y0, x1, y1 = layout_det['bbox']
-                else:
-                    # 兼容直接输出poly的模型数据，如xxx
-                    x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
-
-                bbox = [
-                    int(x0 / horizontal_scale_ratio),
-                    int(y0 / vertical_scale_ratio),
-                    int(x1 / horizontal_scale_ratio),
-                    int(y1 / vertical_scale_ratio),
-                ]
-                layout_det['bbox'] = bbox
-                # 删除高度或者宽度小于等于0的spans
-                if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
-                    need_remove_list.append(layout_det)
-            for need_remove in need_remove_list:
-                layout_dets.remove(need_remove)
-
-    def __fix_by_remove_low_confidence(self):
-        for model_page_info in self.__model_list:
-            need_remove_list = []
-            layout_dets = model_page_info['layout_dets']
-            for layout_det in layout_dets:
-                if layout_det['score'] <= 0.05:
-                    need_remove_list.append(layout_det)
-                else:
-                    continue
-            for need_remove in need_remove_list:
-                layout_dets.remove(need_remove)
-
-    def __fix_by_remove_high_iou_and_low_confidence(self):
-        for model_page_info in self.__model_list:
-            need_remove_list = []
-            layout_dets = model_page_info['layout_dets']
-            for layout_det1 in layout_dets:
-                for layout_det2 in layout_dets:
-                    if layout_det1 == layout_det2:
-                        continue
-                    if layout_det1['category_id'] in [
-                        0,
-                        1,
-                        2,
-                        3,
-                        4,
-                        5,
-                        6,
-                        7,
-                        8,
-                        9,
-                    ] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
-                        if (
-                            calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
-                            > 0.9
-                        ):
-                            if layout_det1['score'] < layout_det2['score']:
-                                layout_det_need_remove = layout_det1
-                            else:
-                                layout_det_need_remove = layout_det2
-
-                            if layout_det_need_remove not in need_remove_list:
-                                need_remove_list.append(layout_det_need_remove)
-                        else:
-                            continue
-                    else:
-                        continue
-            for need_remove in need_remove_list:
-                layout_dets.remove(need_remove)
-
-    def __init__(self, model_list: list, docs: fitz.Document):
-        self.__model_list = model_list
-        self.__docs = docs
-        """为所有模型数据添加bbox信息(缩放，poly->bbox)"""
-        self.__fix_axis()
-        """删除置信度特别低的模型数据(<0.05),提高质量"""
-        self.__fix_by_remove_low_confidence()
-        """删除高iou(>0.9)数据中置信度较低的那个"""
-        self.__fix_by_remove_high_iou_and_low_confidence()
-        self.__fix_footnote()
-
-    def __fix_footnote(self):
-        # 3: figure, 5: table, 7: footnote
-        for model_page_info in self.__model_list:
-            footnotes = []
-            figures = []
-            tables = []
-
-            for obj in model_page_info['layout_dets']:
-                if obj['category_id'] == 7:
-                    footnotes.append(obj)
-                elif obj['category_id'] == 3:
-                    figures.append(obj)
-                elif obj['category_id'] == 5:
-                    tables.append(obj)
-                if len(footnotes) * len(figures) == 0:
-                    continue
-            dis_figure_footnote = {}
-            dis_table_footnote = {}
-
-            for i in range(len(footnotes)):
-                for j in range(len(figures)):
-                    pos_flag_count = sum(
-                        list(
-                            map(
-                                lambda x: 1 if x else 0,
-                                bbox_relative_pos(
-                                    footnotes[i]['bbox'], figures[j]['bbox']
-                                ),
-                            )
-                        )
-                    )
-                    if pos_flag_count > 1:
-                        continue
-                    dis_figure_footnote[i] = min(
-                        bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
-                        dis_figure_footnote.get(i, float('inf')),
-                    )
-            for i in range(len(footnotes)):
-                for j in range(len(tables)):
-                    pos_flag_count = sum(
-                        list(
-                            map(
-                                lambda x: 1 if x else 0,
-                                bbox_relative_pos(
-                                    footnotes[i]['bbox'], tables[j]['bbox']
-                                ),
-                            )
-                        )
-                    )
-                    if pos_flag_count > 1:
-                        continue
-
-                    dis_table_footnote[i] = min(
-                        bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
-                        dis_table_footnote.get(i, float('inf')),
-                    )
-            for i in range(len(footnotes)):
-                if i not in dis_figure_footnote:
-                    continue
-                if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
-                    footnotes[i]['category_id'] = CategoryId.ImageFootnote
-
-    def __reduct_overlap(self, bboxes):
-        N = len(bboxes)
-        keep = [True] * N
-        for i in range(N):
-            for j in range(N):
-                if i == j:
-                    continue
-                if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
-                    keep[i] = False
-        return [bboxes[i] for i in range(N) if keep[i]]
-
-    def __tie_up_category_by_distance(
-        self, page_no, subject_category_id, object_category_id
-    ):
-        """假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object)，每个 object
-        只能属于一个 subject."""
-        ret = []
-        MAX_DIS_OF_POINT = 10**9 + 7
-        """
-        subject 和 object 的 bbox 会合并成一个大的 bbox （named: merged bbox）。
-        筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
-        再求出筛选出的 subjects 和 object 的最短距离
-        """
-        def search_overlap_between_boxes(
-            subject_idx, object_idx
-        ):
-            idxes = [subject_idx, object_idx]
-            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
-            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
-            x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
-            y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
-
-            merged_bbox = [
-                min(x0s),
-                min(y0s),
-                max(x1s),
-                max(y1s),
-            ]
-            ratio = 0
-
-            other_objects = list(
-                map(
-                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
-                    filter(
-                        lambda x: x['category_id']
-                        not in (object_category_id, subject_category_id),
-                        self.__model_list[page_no]['layout_dets'],
-                    ),
-                )
-            )
-            for other_object in other_objects:
-                ratio = max(
-                    ratio,
-                    get_overlap_area(
-                        merged_bbox, other_object['bbox']
-                    ) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
-                )
-                if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
-                    break
-
-            return ratio
-
-        def may_find_other_nearest_bbox(subject_idx, object_idx):
-            ret = float('inf')
-
-            x0 = min(
-                all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
-            )
-            y0 = min(
-                all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
-            )
-            x1 = max(
-                all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
-            )
-            y1 = max(
-                all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
-            )
-
-            object_area = abs(
-                all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
-            ) * abs(
-                all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
-            )
-
-            for i in range(len(all_bboxes)):
-                if (
-                    i == subject_idx
-                    or all_bboxes[i]['category_id'] != subject_category_id
-                ):
-                    continue
-                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
-                    all_bboxes[i]['bbox'], [x0, y0, x1, y1]
-                ):
-
-                    i_area = abs(
-                        all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
-                    ) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
-                    if i_area >= object_area:
-                        ret = min(float('inf'), dis[i][object_idx])
-
-            return ret
-
-        def expand_bbbox(idxes):
-            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
-            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
-            x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
-            y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
-            return min(x0s), min(y0s), max(x1s), max(y1s)
-
-        subjects = self.__reduct_overlap(
-            list(
-                map(
-                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
-                    filter(
-                        lambda x: x['category_id'] == subject_category_id,
-                        self.__model_list[page_no]['layout_dets'],
-                    ),
-                )
-            )
-        )
-
-        objects = self.__reduct_overlap(
-            list(
-                map(
-                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
-                    filter(
-                        lambda x: x['category_id'] == object_category_id,
-                        self.__model_list[page_no]['layout_dets'],
-                    ),
-                )
-            )
-        )
-        subject_object_relation_map = {}
-
-        subjects.sort(
-            key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
-        )  # get the distance !
-
-        all_bboxes = []
-
-        for v in subjects:
-            all_bboxes.append(
-                {
-                    'category_id': subject_category_id,
-                    'bbox': v['bbox'],
-                    'score': v['score'],
-                }
-            )
-
-        for v in objects:
-            all_bboxes.append(
-                {
-                    'category_id': object_category_id,
-                    'bbox': v['bbox'],
-                    'score': v['score'],
-                }
-            )
-
-        N = len(all_bboxes)
-        dis = [[MAX_DIS_OF_POINT] * N for _ in range(N)]
-
-        for i in range(N):
-            for j in range(i):
-                if (
-                    all_bboxes[i]['category_id'] == subject_category_id
-                    and all_bboxes[j]['category_id'] == subject_category_id
-                ):
-                    continue
-
-                subject_idx, object_idx = i, j
-                if all_bboxes[j]['category_id'] == subject_category_id:
-                    subject_idx, object_idx = j, i
-
-                if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
-                    dis[i][j] = float('inf')
-                    dis[j][i] = dis[i][j]
-                    continue
-
-                dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
-                dis[j][i] = dis[i][j]
-
-        used = set()
-        for i in range(N):
-            # 求第 i 个 subject 所关联的 object
-            if all_bboxes[i]['category_id'] != subject_category_id:
-                continue
-            seen = set()
-            candidates = []
-            arr = []
-            for j in range(N):
-
-                pos_flag_count = sum(
-                    list(
-                        map(
-                            lambda x: 1 if x else 0,
-                            bbox_relative_pos(
-                                all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
-                            ),
-                        )
-                    )
-                )
-                if pos_flag_count > 1:
-                    continue
-                if (
-                    all_bboxes[j]['category_id'] != object_category_id
-                    or j in used
-                    or dis[i][j] == MAX_DIS_OF_POINT
-                ):
-                    continue
-                left, right, _, _ = bbox_relative_pos(
-                    all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
-                )  # 由  pos_flag_count 相关逻辑保证本段逻辑准确性
-                if left or right:
-                    one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
-                else:
-                    one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
-                if dis[i][j] > one_way_dis:
-                    continue
-                arr.append((dis[i][j], j))
-
-            arr.sort(key=lambda x: x[0])
-            if len(arr) > 0:
-                """
-                bug: 离该subject 最近的 object 可能跨越了其它的 subject。
-                比如 [this subect] [some sbuject] [the nearest object of subject]
-                """
-                if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
-
-                    candidates.append(arr[0][1])
-                    seen.add(arr[0][1])
-
-            # 已经获取初始种子
-            for j in set(candidates):
-                tmp = []
-                for k in range(i + 1, N):
-                    pos_flag_count = sum(
-                        list(
-                            map(
-                                lambda x: 1 if x else 0,
-                                bbox_relative_pos(
-                                    all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
-                                ),
-                            )
-                        )
-                    )
-
-                    if pos_flag_count > 1:
-                        continue
-
-                    if (
-                        all_bboxes[k]['category_id'] != object_category_id
-                        or k in used
-                        or k in seen
-                        or dis[j][k] == MAX_DIS_OF_POINT
-                        or dis[j][k] > dis[i][j]
-                    ):
-                        continue
-
-                    is_nearest = True
-                    for ni in range(i + 1, N):
-                        if ni in (j, k) or ni in used or ni in seen:
-                            continue
-
-                        if not float_gt(dis[ni][k], dis[j][k]):
-                            is_nearest = False
-                            break
-
-                    if is_nearest:
-                        nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
-                        n_dis = bbox_distance(
-                            all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
-                        )
-                        if float_gt(dis[i][j], n_dis):
-                            continue
-                        tmp.append(k)
-                        seen.add(k)
-
-                candidates = tmp
-                if len(candidates) == 0:
-                    break
-
-            # 已经获取到某个 figure 下所有的最靠近的 captions，以及最靠近这些 captions 的 captions 。
-            # 先扩一下 bbox，
-            ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
-            ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
-
-            # 分成了 4 个截取空间，需要计算落在每个截取空间下 objects 合并后占据的矩形面积
-            caption_poses = [
-                [ox0, oy0, ix0, oy1],
-                [ox0, oy0, ox1, iy0],
-                [ox0, iy1, ox1, oy1],
-                [ix1, oy0, ox1, oy1],
-            ]
-
-            caption_areas = []
-            for bbox in caption_poses:
-                embed_arr = []
-                for idx in seen:
-                    if (
-                        calculate_overlap_area_in_bbox1_area_ratio(
-                            all_bboxes[idx]['bbox'], bbox
-                        )
-                        > CAPATION_OVERLAP_AREA_RATIO
-                    ):
-                        embed_arr.append(idx)
-
-                if len(embed_arr) > 0:
-                    embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
-                    embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
-                    embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
-                    embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
-                    caption_areas.append(
-                        int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
-                    )
-                else:
-                    caption_areas.append(0)
-
-            subject_object_relation_map[i] = []
-            if max(caption_areas) > 0:
-                max_area_idx = caption_areas.index(max(caption_areas))
-                caption_bbox = caption_poses[max_area_idx]
-
-                for j in seen:
-                    if (
-                        calculate_overlap_area_in_bbox1_area_ratio(
-                            all_bboxes[j]['bbox'], caption_bbox
-                        )
-                        > CAPATION_OVERLAP_AREA_RATIO
-                    ):
-                        used.add(j)
-                        subject_object_relation_map[i].append(j)
-
-        for i in sorted(subject_object_relation_map.keys()):
-            result = {
-                'subject_body': all_bboxes[i]['bbox'],
-                'all': all_bboxes[i]['bbox'],
-                'score': all_bboxes[i]['score'],
-            }
-
-            if len(subject_object_relation_map[i]) > 0:
-                x0 = min(
-                    [all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
-                )
-                y0 = min(
-                    [all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
-                )
-                x1 = max(
-                    [all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
-                )
-                y1 = max(
-                    [all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
-                )
-                result['object_body'] = [x0, y0, x1, y1]
-                result['all'] = [
-                    min(x0, all_bboxes[i]['bbox'][0]),
-                    min(y0, all_bboxes[i]['bbox'][1]),
-                    max(x1, all_bboxes[i]['bbox'][2]),
-                    max(y1, all_bboxes[i]['bbox'][3]),
-                ]
-            ret.append(result)
-
-        total_subject_object_dis = 0
-        # 计算已经配对的 distance 距离
-        for i in subject_object_relation_map.keys():
-            for j in subject_object_relation_map[i]:
-                total_subject_object_dis += bbox_distance(
-                    all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
-                )
-
-        # 计算未匹配的 subject 和 object 的距离（非精确版）
-        with_caption_subject = set(
-            [
-                key
-                for key in subject_object_relation_map.keys()
-                if len(subject_object_relation_map[i]) > 0
-            ]
-        )
-        for i in range(N):
-            if all_bboxes[i]['category_id'] != object_category_id or i in used:
-                continue
-            candidates = []
-            for j in range(N):
-                if (
-                    all_bboxes[j]['category_id'] != subject_category_id
-                    or j in with_caption_subject
-                ):
-                    continue
-                candidates.append((dis[i][j], j))
-            if len(candidates) > 0:
-                candidates.sort(key=lambda x: x[0])
-                total_subject_object_dis += candidates[0][1]
-                with_caption_subject.add(j)
-        return ret, total_subject_object_dis
-
-    def get_imgs(self, page_no: int):
-        with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
-        with_footnotes, _ = self.__tie_up_category_by_distance(
-            page_no, 3, CategoryId.ImageFootnote
-        )
-        ret = []
-        N, M = len(with_captions), len(with_footnotes)
-        assert N == M
-        for i in range(N):
-            record = {
-                'score': with_captions[i]['score'],
-                'img_caption_bbox': with_captions[i].get('object_body', None),
-                'img_body_bbox': with_captions[i]['subject_body'],
-                'img_footnote_bbox': with_footnotes[i].get('object_body', None),
-            }
-
-            x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
-            y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
-            x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
-            y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
-            record['bbox'] = [x0, y0, x1, y1]
-            ret.append(record)
-        return ret
-
-    def get_tables(
-        self, page_no: int
-    ) -> list:  # 3个坐标， caption, table主体，table-note
-        with_captions, _ = self.__tie_up_category_by_distance(page_no, 5, 6)
-        with_footnotes, _ = self.__tie_up_category_by_distance(page_no, 5, 7)
-        ret = []
-        N, M = len(with_captions), len(with_footnotes)
-        assert N == M
-        for i in range(N):
-            record = {
-                'score': with_captions[i]['score'],
-                'table_caption_bbox': with_captions[i].get('object_body', None),
-                'table_body_bbox': with_captions[i]['subject_body'],
-                'table_footnote_bbox': with_footnotes[i].get('object_body', None),
-            }
-
-            x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
-            y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
-            x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
-            y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
-            record['bbox'] = [x0, y0, x1, y1]
-            ret.append(record)
-        return ret
-
-    def get_equations(self, page_no: int) -> list:  # 有坐标，也有字
-        inline_equations = self.__get_blocks_by_type(
-            ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
-        )
-        interline_equations = self.__get_blocks_by_type(
-            ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
-        )
-        interline_equations_blocks = self.__get_blocks_by_type(
-            ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
-        )
-        return inline_equations, interline_equations, interline_equations_blocks
-
-    def get_discarded(self, page_no: int) -> list:  # 自研模型，只有坐标
-        blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.ABANDON.value, page_no)
-        return blocks
-
-    def get_text_blocks(self, page_no: int) -> list:  # 自研模型搞的，只有坐标，没有字
-        blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.PLAIN_TEXT.value, page_no)
-        return blocks
-
-    def get_title_blocks(self, page_no: int) -> list:  # 自研模型，只有坐标，没字
-        blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.TITLE.value, page_no)
-        return blocks
-
-    def get_ocr_text(self, page_no: int) -> list:  # paddle 搞的，有字也有坐标
-        text_spans = []
-        model_page_info = self.__model_list[page_no]
-        layout_dets = model_page_info['layout_dets']
-        for layout_det in layout_dets:
-            if layout_det['category_id'] == '15':
-                span = {
-                    'bbox': layout_det['bbox'],
-                    'content': layout_det['text'],
-                }
-                text_spans.append(span)
-        return text_spans
-
-    def get_all_spans(self, page_no: int) -> list:
-
-        def remove_duplicate_spans(spans):
-            new_spans = []
-            for span in spans:
-                if not any(span == existing_span for existing_span in new_spans):
-                    new_spans.append(span)
-            return new_spans
-
-        all_spans = []
-        model_page_info = self.__model_list[page_no]
-        layout_dets = model_page_info['layout_dets']
-        allow_category_id_list = [3, 5, 13, 14, 15]
-        """当成span拼接的"""
-        #  3: 'image', # 图片
-        #  5: 'table',       # 表格
-        #  13: 'inline_equation',     # 行内公式
-        #  14: 'interline_equation',      # 行间公式
-        #  15: 'text',      # ocr识别文本
-        for layout_det in layout_dets:
-            category_id = layout_det['category_id']
-            if category_id in allow_category_id_list:
-                span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
-                if category_id == 3:
-                    span['type'] = ContentType.Image
-                elif category_id == 5:
-                    # 获取table模型结果
-                    latex = layout_det.get('latex', None)
-                    html = layout_det.get('html', None)
-                    if latex:
-                        span['latex'] = latex
-                    elif html:
-                        span['html'] = html
-                    span['type'] = ContentType.Table
-                elif category_id == 13:
-                    span['content'] = layout_det['latex']
-                    span['type'] = ContentType.InlineEquation
-                elif category_id == 14:
-                    span['content'] = layout_det['latex']
-                    span['type'] = ContentType.InterlineEquation
-                elif category_id == 15:
-                    span['content'] = layout_det['text']
-                    span['type'] = ContentType.Text
-                all_spans.append(span)
-        return remove_duplicate_spans(all_spans)
-
-    def get_page_size(self, page_no: int):  # 获取页面宽高
-        # 获取当前页的page对象
-        page = self.__docs[page_no]
-        # 获取当前页的宽高
-        page_w = page.rect.width
-        page_h = page.rect.height
-        return page_w, page_h
-
-    def __get_blocks_by_type(
-        self, type: int, page_no: int, extra_col: list[str] = []
-    ) -> list:
-        blocks = []
-        for page_dict in self.__model_list:
-            layout_dets = page_dict.get('layout_dets', [])
-            page_info = page_dict.get('page_info', {})
-            page_number = page_info.get('page_no', -1)
-            if page_no != page_number:
-                continue
-            for item in layout_dets:
-                category_id = item.get('category_id', -1)
-                bbox = item.get('bbox', None)
-
-                if category_id == type:
-                    block = {
-                        'bbox': bbox,
-                        'score': item.get('score'),
-                    }
-                    for col in extra_col:
-                        block[col] = item.get(col, None)
-                    blocks.append(block)
-        return blocks
-
-    def get_model_list(self, page_no):
-        return self.__model_list[page_no]
-
-
-if __name__ == '__main__':
-    drw = DiskReaderWriter(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
-        model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-
-    if 1:
-        model_list = json.loads(
-            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
-        )
-        pdf_bytes = drw.read(
-            '/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
-        )
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-        for i in range(7):
-            print(magic_model.get_imgs(i))