Commit 826086d2 authored by zhougaofeng's avatar zhougaofeng
Browse files

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
parent 57aaa1cf
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Shrink two partially overlapping bboxes apart along one axis.

    The axis with the *smaller* overlap is split at the midpoint of the
    overlapping edges, pulling each box back by 0.25 from the split line.
    Returns (new_bbox1, new_bbox2, drop_reason); drop_reason is None on
    success, or DropReason.NEGATIVE_BBOX_AREA when shrinking would create
    a degenerate (zero/negative area) box, in which case the original
    boxes are returned untouched.
    """
    if not _is_part_overlap(bbox1, bbox2):
        return bbox1, bbox2, None
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2
    overlap_w = min(b_x1, a_x1) - max(b_x0, a_x0)
    overlap_h = min(b_y1, a_y1) - max(b_y0, a_y0)
    if overlap_h > overlap_w:
        # Overlap is taller than wide -> separate the boxes horizontally.
        if b_x1 >= a_x1:
            mid = (b_x0 + a_x1) // 2
            a_x1 = min(mid - 0.25, a_x1)
            b_x0 = max(mid + 0.25, b_x0)
        else:
            mid = (a_x0 + b_x1) // 2
            a_x0 = max(mid + 0.25, a_x0)
            b_x1 = min(mid - 0.25, b_x1)
    else:
        # Otherwise separate them vertically.
        if b_y1 >= a_y1:
            mid = (b_y0 + a_y1) // 2
            b_y0 = max(mid + 0.25, b_y0)
            a_y1 = min(a_y1, mid - 0.25)
        else:
            mid = (a_y0 + b_y1) // 2
            b_y1 = min(b_y1, mid - 0.25)
            a_y0 = max(mid + 0.25, a_y0)
    both_valid = a_x1 > a_x0 and a_y1 > a_y0 and b_y1 > b_y0 and b_x1 > b_x0
    if both_valid:
        return [a_x0, a_y0, a_x1, a_y1], [b_x0, b_y0, b_x1, b_y1], None
    return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
def _remove_overlap_between_bboxes(arr):
    """Resolve pairwise overlaps among scored bboxes.

    Args:
        arr: list of ``{"bbox": [x0, y0, x1, y1], "score": float}`` dicts.

    Returns:
        (res, drop_reasons) where ``res[i]`` is the (possibly shrunk) entry
        kept at position i or None when it was dropped, and ``drop_reasons``
        collects one reason per overlap that could not be resolved by
        shrinking.
    """
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N
    # An entry fully contained in another entry is discarded outright.
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
                keeps[i] = False
    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue
            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
            if drop_reason is None:
                # Overlap resolved by shrinking: keep both with updated boxes.
                v["bbox"] = bbox1
                res[i]["bbox"] = bbox2
            else:
                # Unresolvable overlap: keep only the higher-scoring entry.
                if v["score"] > res[i]["score"]:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                # BUGFIX: record the reason itself; the original did
                # `drop_reasons.append(drop_reasons)`, appending the
                # accumulator list into itself instead of the reason.
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons
def remove_overlap_between_bbox_for_span(spans):
    """De-overlap span bboxes in place and return the surviving spans.

    Spans without a "score" key default to 0.1. Returns (kept_spans,
    drop_reasons) as produced by _remove_overlap_between_bboxes.
    """
    candidates = [{"bbox": s["bbox"], "score": s.get("score", 0.1)} for s in spans]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
    kept = []
    for span, entry in zip(spans, resolved):
        if entry is None:
            continue
        # Write the (possibly shrunk) bbox back onto the original span.
        span["bbox"] = entry["bbox"]
        kept.append(span)
    return kept, drop_reasons
def remove_overlap_between_bbox_for_block(all_bboxes):
    """De-overlap block bboxes in place and return the surviving blocks.

    Each entry of all_bboxes is a sequence whose first four items are the
    bbox and whose last item is the score. Returns (kept_blocks,
    drop_reasons).
    """
    candidates = [{"bbox": item[:4], "score": item[-1]} for item in all_bboxes]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
    kept = []
    for block, entry in zip(all_bboxes, resolved):
        if entry is None:
            continue
        # Splice the adjusted coordinates back into the original record.
        block[:4] = entry["bbox"]
        kept.append(block)
    return kept, drop_reasons
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
    """Return the area of a bbox given as (x0, y0, x1, y1)."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
def rectangle_position_determination(rect, p_width):
    """Decide whether a rectangle sits near the page's vertical midline.

    Args:
        rect (list): rectangle as [x1, y1, x2, y2].
        p_width (int): page width.

    Returns:
        bool: True when the rectangle crosses the midline or its nearer
        edge lies within 20% of the page width from it.
    """
    x_axis = p_width / 2
    # Crossing the midline counts as "near" immediately.
    if rect[0] < x_axis and rect[2] > x_axis:
        return True
    # Otherwise measure the gap between the midline and the nearer edge.
    if rect[0] > x_axis:
        distance = rect[0] - x_axis
    else:
        distance = x_axis - rect[2]
    return distance < p_width * 0.2
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Remove text blocks that sit on colored strip backgrounds.

    Scans the page's vector drawings for filled, non-white, wide-and-flat
    rectangles near the top of the page around the vertical midline, then
    pulls any text block (fully contained, or overlapping by more than 60%
    of the smaller box) out of remain_text_blocks.

    Args:
        remain_text_blocks (list): remaining text blocks (mutated in place).
        page (Page): pymupdf page object.

    Returns:
        tuple: (remain_text_blocks, removed colored_strip_textblocks).
    """
    colored_strip_textblocks = []
    if not remain_text_blocks:
        return remain_text_blocks, colored_strip_textblocks
    p_width, p_height = page.rect.width, page.rect.height
    strip_rects = []
    for drawing in page.get_cdrawings():
        fill = drawing['fill'] if 'fill' in drawing else None
        is_filled = bool(fill) and fill != (1.0, 1.0, 1.0)  # skip transparent/white fills
        rect = drawing['rect']
        big_enough = __area(rect) > 100  # skip tiny rectangles
        near_midline = rectangle_position_determination(rect, p_width)
        in_top_third = rect[3] < p_height * 0.3  # bottom edge above 30% of page height
        wide_and_flat = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4  # aspect ratio > 4
        if is_filled and big_enough and near_midline and in_top_third and wide_and_flat:
            strip_rects.append(rect)
    for colored_strip_block_bbox in strip_rects:
        for text_block in remain_text_blocks:
            text_bbox = text_block['bbox']
            covered = _is_in(text_bbox, colored_strip_block_bbox) or (
                _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6)
            if covered:
                logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)
    for colored_strip_textblock in colored_strip_textblocks:
        if colored_strip_textblock in remain_text_blocks:
            remain_text_blocks.remove(colored_strip_textblock)
    return remain_text_blocks, colored_strip_textblocks
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """
    Remove headers, footers and page numbers from one page.

    Deletion happens at line level; if a text block becomes empty afterwards
    it is moved into the removal list. (The "headder" typo in the name is
    kept for caller compatibility.)

    Returns:
        (image_bbox_remain, table_bbox_remain, text_block_remain,
         text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)
    """
    header = []
    footer = []
    # Merge all model-detected header boxes into one bounding rectangle.
    if len(header) == 0:
        model_header = header_bboxs
        if model_header:
            x0 = min([x for x, _, _, _ in model_header])
            y0 = min([y for _, y, _, _ in model_header])
            x1 = max([x1 for _, _, x1, _ in model_header])
            y1 = max([y1 for _, _, _, y1 in model_header])
            header = [x0, y0, x1, y1]
    # Same merge for footer boxes.
    if len(footer) == 0:
        model_footer = footer_bboxs
        if model_footer:
            x0 = min([x for x, _, _, _ in model_footer])
            y0 = min([y for _, y, _, _ in model_footer])
            x1 = max([x1 for _, _, x1, _ in model_footer])
            y1 = max([y1 for _, _, _, y1 in model_footer])
            footer = [x0, y0, x1, y1]
    # Body content runs from the header's bottom edge to the footer's top edge.
    header_y0 = 0 if len(header) == 0 else header[3]
    footer_y0 = page_h if len(footer) == 0 else footer[1]
    # Page-number boxes tighten those boundaries further.
    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)
    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]
    """The header/footer boundaries are computed above; deletion starts below."""
    text_block_to_remove = []
    # First inspect every text block.
    for blk in text_raw_blocks:
        if len(blk['lines']) > 0:
            for line in blk['lines']:
                line_del = []
                for span in line['spans']:
                    span_del = []
                    if span['bbox'][3] < header_y0:
                        span_del.append(span)
                    elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer):
                        span_del.append(span)
                    # NOTE(review): nesting reconstructed from statement order
                    # (SOURCE indentation was stripped); spans appear to be
                    # removed while line['spans'] is being iterated -- confirm
                    # against the upstream file.
                    for span in span_del:
                        line['spans'].remove(span)
                if not line['spans']:
                    line_del.append(line)
                for line in line_del:
                    blk['lines'].remove(line)
        else:
            # if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)
    """Sometimes a pageNo box is so small that it slightly overlaps content_boundry
    and lands in the body text, so page numbers are additionally removed at span
    granularity."""
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if _is_in_or_part_overlap(pagenobox, block['bbox']):  # delete the page number at span level
                    for line in block['lines']:
                        for span in line['spans']:
                            if _is_in_or_part_overlap(pagenobox, span['bbox']):
                                # span['text'] = ''
                                span['tag'] = PAGE_NO
                                # If this span is the only span of the only line,
                                # drop the whole block as well.
                                if len(line['spans']) == 1 and len(block['lines']) == 1:
                                    page_no_block_2_remove.append(block)
    else:
        # Fallback heuristic: the bottom-most block is a page number when it has
        # exactly one line with one span whose text contains digits, spaces or
        # symbols but no letters.
        if len(text_raw_blocks) > 0:
            text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
            last_block = text_raw_blocks[0]
            if len(last_block['lines']) == 1:
                last_line = last_block['lines'][0]
                if len(last_line['spans']) == 1:
                    last_span = last_line['spans'][0]
                    if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]',
                                                                                                               last_span[
                                                                                                                   'text']):
                        last_span['tag'] = PAGE_NO
                        page_no_block_2_remove.append(last_block)
    for b in page_no_block_2_remove:
        text_block_to_remove.append(b)
    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)
    text_block_remain = text_raw_blocks
    # Images and tables outside the content boundary are dropped as well.
    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
import math
from magic_pdf.libs.boxbase import is_vbox_on_side
from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
def detect_non_horizontal_texts(result_dict):
    """
    Detect watermarks and vertical margin notes in the document.

    Both are found the same way: blocks whose (bbox, text) pair repeats on
    more than half of the top-level entries. The text direction then
    disambiguates the two: a tilted baseline (5-85 degrees) marks a
    watermark, a near-vertical baseline (85-105 degrees) marks a vertical
    margin note.

    Parameters
    ----------
    result_dict : dict
        Parse result; pages live under "page_*" keys, blocks under
        "block_*" keys. Blocks carry "bbox", "text" and optionally "dir"
        (a baseline direction vector).

    Returns
    -------
    result_dict : dict
        The same dictionary with "is_watermark" and
        "is_vertical_margin_note" flags (0/1) added to every block.
    """
    potential_watermarks = {}
    potential_margin_notes = {}
    # Pass 1: count how often each (bbox, text) pair appears with a tilted
    # or vertical baseline across pages.
    for page_id, page_content in result_dict.items():
        if page_id.startswith("page_"):
            for block_id, block_data in page_content.items():
                if block_id.startswith("block_"):
                    if "dir" in block_data:
                        coordinates_text = (block_data["bbox"], block_data["text"])
                        angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
                        angle = abs(math.degrees(angle))
                        if angle > 5 and angle < 85:  # tilted -> watermark candidate
                            potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
                        if angle > 85 and angle < 105:  # near-vertical -> margin-note candidate
                            potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1
    # A candidate is confirmed when it repeats on more than half of the
    # top-level entries (NOTE: len(result_dict) counts non-page keys too).
    watermark_threshold = len(result_dict) // 2
    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
    margin_note_threshold = len(result_dict) // 2
    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
    # Pass 2: flag every block. BUGFIX: only visit "block_*" entries here,
    # mirroring pass 1 -- the original indexed every entry of the page dict,
    # which raised KeyError when a page carried non-block metadata keys.
    for page_id, blocks in result_dict.items():
        if page_id.startswith("page_"):
            for block_id, block_data in blocks.items():
                if block_id.startswith("block_"):
                    coordinates_text = (block_data["bbox"], block_data["text"])
                    block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
                    block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0
    return result_dict
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import re
def __is_a_word(sentence):
    """Return True for a single Chinese character or a short (<= 2 chars)
    alphanumeric token; False otherwise."""
    # A single CJK character counts as a word.
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    # So does a purely alphanumeric run of at most two characters.
    if re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
        return True
    return False
def __get_text_color(num):
    """Decode a packed 24-bit 0xRRGGBB integer into an (R, G, B) tuple."""
    red = (num >> 16) & 255
    green = (num >> 8) & 255
    blue = num & 255
    return red, green, blue
def __is_empty_side_box(text_block):
    """Return True when a side block has no visible content, i.e. every span
    is either blank or rendered in pure white ink."""
    for line in text_block['lines']:
        for span in line['spans']:
            rgb = __get_text_color(span['color'])
            # Non-empty text in a non-white color means the block is visible.
            if len(span['text'].strip()) > 0 and rgb != (255, 255, 255):
                return False
    return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical, watermark-like and rotated text blocks that sit on
    the page sides.

    Removed blocks are tagged (VERTICAL_TEXT or ROTATE_TEXT) and returned
    alongside the filtered list. Input format follows
    test/assets/papre/pymu_textblocks.json.
    """
    removed_text_block = []
    for block in pymu_text_block:
        lines = block['lines']
        # Only blocks hugging the left/right page edges are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue
        every_line_one_word = all(__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans']) > 0)
        every_line_one_span = all(len(line['spans']) == 1 for line in lines)
        if every_line_one_word and len(lines) > 1 and every_line_one_span:
            x_starts = [int(line['spans'][0]['bbox'][0]) for line in lines if len(line['spans']) > 0]
            # Vertically stacked text: at least two lines all sharing the same x0.
            if len(set(x_starts)) == 1 and len(x_starts) > 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue
        # Any line whose direction is not horizontal taints the whole block.
        if any(line['dir'] != (1, 0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)
    for block in removed_text_block:
        pymu_text_block.remove(block)
    return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left/right boundaries of the body text from the removed
    side blocks.

    A block whose right edge is left of the page midline pushes the left
    boundary right; any other block pulls the right boundary left. A
    1-unit margin is added on both sides.
    """
    mid = page_width / 2
    left_edges = [entry['bbox'][2] for entry in rotate_bbox if entry['bbox'][2] < mid]
    right_edges = [entry['bbox'][0] for entry in rotate_bbox if entry['bbox'][2] >= mid]
    left_x = max([0] + left_edges)
    right_x = min([page_width] + right_edges)
    return left_x + 1, right_x - 1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank blocks (no visible text) sitting on the page sides.

    Removed blocks are tagged EMPTY_SIDE_BLOCK and returned alongside the
    filtered list. Input format follows
    test/assets/papre/pymu_textblocks.json.
    """
    removed_text_block = []
    for block in pymu_text_block:
        # Only blocks hugging the left/right page edges are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue
        if __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)
    for block in removed_text_block:
        pymu_text_block.remove(block)
    return pymu_text_block, removed_text_block
\ No newline at end of file
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字blcok上的图片bbox
"""
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """
    Arbitrate overlaps between image/table/equation/text bboxes.

    text_raw_blocks is the structure taken straight from pymupdf; see
    test/assets/papre/pymu_textblocks.json for a sample.
    Current (deliberately blunt) strategy:
    1. Drop equations that sit on images.
    2. Drop equations that sit on tables.
    2. When an image partially overlaps a text block, drop the image first.
    3. When images overlap each other, their bboxes should be adjusted so
       they no longer overlap (not implemented yet -- for now both images
       are set aside).
    4. Drop text fully contained inside images or tables.
    5. Drop text sitting on tables.
    """
    text_block_removed = []
    images_backup = []
    # Remove text blocks that lie on top of an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            text_bbox = text_block["bbox"]
            if _is_in(text_bbox, image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                text_block_removed.append(text_block)
    # Remove text blocks that lie on top of a table.
    for table_box in tables:
        for text_block in text_raw_blocks:
            text_bbox = text_block["bbox"]
            if _is_in(text_bbox, table_box):
                text_block['tag'] = ON_TABLE_TEXT
                text_block_removed.append(text_block)
    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)
    # Step 1: drop equation boxes that appear on images.
    temp = []
    for image_box in images:
        for eq1 in interline_equations:
            if _is_in_or_part_overlap(image_box, eq1[:4]):
                temp.append(eq1)
        for eq2 in inline_equations:
            if _is_in_or_part_overlap(image_box, eq2[:4]):
                temp.append(eq2)
    for eq in temp:
        if eq in interline_equations:
            interline_equations.remove(eq)
        if eq in inline_equations:
            inline_equations.remove(eq)
    # Step 2: drop equation boxes that appear on tables.
    temp = []
    for table_box in tables:
        for eq1 in interline_equations:
            if _is_in_or_part_overlap(table_box, eq1[:4]):
                temp.append(eq1)
        for eq2 in inline_equations:
            if _is_in_or_part_overlap(table_box, eq2[:4]):
                temp.append(eq2)
    for eq in temp:
        if eq in interline_equations:
            interline_equations.remove(eq)
        if eq in inline_equations:
            inline_equations.remove(eq)
    # An image overlapping text loses: the image is dropped (backed up).
    for image_box in images:
        for text_block in text_raw_blocks:
            text_bbox = text_block["bbox"]
            if _is_in_or_part_overlap(image_box, text_bbox):
                images_backup.append(image_box)
                break
    for image_box in images_backup:
        images.remove(image_box)
    # Images overlapping each other: exclude both from layout computation for now.
    images_dup_index = []
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                images_dup_index.append(i)
                images_dup_index.append(j)
    dup_idx = set(images_dup_index)
    for img_id in dup_idx:
        images_backup.append(images[img_id])
        images[img_id] = None
    images = [img for img in images if img is not None]
    # If an interline equation overlaps a text block, the text block would be
    # stashed here (keeping the equation's size) so it cannot disturb layout
    # computation, and merged back once layout is done. Currently disabled.
    text_block_removed_2 = []
    # for text_block in text_raw_blocks:
    #     text_bbox = text_block["bbox"]
    #     for eq in interline_equations:
    #         ratio = calculate_overlap_area_2_minbox_area_ratio(text_bbox, eq[:4])
    #         if ratio>0.05:
    #             text_block['tag'] = "belong-to-interline-equation"
    #             text_block_removed_2.append(text_block)
    #             break
    # for tb in text_block_removed_2:
    #     if tb in text_raw_blocks:
    #         text_raw_blocks.remove(tb)
    # text_block_removed = text_block_removed + text_block_removed_2
    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Check for horizontal overlap between body text blocks.

    When two blocks inside the header/footer-clipped region overlap
    left-to-right, an equation was most likely missed by detection, so the
    caller stops processing this PDF. Returns True on overlap.
    """
    if not text_blocks:
        return False
    page_min_y = 0
    page_max_y = max(blk['bbox'][3] for blk in text_blocks)
    # Lower clip boundary derived from header boxes (max y0); page top when none.
    clip_y0 = max([item[1] for item in header]) if len(header) > 0 else page_min_y
    # Upper clip boundary derived from footer boxes (min y1); page bottom when none.
    clip_y1 = min([item[3] for item in footer]) if len(footer) > 0 else page_max_y
    # Keep only blocks fully inside the clipped body region.
    txt_bboxes = [blk["bbox"] for blk in text_blocks
                  if blk["bbox"][1] >= clip_y0 and blk["bbox"][3] <= clip_y1]
    for i, first in enumerate(txt_bboxes):
        for second in txt_bboxes[i + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """
    Check for horizontal overlap between useful blocks.

    Such an overlap usually means an equation was missed by detection, so
    callers abort further processing of this PDF.

    Args:
        useful_blocks: dicts each carrying a "bbox" [x0, y0, x1, y1].

    Returns:
        (overlap_found, smaller_bbox, larger_bbox); the bbox slots are None
        when no overlap exists. (Annotation fixed: the original declared
        ``-> bool`` while returning a 3-tuple.)
    """
    if len(useful_blocks) == 0:
        # BUGFIX: return the same 3-tuple shape as every other exit path;
        # the original bare `return False` broke callers that unpack
        # three values.
        return False, None, None
    page_min_y = 0
    page_max_y = max(yy['bbox'][3] for yy in useful_blocks)
    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block["bbox"]
        # NOTE(review): with page_min_y == 0 and page_max_y being the global
        # max this filter rejects only boxes with negative y0 -- kept for
        # behavior parity.
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)
    for i in range(len(useful_bboxes)):
        for j in range(i + 1, len(useful_bboxes)):
            area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
            area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
            if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
                # Report the smaller box first, the larger one second.
                if area_i > area_j:
                    return True, useful_bboxes[j], useful_bboxes[i]
                else:
                    return True, useful_bboxes[i], useful_bboxes[j]
    return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:
    """Fix overly large horizontal gaps between inline text fragments.

    Whenever a line occupies the same vertical band (identical int(y0) and
    int(y1)) as the previous line of its block, the two were split from one
    visual row, so a space is prefixed to the later line's first span.
    Operates on the 'preproc_blocks' of every page and returns the mutated
    dict.
    """
    for i in range(len(pdf_info_dict)):
        for block in pdf_info_dict[f'page_{i}']['preproc_blocks']:
            prev_bbox = (0, 0, 0, 0)
            for line in block['lines']:
                _, y0, _, y1 = line['bbox']
                # Same row as the previous line -> glue with a leading space.
                if int(y0) == int(prev_bbox[1]) and int(y1) == int(prev_bbox[3]):
                    head = line['spans'][0]
                    head['text'] = ' ' + head['text']
                prev_bbox = line['bbox']
    return pdf_info_dict
"""
统计处需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models
max_seq_len: 1024
length_aware: False
load_pretrained: True
pretrained: ./models/pytorch_model.bin
tokenizer_config:
path: ./models
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
config:
device: cpu
layout: True
formula: False
table_config:
model: TableMaster
is_table_recog_enable: False
max_time: 400
weights:
layout: Layout/model_final.pth
mfd: MFD/weights.pt
mfr: MFR/UniMERNet
struct_eqtable: TabRec/StructEqTable
TableMaster: TabRec/TableMaster
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for reading and writing documents from a backing store.

    Concrete implementations (disk, S3, ...) must support both a text mode and
    a binary mode, selected via the ``MODE_TXT`` / ``MODE_BIN`` constants.
    """

    # Mode constants accepted by read()/write(); values are part of the public API.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Return the full content at *path*, as str (text) or bytes (binary)."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Store *content* at *path* using the given mode."""
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Return up to *limit* bytes starting at *offset* (to the end when None)."""
        raise NotImplementedError
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """Reader/writer backed by the local filesystem.

    Relative paths are resolved against ``parent_path``; absolute paths are
    used as-is.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        # Base directory against which relative paths are resolved.
        self.path = parent_path
        # Text encoding used for MODE_TXT reads and writes.
        self.encoding = encoding

    def _abs_path(self, path):
        """Resolve *path* against the configured parent directory."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a file and return str (MODE_TXT) or bytes (MODE_BIN).

        Raises:
            Exception: if the file does not exist.
            ValueError: if *mode* is not one of the supported modes.
        """
        abspath = self._abs_path(path)
        if not os.path.exists(abspath):
            # Fixed message wording ("no exists" -> "does not exist").
            logger.error(f"file {abspath} does not exist")
            raise Exception(f"file {abspath} does not exist")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        if mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write *content* to a file, creating parent directories as needed."""
        abspath = self._abs_path(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok=True avoids the check-then-create race of the previous
            # os.path.exists() guard when several writers run concurrently.
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to *limit* bytes starting at *offset* (all remaining if None)."""
        abspath = self._abs_path(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            # file.read(None) reads to EOF, matching the limit=None contract.
            return f.read(limit)
if __name__ == "__main__":
    # Manual smoke tests; flip the if 0 / if 1 switches to pick a scenario.
    if 0:
        # Round-trip a small payload under a Windows-style parent directory.
        sample_path = "io/test/example.txt"
        writer = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
        # Write content to the file.
        writer.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
        # Read the content back.
        payload = writer.read(path=sample_path)
        if payload:
            logger.info(f"从 {sample_path} 读取的内容: {payload}")
    if 1:
        # Partial reads against a fixture file prepared out-of-band.
        reader = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        chunk = reader.read_offset("1.txt")
        assert chunk == b"ABCD!"
        chunk = reader.read_offset("1.txt", offset=1, limit=2)
        assert chunk == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """Reader/writer backed by an S3-compatible object store."""

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        # Prefix joined to relative keys (full s3:// URIs bypass it).
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with retries and the requested addressing style."""
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )

    def _full_path(self, rel_path):
        """Return *rel_path* unchanged when already an s3:// URI, else join with the parent path."""
        if rel_path.startswith("s3://"):
            return rel_path
        return join_path(self.path, rel_path)

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object; return text (MODE_TXT, decoded with *encoding*) or raw bytes (MODE_BIN)."""
        bucket_name, key = parse_bucket_key(self._full_path(s3_relative_path))
        body = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return body.decode(encoding)
        if mode == AbsReaderWriter.MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload *content*; text is encoded with *encoding*, binary is sent as-is."""
        s3_path = self._full_path(s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            payload = content.encode(encoding)
        elif mode == AbsReaderWriter.MODE_BIN:
            payload = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=payload, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Range-read bytes [offset, offset+limit); reads to end-of-object when *limit* is falsy."""
        bucket_name, key = parse_bucket_key(self._full_path(path))
        if limit:
            range_header = f"bytes={offset}-{offset+limit-1}"
        else:
            range_header = f"bytes={offset}-"
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
if __name__ == "__main__":
    # Manual smoke tests; flip the if 0 / if 1 switches to pick a scenario.
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
        )
        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            binary_data,  # bug fix: previously uploaded text_data in the binary stanza
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
        # Range Read text data from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json

        # Connection details come from the environment for this scenario.
        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        # NOTE(review): json.dumps on a str merely re-quotes it; json.loads may
        # have been intended here -- confirm before changing, since the sliced
        # range is not guaranteed to be a complete JSON document.
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return the record's data source, falling back to the legacy "file_source" key."""
    source = jso.get("data_source")
    return jso.get("file_source") if source is None else source
def get_data_type(jso: dict):
    """Return the record's data type, falling back to the legacy "file_type" key."""
    dtype = jso.get("data_type")
    return jso.get("file_type") if dtype is None else dtype
def get_bookid(jso: dict):
    """Return the book id, falling back to the legacy "original_file_id" key."""
    bid = jso.get("bookid")
    return jso.get("original_file_id") if bid is None else bid
def exception_handler(jso: dict, e):
    """Log exception *e* and mark *jso* as dropped; the annotated *jso* is returned."""
    logger.exception(e)
    # Flag the record for downstream filtering and record why it was dropped.
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso
def get_bookname(jso: dict):
    """Build the "<data_source>/<file_id>" identifier for a record."""
    return f"{get_data_source(jso)}/{jso.get('file_id')}"
def spark_json_extractor(jso: dict) -> dict:
    """Extract the fields needed downstream from a spark JSON record.

    Returns a dict holding the pdf type and the layout model output.
    Raises KeyError if either required key is missing.
    """
    field_map = (("_pdf_type", "_pdf_type"), ("model_list", "doc_layout_result"))
    return {dst: jso[src] for dst, src in field_map}
import os
from pathlib import Path
import click
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
    """Parse one PDF file, or every PDF under a directory, into *output_dir*."""
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        # Load the whole PDF as raw bytes via the disk reader/writer.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        # Parse one document; a failure is logged but does not abort a batch run.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        # Walk the whole tree and parse every PDF found.
        for root, dirs, files in os.walk(path):
            for file in files:
                # Case-insensitive suffix match so files like "A.PDF" are not
                # silently skipped (the previous check matched '.pdf' only).
                if file.lower().endswith('.pdf'):
                    doc_path = os.path.join(root, file)
                    logger.info(f'正在解析:{doc_path}')
                    parse_doc(doc_path)
    else:
        parse_doc(path)
# Script entry point: dispatch to the Click command.
if __name__ == '__main__':
    cli()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment