Commit 2df265c8 authored by zhougaofeng's avatar zhougaofeng
Browse files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
parent 826086d2
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字block上的图片bbox
"""
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """Resolve overlaps between bboxes extracted from the pdf.

    ``text_raw_blocks`` comes straight from pymupdf; see
    test/assets/papre/pymu_textblocks.json for a sample of the structure.

    Current (coarse) strategy:
    1. Drop equations that sit on an image.
    2. Drop equations that sit on a table.
    3. If an image partly overlaps a text block, discard the image.
    4. If two images overlap, both are temporarily excluded from layout
       computation (instead of shrinking their bboxes).
    5. Drop text blocks that lie completely inside an image.
    6. Drop text blocks that lie completely inside a table.

    Returns the (mutated) input lists plus the removed text blocks and the
    backed-up images so callers can restore them after layout.
    """
    text_block_removed = []
    images_backup = []
    removed_text_ids = set()  # identities already queued, to avoid duplicates

    def _queue_text_removal(block, tag):
        # The tag is always (re)assigned so that a later table match
        # overwrites an earlier image tag (original behaviour), but the
        # block itself is appended only once (bug fix: the returned list
        # used to contain duplicates when a block sat on several boxes).
        block['tag'] = tag
        if id(block) not in removed_text_ids:
            removed_text_ids.add(id(block))
            text_block_removed.append(block)

    # Drop text blocks that lie fully inside an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in(text_block["bbox"], image_box):
                _queue_text_removal(text_block, ON_IMAGE_TEXT)
    # Drop text blocks that lie fully inside a table.
    for table_box in tables:
        for text_block in text_raw_blocks:
            if _is_in(text_block["bbox"], table_box):
                _queue_text_removal(text_block, ON_TABLE_TEXT)
    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)

    def _drop_equations_on(boxes):
        # Collect every equation overlapping any of *boxes* (each equation
        # at most once), then remove them from both equation lists.
        doomed = []
        doomed_ids = set()
        for box in boxes:
            for eq in interline_equations + inline_equations:
                if id(eq) not in doomed_ids and _is_in_or_part_overlap(box, eq[:4]):
                    doomed_ids.add(id(eq))
                    doomed.append(eq)
        for eq in doomed:
            if eq in interline_equations:
                interline_equations.remove(eq)
            if eq in inline_equations:
                inline_equations.remove(eq)

    # Step 1: drop equation boxes that appear on images.
    _drop_equations_on(images)
    # Step 2: drop equation boxes that appear on tables.
    _drop_equations_on(tables)

    # An image that partly overlaps any text block is discarded.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in_or_part_overlap(image_box, text_block["bbox"]):
                images_backup.append(image_box)
                break
    for image_box in images_backup:
        images.remove(image_box)

    # Overlapping images: back both up and exclude them from layout.
    dup_idx = set()
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                dup_idx.add(i)
                dup_idx.add(j)
    for img_id in dup_idx:
        images_backup.append(images[img_id])
        images[img_id] = None
    images = [img for img in images if img is not None]

    # Text blocks overlapping interline equations used to be merged/removed
    # here via an IOU check (see git history); kept as an empty list purely
    # for interface compatibility with callers unpacking eight values.
    text_block_removed_2 = []
    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Return True if any two text blocks in the main body overlap horizontally.

    Such an overlap usually means an equation was not detected, so the pdf
    should not be processed further. ``header``/``footer`` are lists of
    bbox-like sequences used to clip the vertical band that is examined.
    """
    if not text_blocks:
        return False
    page_top = 0
    page_bottom = max(blk['bbox'][3] for blk in text_blocks)
    # Clip band: below the lowest header edge, above the highest footer edge.
    clip_top = max((item[1] for item in header), default=page_top)
    clip_bottom = min((item[3] for item in footer), default=page_bottom)
    band_bboxes = [
        blk["bbox"]
        for blk in text_blocks
        if blk["bbox"][1] >= clip_top and blk["bbox"][3] <= clip_bottom
    ]
    for idx, box_a in enumerate(band_bboxes):
        for box_b in band_bboxes[idx + 1:]:
            if _is_left_overlap(box_a, box_b) or _is_left_overlap(box_b, box_a):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """Check whether any two useful blocks overlap horizontally.

    Such an overlap usually means an equation was not detected, so the pdf
    should not be processed further.

    Returns:
        (True, smaller_bbox, larger_bbox) for the first overlapping pair
        found (ordered smaller area first), or (False, None, None).
    """
    # Bug fix: the empty-input path used to return a bare ``False`` while
    # every other path returns a 3-tuple, crashing callers that unpack.
    if len(useful_blocks) == 0:
        return False, None, None
    page_min_y = 0
    page_max_y = max(yy['bbox'][3] for yy in useful_blocks)
    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block["bbox"]
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)
    for i in range(len(useful_bboxes)):
        bbox_i = useful_bboxes[i]
        for j in range(i + 1, len(useful_bboxes)):
            bbox_j = useful_bboxes[j]
            if _is_left_overlap(bbox_i, bbox_j) or _is_left_overlap(bbox_j, bbox_i):
                # Areas only matter once an overlap is found (hoisted out of
                # the unconditional per-pair computation).
                area_i = (bbox_i[2] - bbox_i[0]) * (bbox_i[3] - bbox_i[1])
                area_j = (bbox_j[2] - bbox_j[0]) * (bbox_j[3] - bbox_j[1])
                if area_i > area_j:
                    return True, bbox_j, bbox_i
                return True, bbox_i, bbox_j
    return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
    """Fix overly large horizontal gaps inside inline text.

    When two consecutive lines of a block share the same vertical band
    (identical truncated y0 and y1), they are fragments of one visual line,
    so a space is prepended to the first span of the later fragment.

    Args:
        pdf_info_dict: dict keyed 'page_0'..'page_{n-1}'; each page holds
            'preproc_blocks', whose 'lines' have 'bbox' and 'spans'.

    Returns:
        The same dict, mutated in place.
    """
    for page_no in range(len(pdf_info_dict)):
        text_blocks = pdf_info_dict[f'page_{page_no}']['preproc_blocks']
        for block in text_blocks:
            y_pre_1, y_pre_2 = 0, 0
            for line in block['lines']:
                _, y_cur_1, _, y_cur_2 = line['bbox']
                # Guard against an empty span list (used to raise IndexError).
                if line['spans'] and int(y_cur_1) == int(y_pre_1) and int(y_cur_2) == int(y_pre_2):
                    line['spans'][0]['text'] = ' ' + line['spans'][0]['text']
                y_pre_1, y_pre_2 = y_cur_1, y_cur_2
    return pdf_info_dict
"""
统计出需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/pytorch_model.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
\ No newline at end of file
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
weights:
layoutlmv3: Layout/LayoutLMv3/model_final.pth
doclayout_yolo: Layout/YOLO/doclayout_yolo_ft.pt
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
unimernet_small: MFR/unimernet_small
struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster
\ No newline at end of file
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for storage reader/writer backends.

    Concrete implementations (disk, S3, ...) must support both text and
    binary access, identified by the MODE_TXT / MODE_BIN constants.
    """

    # Mode constants shared by every implementation.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Return the content stored at *path* in the given mode."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Store *content* at *path* in the given mode."""
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Return up to *limit* bytes of *path* starting at *offset*."""
        raise NotImplementedError
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """AbsReaderWriter implementation backed by the local filesystem.

    Relative paths are resolved against *parent_path*; absolute paths are
    used as-is.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        self.path = parent_path  # base directory for relative paths
        self.encoding = encoding  # encoding used in text mode

    def _abspath(self, path):
        """Return *path* unchanged if absolute, else joined onto the base dir."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a file in text or binary mode.

        Raises:
            Exception: if the file does not exist.
            ValueError: if *mode* is neither 'text' nor 'binary'.
        """
        abspath = self._abspath(path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} not exists")
            # Bug fix: message used to read "no exists".
            raise Exception(f"file {abspath} not exists")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        if mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write *content* to a file, creating parent directories as needed."""
        abspath = self._abspath(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok avoids the check-then-create race of the previous
            # os.path.exists() + os.makedirs() sequence; the guard above
            # avoids makedirs('') when the path has no directory part.
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to *limit* bytes starting at *offset* (all remaining if None)."""
        abspath = self._abspath(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            return f.read(limit)
if __name__ == "__main__":
if 0:
file_path = "io/test/example.txt"
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
# 写入内容到文件
drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
# 从文件读取内容
content = drw.read(path=file_path)
if content:
logger.info(f"从 {file_path} 读取的内容: {content}")
if 1:
drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
content_bin = drw.read_offset("1.txt")
assert content_bin == b"ABCD!"
content_bin = drw.read_offset("1.txt", offset=1, limit=2)
assert content_bin == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """AbsReaderWriter implementation backed by an S3-compatible object store."""

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path  # base s3:// prefix for relative paths

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard retry configuration."""
        client_config = Config(
            s3={"addressing_style": addressing_style},
            retries={"max_attempts": 5, "mode": "standard"},
        )
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=client_config,
        )

    def _full_s3_path(self, relative_path):
        """Treat paths starting with s3:// as absolute; otherwise join onto the prefix."""
        if relative_path.startswith("s3://"):
            return relative_path
        return join_path(self.path, relative_path)

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object and return it as text (decoded) or raw bytes."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(s3_relative_path))
        body = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return body.decode(encoding)
        if mode == AbsReaderWriter.MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload *content*, encoding it first when in text mode."""
        s3_path = self._full_s3_path(s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Range-read up to *limit* bytes starting at *offset* (to EOF if None)."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(path))
        if limit:
            range_header = f"bytes={offset}-{offset + limit - 1}"
        else:
            range_header = f"bytes={offset}-"
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
if __name__ == "__main__":
if 0:
# Config the connection info
ak = ""
sk = ""
endpoint_url = ""
addressing_style = "auto"
bucket_name = ""
# Create an S3ReaderWriter object
s3_reader_writer = S3ReaderWriter(
ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
)
# Write text data to S3
text_data = "This is some text data"
s3_reader_writer.write(
text_data,
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
mode=AbsReaderWriter.MODE_TXT,
)
# Read text data from S3
text_data_read = s3_reader_writer.read(
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
)
logger.info(f"Read text data from S3: {text_data_read}")
# Write binary data to S3
binary_data = b"This is some binary data"
s3_reader_writer.write(
text_data,
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
mode=AbsReaderWriter.MODE_BIN,
)
# Read binary data from S3
binary_data_read = s3_reader_writer.read(
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
)
logger.info(f"Read binary data from S3: {binary_data_read}")
# Range Read text data from S3
binary_data_read = s3_reader_writer.read_offset(
path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
)
logger.info(f"Read binary data from S3: {binary_data_read}")
if 1:
import os
import json
ak = os.getenv("AK", "")
sk = os.getenv("SK", "")
endpoint_url = os.getenv("ENDPOINT", "")
bucket = os.getenv("S3_BUCKET", "")
prefix = os.getenv("S3_PREFIX", "")
key_basename = os.getenv("S3_KEY_BASENAME", "")
s3_reader_writer = S3ReaderWriter(
ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
)
content_bin = s3_reader_writer.read_offset(key_basename)
assert content_bin[:10] == b'{"track_id'
assert content_bin[-10:] == b'r":null}}\n'
content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
jso = json.dumps(content_bin.decode("utf-8"))
print(jso)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: F:\code\easyofd\easyofd\draw
# CREATE_TIME: 2023-10-26
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# note: 写入 xml 目录并打包成ofd 文件
from datetime import datetime
from io import BytesIO
from typing import Optional
from PIL import Image
from loguru import logger
from magic_pdf.rw.ofdtemplate import CurId, OFDTemplate, DocumentTemplate, DocumentResTemplate, PublicResTemplate, ContentTemplate, \
OFDStructure
from magic_pdf.rw.pdf_parse import DPFParser
class OFDWrite(object):
"""
写入ofd 工具类
"""
def __init__(self):
    # 200 / 25.4 presumably converts 200-dpi pixels to millimetres
    # (25.4 mm per inch) — confirm against generated OFD output.
    self.OP = 200 / 25.4
def build_ofd_entrance(self, id_obj: Optional[CurId] = None):
    """Create the top-level OFD entry template stamped with the current time."""
    creation_date = str(datetime.now())
    return OFDTemplate(CreationDate=creation_date, id_obj=id_obj)
def build_document(self, img_len, id_obj: Optional[CurId] = None, PhysicalBox: Optional[str] = "0 0 140 90"):
    """Build the Document template with one page entry per image page."""
    pages = [
        {
            "@ID": f"{page_idx + 1}",
            "@BaseLoc": f"Pages/Page_{page_idx}/Content.xml",
        }
        for page_idx in range(img_len)
    ]
    return DocumentTemplate(Page=pages, id_obj=id_obj, PhysicalBox=PhysicalBox)
def build_document_res(self, img_len: int = 0, id_obj: Optional[CurId] = None,
                       pfd_res_uuid_map: Optional[dict] = None):
    """Build the DocumentRes template with one MultiMedia entry per image."""
    MultiMedia = []
    DrawParams = []  # TODO: add DrawParams entries later
    if img_len and not pfd_res_uuid_map:
        # No resource map: synthesize sequentially numbered image entries.
        MultiMedia = [
            {
                "@ID": 0,
                "@Type": "Image",
                "ofd:MediaFile": f"Image_{num}.jpg",
                "res_uuid": f"{num}",
            }
            for num in range(img_len)
        ]
    else:
        img_map = pfd_res_uuid_map.get("img") if pfd_res_uuid_map else None
        if img_map:
            # Resource map present: one entry per pdf image resource uuid.
            MultiMedia = [
                {
                    "@ID": 0,
                    "@Type": "Image",
                    "ofd:MediaFile": f"Image_{res_uuid}.jpg",
                    "res_uuid": res_uuid,
                }
                for res_uuid in img_map
            ]
    return DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
def build_public_res(self, id_obj: CurId = None, pfd_res_uuid_map: dict = None):
    """Build the PublicRes template with one Font entry per pdf font resource."""
    fonts = []
    font_map = pfd_res_uuid_map.get("font") if pfd_res_uuid_map else None
    if font_map:
        for res_uuid, font_name in font_map.items():
            fonts.append({
                "@ID": 0,
                "@FontName": font_name,
                "@FamilyName": font_name,  # substitute family used for font matching
                "res_uuid": res_uuid,
                "@FixedWidth": "false",
                "@Serif": "false",
                "@Bold": "false",
                "@Charset": "prc",
            })
    return PublicResTemplate(Font=fonts, id_obj=id_obj)
def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId = None,
                      pfd_res_uuid_map: dict = None):
    """
    Build one ContentTemplate per page.

    pil_img_list -> one image per page; each entry is presumably a tuple of
    (image_bytes, width, height) already in OFD units — TODO confirm with
    the caller in __call__.
    pdf_info_list -> parsed pdf block info written as text/image objects.
    """
    PhysicalBox = None
    content_res_list = []
    if pil_img_list:
        for idx, pil_img in enumerate(pil_img_list):
            # Full-page image: the page box and the image boundary/CTM all
            # use the same width/height.
            PhysicalBox = f"0 0 {pil_img[1]} {pil_img[2]}"
            ImageObject = [{
                "@ID": 0,
                "@CTM": f"{pil_img[1]} 0 0 {pil_img[2]} 0 0",
                "@Boundary": f"0 0 {pil_img[1]} {pil_img[2]}",
                "res_uuid": f"{idx}",  # resource identifier
                "@ResourceID": f""
            }]
            conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
                                     CGTransform=[], PathObject=[], TextObject=[], id_obj=id_obj)
            content_res_list.append(conten)
    elif pdf_info_list:  # write parsed pdf results
        # TODO: image ids need to be pre-defined or aligned back some other way.
        for idx, content in enumerate(pdf_info_list):
            ImageObject = []
            TextObject = []
            # NOTE(review): assumes pfd_res_uuid_map["other"]["page_size"][idx]
            # exists for every page (fall back to the Document box otherwise) —
            # the fallback is not implemented; confirm with callers.
            PhysicalBox = pfd_res_uuid_map["other"]["page_size"][idx]
            PhysicalBox = f"0 0 {PhysicalBox[0]} {PhysicalBox[1]}"
            for block in content:
                bbox = block['bbox']
                # Convert pdf coordinates to OFD units via self.OP.
                x0, y0, length, height = bbox[0] / self.OP, bbox[1] / self.OP, (bbox[2] - bbox[0]) / self.OP, (
                        bbox[3] - bbox[1]) / self.OP
                if block["type"] == "text":
                    count = len(block.get("text"))
                    # NOTE(review): count == 0 would divide by zero in
                    # @DeltaX below — assumes text blocks are non-empty;
                    # confirm upstream.
                    TextObject.append({
                        "@ID": 0,
                        "res_uuid": block.get("res_uuid"),  # resource identifier
                        "@Font": "",
                        "ofd:FillColor": {"Value": "156 82 35"},
                        "ofd:TextCode": {
                            "#text": block.get("text"),
                            "@X": "0",
                            "@Y": f"{block.get('size') / self.OP}",
                            "@DeltaX": f"g {count - 1} {length / count}"
                        },
                        "@size": block.get("size") / self.OP,
                        "@Boundary": f"{x0} {y0} {length} {height}",
                    })
                elif block["type"] == "img":
                    ImageObject.append({
                        "@ID": 0,
                        "res_uuid": block.get("res_uuid"),  # resource identifier
                        "@Boundary": f"{x0} {y0} {length} {height}",
                        "@ResourceID": f""  # must be linked to the public res entry
                    })
            conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
                                     CGTransform=[], PathObject=[], TextObject=TextObject, id_obj=id_obj)
            content_res_list.append(conten)
    else:
        pass
    return content_res_list
def pil_2_bytes(self, image):
    """Serialize a PIL image to PNG-encoded bytes."""
    buffer = BytesIO()
    # PNG is used here even though resources are named Image_*.jpg elsewhere.
    image.save(buffer, format='PNG')
    data = buffer.getvalue()
    buffer.close()
    return data
    def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False):
        """Convert a PDF (``pdf_bytes``) or a list of PIL images into OFD bytes.

        :param pdf_bytes: raw PDF file content; used when ``pil_img_list`` is empty
        :param pil_img_list: PIL images, one per page; takes priority over pdf_bytes
        :param optional_text: for PDF input, True extracts text/images to build an
            editable OFD; False rasterises every page to a full-page image
        :return: the generated .ofd archive as bytes

        Pipeline: 0) parse the input  1) build the required OFD template
        objects  2) pack them into an OFD archive.
        """
        pdf_obj = DPFParser()
        page_pil_img_list = None
        # Image-based path: each entry becomes (png_bytes, width/OP, height/OP).
        if pil_img_list:  # input is already a list of PIL images
            page_pil_img_list = [(self.pil_2_bytes(_img), _img.size[0] / self.OP, _img.size[1] / self.OP) for _img in
                                 pil_img_list]
        else:  # input is a PDF
            if optional_text:  # editable OFD: keep text spans and embedded images
                pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes)  # parse the PDF
                logger.debug(f"pdf_info_list: {pdf_info_list} \n pfd_res_uuid_map {pfd_res_uuid_map}")
            else:  # flat OFD: render every PDF page to an image
                img_list = pdf_obj.to_img(pdf_bytes)
                page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height],
                                                                       _img.samples)), _img.width / self.OP,
                                      _img.height / self.OP) for _img in img_list]
        id_obj = CurId()  # single ID allocator shared by every template object
        if page_pil_img_list:  # image content -> OFD
            res_static = {}  # static image resources written under Doc_0/Res
            # NOTE(review): in this branch "img" values are raw bytes, whereas the
            # PDF branch below stores BytesIO objects — the .getvalue() loop in
            # the else-branch must never run here. Confirm the indentation intent.
            pfd_res_uuid_map = {"img": {}}
            PhysicalBox = f"0 0 {page_pil_img_list[0][1]} {page_pil_img_list[0][2]}"
            for idx, pil_img_tuple in enumerate(page_pil_img_list):
                pfd_res_uuid_map["img"][f"{idx}"] = pil_img_tuple[0]
                res_static[f"Image_{idx}.jpg"] = pil_img_tuple[0]
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(page_pil_img_list), id_obj=id_obj, PhysicalBox=PhysicalBox)
            public_res = self.build_public_res(id_obj=id_obj)
            document_res = self.build_document_res(len(page_pil_img_list), id_obj=id_obj,
                                                   pfd_res_uuid_map=pfd_res_uuid_map)
            content_res_list = self.build_content_res(page_pil_img_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)
        else:
            # Editable path: template objects consume the parsed PDF details and
            # the shared id allocator.
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(pdf_info_list), id_obj=id_obj)
            public_res = self.build_public_res(id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            document_res = self.build_document_res(len(pdf_info_list), id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            content_res_list = self.build_content_res(pdf_info_list=pdf_info_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)
            res_static = {}  # image resources extracted from the PDF (BytesIO values)
            print("pfd_res_uuid_map", pfd_res_uuid_map)
            img_dict = pfd_res_uuid_map.get("img")
            if img_dict:
                for key, v_io in img_dict.items():
                    res_static[f"Image_{key}.jpg"] = v_io.getvalue()
        # Assemble and zip everything into the final OFD byte stream.
        ofd_byte = OFDStructure("123", ofd=ofd_entrance, document=document, public_res=public_res,
                                document_res=document_res, content_res=content_res_list, res_static=res_static)(
            test=True)
        return ofd_byte
if __name__ == "__main__":
    # Ad-hoc manual test: read a PDF and write the converted OFD to ./ofd.ofd.
    pdf_p = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf"
    # NOTE(review): this second assignment overrides the PDF path with a
    # directory, so open(..., "rb") below would fail — looks like a debug
    # leftover; confirm which path is intended.
    pdf_p = r"F:\code\easyofd\test"
    with open(pdf_p, "rb") as f:
        content = f.read()
    ofd_content = OFDWrite()(content)
    with open("ofd.ofd", "wb") as f:
        f.write(ofd_content)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: E:\code\easyofd\easyofd\draw
# CREATE_TIME: 2023-08-10
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# NOTE: 绘制pdf
import base64
import os
import re
import traceback
from io import BytesIO
from PIL import Image as PILImage
from loguru import logger
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas
from magic_pdf.tools.font_tools import FontTool
from magic_pdf.tools.find_seal_img import SealExtract
# print(reportlab_fonts)
class DrawPDF():
    """Render an OFD parse result into a PDF (in memory).

    OP is the OFD-unit -> PDF-point conversion factor (200 dpi / 25.4 mm).
    """
    def __init__(self, data, *args, **kwargs):
        # data: list of per-document dicts produced by the OFD parser.
        assert data, "未输入ofd解析结果"
        self.data = data
        self.author = "sugon"
        self.OP = 200 / 25.4  # OFD millimetre units -> PDF points at 200 dpi
        # self.OP = 1
        self.pdf_uuid_name = self.data[0]["pdf_name"]
        self.pdf_io = BytesIO()  # the PDF is rendered into this in-memory buffer
        self.SupportImgType = ("JPG", "JPEG", "PNG")  # image suffixes we can place
        self.init_font = "宋体"  # fallback font when a font cannot be resolved
        self.font_tool = FontTool()
    def draw_lines(my_canvas):
        """Draw a stack of horizontal rules (debug helper).

        NOTE(review): defined without ``self`` — calling it as an instance
        method would pass the DrawPDF object as ``my_canvas``; it appears to be
        unused scratch code. Confirm before relying on it.
        """
        my_canvas.setLineWidth(.3)
        start_y = 710
        my_canvas.line(30, start_y, 580, start_y)
        for x in range(10):
            start_y -= 10
            my_canvas.line(30, start_y, 580, start_y)
    def gen_empty_pdf(self):
        """Write a single-page fallback PDF saying the OFD could not be parsed."""
        c = canvas.Canvas(self.pdf_io)
        c.setPageSize(A4)
        c.setFont(self.init_font, 20)
        c.drawString(0, 210, "ofd 格式错误,不支持解析", mode=1)
        c.save()
    # per-character offset computation
    def cmp_offset(self, pos, offset, DeltaRule, text, CTM_info, dire="X") -> list:
        """Compute the absolute X or Y position of every character in a run.

        pos       text-box X|Y origin
        offset    X|Y of the first character
        DeltaRule OFD delta rule string ("g <n> <v>" repeats offset v n times)
        CTM_info  parsed CTM transform (scale/rotate/translate) or {}
        dire      "X" or "Y" — which axis of CTM_info to apply
        Returns the list of character positions along the requested axis.
        """
        if CTM_info and dire == "X":
            resize = CTM_info.get("resizeX")
            rotate = CTM_info.get("rotateX")
            move= CTM_info.get("moveX")
        elif CTM_info and dire == "Y":
            resize = CTM_info.get("resizeY")
            rotate = CTM_info.get("rotateY")
            move = CTM_info.get("moveY")
        else:
            # no CTM: identity transform
            resize = 1
            rotate = 0
            move = 0
        char_pos = float(pos if pos else 0) + (float(offset if offset else 0) + move) * resize
        pos_list = []
        pos_list.append(char_pos)  # first character position
        offsets = [i for i in DeltaRule.split(" ")]
        if "g" in DeltaRule:  # "g" groups repeated offsets: g <count> <value>
            g_no = None
            for _no, offset_i in enumerate(offsets):
                if offset_i == "g":
                    g_no = _no
                    for j in range(int(offsets[(g_no + 1)])):
                        char_pos += float(offsets[(g_no + 2)])
                        pos_list.append(char_pos)
                elif offset_i != "g":
                    if g_no == None:
                        char_pos += float(offset_i) * resize
                        pos_list.append(char_pos)
                    elif (int(_no) > int(g_no + 2)) and g_no != None:
                        char_pos += float(offset_i) * resize
                        pos_list.append(char_pos)
        elif not DeltaRule:  # no per-char offsets — usually a single character
            pos_list = []
            for i in range(len(text)):
                pos_list.append(char_pos)
        else:  # plain per-character offsets
            for i in offsets:
                if not i:
                    char_pos += 0
                else:
                    char_pos += float(i) * resize
                pos_list.append(char_pos)
        return pos_list
    def draw_chars(self, canvas, text_list, fonts, page_size):
        """Write every text run in *text_list* onto the canvas."""
        c = canvas
        for line_dict in text_list:
            # TODO serialise the final text content once before writing so the
            # values that actually get written are easy to inspect
            text = line_dict.get("text")
            font_info = fonts.get(line_dict.get("font"), {})
            if font_info:
                font_name = font_info.get("FontName", "")
            else:
                font_name = self.init_font
            # TODO check against known fonts; otherwise fall back to a close match
            if font_name not in self.font_tool.FONTS:
                font_name = self.font_tool.FONTS[0]
            font = font_name
            # if font not in FONT: # KeyError: 'SWDRSO+KaiTi-KaiTi-0'
            c.setFont(font, line_dict["size"] * self.OP)
            # reportlab's origin is the bottom-left corner of the page
            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]
            c.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            DeltaX = line_dict.get("DeltaX", "")
            DeltaY = line_dict.get("DeltaY", "")
            # print("DeltaX",DeltaX)
            X = line_dict.get("X", "")
            Y = line_dict.get("Y", "")
            CTM = line_dict.get("CTM", "")  # OFD may attach a character transform here
            resizeX = 1
            resizeY = 1
            # CTM =None # some documents do not use the CTM
            if CTM and (CTMS:=CTM.split(" ")) and len(CTMS) == 6:
                CTM_info = {
                    "resizeX": float(CTMS[0]),
                    "rotateX": float(CTMS[1]),
                    "rotateY": float(CTMS[2]),
                    "resizeY": float(CTMS[3]),
                    "moveX": float(CTMS[4]),
                    "moveY": float(CTMS[5]),
                }
            else:
                CTM_info ={}
            x_list = self.cmp_offset(line_dict.get("pos")[0], X, DeltaX, text, CTM_info, dire="X")
            y_list = self.cmp_offset(line_dict.get("pos")[1], Y, DeltaY, text, CTM_info, dire="Y")
            # print("x_list",x_list)
            # print("y_list",y_list)
            # print("Y",page_size[3])
            # print("x",page_size[2])
            # if line_dict.get("Glyphs_d") and FontFilePath.get(line_dict["font"]) and font_f not in FONTS:
            if False:  # glyph-outline rendering for custom fonts is too slow; disabled
                Glyphs = [int(i) for i in line_dict.get("Glyphs_d").get("Glyphs").split(" ")]
                for idx, Glyph_id in enumerate(Glyphs):
                    _cahr_x = float(x_list[idx]) * self.OP
                    _cahr_y = (float(page_size[3]) - (float(y_list[idx]))) * self.OP
                    imageFile = draw_Glyph(FontFilePath.get(line_dict["font"]), Glyph_id, text[idx])
                    # font_img_info.append((FontFilePath.get(line_dict["font"]), Glyph_id,text[idx],_cahr_x,_cahr_y,-line_dict["size"]*Op*2,line_dict["size"]*Op*2))
                    c.drawImage(imageFile, _cahr_x, _cahr_y, -line_dict["size"] * self.OP * 2,
                                line_dict["size"] * self.OP * 2)
            else:
                if len(text) > len(x_list) or len(text) > len(y_list):
                    # more characters than computed positions: keep CJK chars only
                    text = re.sub("[^\u4e00-\u9fa5]", "", text)
                try:
                    # Line-write when the last character would land outside the
                    # page on either axis; otherwise write character by character.
                    if y_list[-1] * self.OP > page_size[3] * self.OP or x_list[-1] * self.OP > page_size[2] * self.OP or \
                            x_list[-1] < 0 or y_list[-1] < 0:
                        # print("line wtite")
                        x_p = abs(float(X)) * self.OP
                        y_p = abs(float(page_size[3]) - (float(Y))) * self.OP
                        c.drawString(x_p, y_p, text, mode=0)  # mode=3 invisible text, 0 visible
                        # text_write.append((x_p, y_p, text))
                    # character-by-character write
                    else:
                        for cahr_id, _cahr_ in enumerate(text):
                            # print("char wtite")
                            c.setFont(font, line_dict["size"] * self.OP * resizeX)
                            _cahr_x = float(x_list[cahr_id]) * self.OP
                            _cahr_y = (float(page_size[3]) - (float(y_list[cahr_id]))) * self.OP
                            # print(_cahr_x, _cahr_y, _cahr_)
                            c.drawString(_cahr_x, _cahr_y, _cahr_, mode=0)  # mode=3 invisible text, 0 visible
                            # text_write.append((_cahr_x, _cahr_y, _cahr_))
                except Exception as e:
                    logger.error(f"{e}")
                    traceback.print_exc()
    def draw_img(self, canvas, img_list, images, page_size):
        """Place embedded images onto the canvas.

        *images* maps ResourceID -> {suffix, imgb64, fileName, wrap_pos, ...};
        unsupported or empty images are skipped.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                continue
            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue
            img = PILImage.open(BytesIO(imgbyte))
            imgReade = ImageReader(img)
            CTM = img_d.get('CTM')
            x_offset = 0
            y_offset = 0
            wrap_pos = image.get("wrap_pos")
            x = (img_d.get('pos')[0] + x_offset) * self.OP
            y = (page_size[3] - (img_d.get('pos')[1] + y_offset)) * self.OP
            if wrap_pos:
                x = x + (wrap_pos[0] * self.OP)
                y = y - (wrap_pos[1] * self.OP)
            w = img_d.get('pos')[2] * self.OP
            h = -img_d.get('pos')[3] * self.OP  # negative height flips to OFD's top-left origin
            c.drawImage(imgReade, x, y, w, h, 'auto')
    def draw_signature(self, canvas, signatures_page_list, page_size):
        """Draw seal/signature images extracted from the OFD signature blocks.

        Each entry looks like::

            {
                "sing_page_no": sing_page_no,
                "PageRef": PageRef,
                "Boundary": Boundary,
                "SignedValue": self.file_tree(SignedValue),
            }
        """
        c = canvas
        try:
            if signatures_page_list:
                # print("signatures_page_list",signatures_page_list)
                for signature_info in signatures_page_list:
                    image = SealExtract()(b64=signature_info.get("SignedValue"))
                    if not image:
                        logger.info(f"提取不到签章图片")
                        continue
                    else:
                        image_pil = image[0]
                    pos = [float(i) for i in signature_info.get("Boundary").split(" ")]
                    imgReade = ImageReader(image_pil)
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP
                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP
                    c.drawImage(imgReade, x, y, w, h, 'auto')
                    print(f"签章写入成功")
            else:
                # no signatures on this page
                pass
        except Exception as e:
            print(f"签章写入失败 {e}")
            traceback.print_exc()
    def draw_line(self, canvas, line_list, page_size):
        """Draw path objects (straight lines / bezier curves) from the OFD page."""
        # print("绘制",line_list)
        def match_mode(Abbr: list):
            """Parse an AbbreviatedData token list into drawing commands.

            S  start point x, y
            M  move to x, y
            L  line from the current point to x, y
            Q  x1 y1 x2 y2        quadratic bezier
            B  x1 y1 x2 y2 x3 y3  cubic bezier
            A  arc to x, y (rx/ry axes, angle, large-arc flag, sweep flag)
            C  close the current SubPath
            """
            relu_list = []
            mode = ""
            modes = ["S", "M", "L", "Q", "B", "A", "C"]
            mode_dict = {}
            for idx, i in enumerate(Abbr):
                if i in modes:
                    mode = i
                    if mode_dict:
                        relu_list.append(mode_dict)
                    mode_dict = {"mode": i, "points": []}
                else:
                    mode_dict["points"].append(i)
                if idx + 1 == len(Abbr):
                    relu_list.append(mode_dict)
            return relu_list
        def assemble(relu_list: list):
            # Pair every draw command (L/Q/B) with the most recent M start point.
            start_point = {}
            acticon = []
            for i in relu_list:
                if i.get("mode") == "M":
                    start_point = i
                elif i.get("mode") in ['B', "Q", 'L']:
                    acticon.append({"start_point": start_point,
                                    "end_point": i
                                    })
            return acticon
        def convert_coord(p_list, direction, page_size, pos):
            # Convert OFD coordinates (top-left origin) to PDF points.
            new_p_l = []
            for p in p_list:
                if direction == "x":
                    new_p = (float(pos[0]) + float(p)) * self.OP
                else:
                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
                new_p_l.append(new_p)
            return new_p_l
        for line in line_list:
            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData
            color = line.get("FillColor", [0, 0, 0])
            relu_list = match_mode(Abbr)
            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic bezier 3) M Q*n quadratic bezier
            # print(relu_list)
            acticons = assemble(relu_list)
            pos = line.get("pos")
            # print(color)
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke colour
            # stroke width (defensively parsed; defaults to 0.25 OFD units)
            try:
                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
            except Exception as e:
                logger.error(f"{e}")
                LineWidth = 0.25 * self.OP
            canvas.setLineWidth(LineWidth)  # unit: points
            for acticon in acticons:
                if acticon.get("end_point").get("mode") == 'L':  # straight line
                    x1, y1, x2, y2 = *acticon.get("start_point").get("points"), *acticon.get("end_point").get("points")
                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
                    # draw one line x1 y1 x2 y2
                    canvas.line(x1, y1, x2, y2)
                elif acticon.get("end_point").get("mode") == 'B':  # cubic bezier
                    # NOTE(review): the `continue` below makes the bezier code
                    # unreachable — presumably disabled on purpose; confirm.
                    continue
                    x1, y1, x2, y2, x3, y3, x4, y4 = *acticon.get("start_point").get("points"), *acticon.get(
                        "end_point").get("points")
                    x1, x2, x3, x4 = convert_coord([x1, x2, x3, x4], "x", page_size, pos)
                    y1, y2, y3, y4 = convert_coord([y1, y2, y3, y4], "y", page_size, pos)
                    # print(x1, y1, x2, y2, x3, y3, x4, y4)
                    # draw the cubic bezier
                    canvas.bezier(x1, y1, x2, y2, x3, y3, x4, y4)
                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic bezier: not implemented
                    pass
                else:
                    continue
    def draw_pdf(self):
        """Render every document/page in self.data onto the in-memory canvas."""
        c = canvas.Canvas(self.pdf_io)
        c.setAuthor(self.author)
        for doc_id, doc in enumerate(self.data, start=0):
            # print(1)
            fonts = doc.get("fonts")
            images = doc.get("images")
            default_page_size = doc.get("default_page_size")
            page_size_details = doc.get("page_size")
            print("page_size_details", page_size_details)
            signatures_page_id = doc.get("signatures_page_id")  # signature info
            # register embedded fonts with the font tool
            for font_id, font_v in fonts.items():
                file_name = font_v.get("FontFile")
                font_b64 = font_v.get("font_b64")
                if font_b64:
                    self.font_tool.register_font(os.path.split(file_name)[1], font_v.get("@FontName"), font_b64)
            # text_write = []
            # print("doc.get(page_info)", len(doc.get("page_info")))
            for page_id, page in doc.get("page_info").items():
                # prefer the per-page size; fall back to the document default
                if page_size_details[page_id]:
                    page_size = page_size_details[page_id]
                else:
                    page_size = default_page_size
                # logger.info(f"page_id {page_id} page_size {page_size}")
                text_list = page.get("text_list")
                img_list = page.get("img_list")
                line_list = page.get("line_list")
                # print("img_list",img_list)
                c.setPageSize((page_size[2] * self.OP, page_size[3] * self.OP))
                # images first so text/lines draw on top
                if img_list:
                    self.draw_img(c, img_list, images, page_size)
                # text
                if text_list:
                    self.draw_chars(c, text_list, fonts, page_size)
                # line/path objects
                if line_list:
                    self.draw_line(c, line_list, page_size)
                # signatures / seals
                if signatures_page_id:
                    self.draw_signature(c, signatures_page_id.get(page_id), page_size)
                # print("去写入")
                # print(doc_id,len(self.data))
                # page-break logic: skip the break after the very last page
                if page_id != len(doc.get("page_info")) - 1 and doc_id != len(self.data):
                    # print("写入")
                    c.showPage()
        # json.dump(text_write,open("text_write.json","w",encoding="utf-8"),ensure_ascii=False)
        c.save()
    def __call__(self):
        """Return the rendered PDF as bytes; on any failure return a fallback PDF."""
        try:
            self.draw_pdf()
            pdfbytes = self.pdf_io.getvalue()
        except Exception as e:
            logger.error(f"{e}")
            logger.error(f"ofd解析失败")
            traceback.print_exc()
            self.gen_empty_pdf()
            pdfbytes = self.pdf_io.getvalue()
        return pdfbytes
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#PROJECT_NAME: F:\code\easyofd\easyofd\draw
#CREATE_TIME: 2023-10-30
#E_MAIL: renoyuan@foxmail.com
#AUTHOR: reno
#note: ofd 基础结构模板
# Standard library
import abc
import copy
import io
import os
import tempfile
import zipfile

# Third-party
import xmltodict
from loguru import logger
__all__ = ["CurId", "OFDTemplate", "DocumentTemplate", "DocumentResTemplate",
"PublicResTemplate", "ContentTemplate", "OFDStructure"]
"""
OFD目录结构
│ OFD.xml
└─Doc_0
│ Document.xml
│ DocumentRes.xml
│ PublicRes.xml
├─Annots
│ │ Annotations.xml
│ │
│ └─Page_0
│ Annotation.xml
├─Attachs
│ Attachments.xml
│ original_invoice.xml
├─Pages
│ └─Page_0
│ Content.xml
├─Res
│ image_80.jb2
├─Signs
│ │ Signatures.xml
│ │
│ └─Sign_0
│ Signature.xml
│ SignedValue.dat
├─Tags
│ CustomTag.xml
│ CustomTags.xml
└─Tpls
└─Tpl_0
Content.xml
"""
class CurId(object):
    """Allocator for document-wide OFD object IDs.

    Resource templates register their generated IDs in ``uuid_map`` so that
    page construction can later resolve a ResourceID from a res_uuid.
    """
    def __init__(self):
        self.id = 1          # most recently issued ID
        self.used = False    # whether the initial ID has been handed out yet
        self.uuid_map = {}   # res_uuid -> allocated "@ID" string
    def add_uuid_map(self, k, v):
        """Record the ID *v* allocated for resource key *k*."""
        logger.debug(f"uuid_map add {k}: {v}")
        self.uuid_map[k] = v
    def add(self):
        """Advance the counter by one."""
        self.id += 1
    def get_id(self):
        """Return the next unique ID (the very first call yields 1)."""
        if not self.used:
            self.used = True
            return self.id
        self.add()
        return self.id
    def get_max_id(self):
        """Return MaxUnitID: one past the highest ID issued so far."""
        return self.id + 1
class TemplateBase(object):
    """Base class for the OFD XML templates.

    Subclasses provide ``ofdjson`` (the dict form of the XML), ``key_map``
    (constructor kwarg -> XML tag name) and ``id_keys`` (tags that receive an
    auto-allocated ``@ID``).
    """
    key_map = {}       # kwarg name -> tag name in the XML dict, e.g. DocID -> ofd:DocID
    id_keys = []       # tags that get an "@ID" attribute injected
    template_name = ""
    def __init__(self, *args, **kwargs):
        # the shared CurId allocator is threaded through every template
        self.id_obj: CurId = kwargs.get("id_obj")
        self.assemble(*args, **kwargs)
    def assemble(self, *args, **kwargs):
        """Deep-copy the template dict, substitute kwargs, then allocate IDs."""
        self.final_json = copy.deepcopy(self.ofdjson)
        # substitute caller-provided values into the template
        for key, value in kwargs.items():
            mapped = self.key_map.get(key)
            if mapped is not None:
                self.modify(self.final_json, mapped, value)
        # inject IDs
        for id_key in self.id_keys:
            print(f"开始gen_id >> {self.template_name}>>{id_key}")
            self.gen_id(self.final_json, id_key)
    def gen_id(self, ofdjson, id_key):
        """Recursively stamp an "@ID" onto every *id_key* node in *ofdjson*."""
        for key, value in ofdjson.items():
            if key == id_key:
                target = ofdjson[key]
                if isinstance(target, dict):
                    target["@ID"] = f"{self.id_obj.get_id()}"
                elif isinstance(target, list):
                    for element in target:
                        element["@ID"] = f"{self.id_obj.get_id()}"
            elif isinstance(value, dict):
                self.gen_id(value, id_key)
            elif isinstance(value, list):
                for element in value:
                    if isinstance(element, dict):
                        self.gen_id(element, id_key)
    def modify(self, ofdjson, key, value):
        """Recursively set every occurrence of *key* in *ofdjson* to *value*."""
        for current, nested in ofdjson.items():
            if current == key:
                ofdjson[current] = value
            elif isinstance(nested, dict):
                self.modify(nested, key, value)
            elif isinstance(nested, list):
                for element in nested:
                    if isinstance(element, dict):
                        self.modify(element, key, value)
    def save(self, path):
        """Serialise final_json to pretty-printed XML and write it to *path*."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(xmltodict.unparse(self.final_json, pretty=True))
class OFDTemplate(TemplateBase):
    """OFD.xml template — the archive entry point; globally unique per file."""
    template_name = "OFD"
    # constructor kwarg -> XML tag that it overwrites
    key_map = {"Author": "ofd:Author", "DocID": "ofd:DocID" ,"CreationDate": "ofd:CreationDate"
               }
    # Template data; key order is significant because it drives the emitted
    # XML element order.
    ofdjson = {
        "ofd:OFD": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@Version": "1.1",
            "@DocType": "OFD",
            "ofd:DocBody": [{
                "ofd:DocInfo": {
                    "ofd:DocID": "0C1D4F7159954EEEDE517F7285E84DC4",
                    "ofd:Creator": "easyofd",
                    "ofd:author": "renoyuan",
                    "ofd:authoremail": "renoyuan@foxmail.com",
                    "ofd:CreatorVersion": "1.0",
                    "ofd:CreationDate": "2023-10-27"
                },
                "ofd:DocRoot": "Doc_0/Document.xml"
            }]
        }
    }
class DocumentTemplate(TemplateBase):
    """Document.xml template — unique per Doc; describes the Doc's structure."""
    template_name = "Document"
    key_map = {"Page": "ofd:Page","PhysicalBox":"ofd:PhysicalBox"}
    id_keys = ["ofd:Page"]
    ofdjson ={
        "ofd:Document": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "ofd:CommonData": {
                "ofd:MaxUnitID": 0,
                "ofd:PageArea": {
                    "ofd:PhysicalBox": "0 0 140 90"
                },
                "ofd:PublicRes": "PublicRes.xml",
                "ofd:DocumentRes": "DocumentRes.xml"
            },
            "ofd:Pages":
                {
                    "ofd:Page": [{
                        "@ID": 0,
                        "@BaseLoc": "Pages/Page_0/Content.xml"
                    }]
                }
        }
    }
    def update_max_unit_id(self, final_json=None):
        """Set ofd:MaxUnitID to one past the allocator's highest issued ID.

        Call this after all templates have been built (all IDs handed out)
        and before save(); recurses until the MaxUnitID node is found.
        """
        if not final_json:
            final_json = self.final_json
        for k, v in final_json.items():
            if k == "ofd:MaxUnitID":
                final_json["ofd:MaxUnitID"]=self.id_obj.get_max_id()
                return
            elif isinstance(v, dict):
                self.update_max_unit_id(v)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.update_max_unit_id(v_cell)
    def update_page(self,page_num):
        # placeholder — page lists are currently supplied via the Page kwarg
        pass
class DocumentResTemplate(TemplateBase):
    """DocumentRes.xml template — unique per Doc; multimedia resources (e.g. images)."""
    template_name = "DocumentRes"
    key_map = {"MultiMedia": "ofd:MultiMedia"}
    id_keys = ["ofd:DrawParam", "ofd:MultiMedia"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:MultiMedias": {
                "ofd:MultiMedia": [
                    {
                        "@ID": 0,
                        "@Type": "Image",
                        "ofd:MediaFile": "Image_2.jpg"
                    }
                ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Like TemplateBase.gen_id, but also records each node's ``res_uuid``
        marker in the shared allocator's uuid_map so that pages can later
        resolve "@ResourceID" references to the allocated ID."""
        # print("id_key ", id_key, "ofdjson ", ofdjson)
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the ID (and register res_uuid -> ID when present)
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
                    if res_uuid := ofdjson[k].get("res_uuid"):
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                    # logger.info(f"添加id -> {ofdjson[k]}")
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:
                        i["@ID"] = f"{self.id_obj.get_id()}"
                        if res_uuid := i.get("res_uuid"):
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
                        # logger.info(f"添加id ->i {i}")
            elif isinstance(v, dict):
                # logger.debug(f"dict_v{v}")
                self.gen_id(v, id_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        # logger.debug(f"dict_v{v}")
                        self.gen_id(v_cell, id_key)
class PublicResTemplate(TemplateBase):
    """PublicRes.xml template — unique per Doc; shared resources (fonts, colour spaces)."""
    # NOTE(review): "PulicRes" looks like a typo for "PublicRes"; it is only
    # used for logging, but confirm before renaming.
    template_name = "PulicRes"
    key_map = {"Font": "ofd:Font"}
    id_keys = ["ofd:ColorSpace", "ofd:Font"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:ColorSpaces": {
                "ofd:ColorSpace": {
                    "@ID": 0,
                    "@Type": "RGB",
                    "@BitsPerComponent": "8",
                    "#text":""
                }
            },
            "ofd:Fonts": {
                "ofd:Font": [
                    {
                        "@ID": 0,
                        "@FontName": "宋体",
                        "@FamilyName": "宋体",
                    }
                ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Like TemplateBase.gen_id, but also records res_uuid -> "@ID" pairs
        in the shared allocator so pages can resolve "@Font" references.
        (Duplicated from DocumentResTemplate.gen_id — candidate for sharing.)"""
        # print("id_key ", id_key, "ofdjson ", ofdjson)
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the ID (and register res_uuid -> ID when present)
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
                    if res_uuid := ofdjson[k].get("res_uuid"):
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                    # logger.info(f"添加id -> {ofdjson[k]}")
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:
                        i["@ID"] = f"{self.id_obj.get_id()}"
                        if res_uuid := i.get("res_uuid"):
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
                        # logger.info(f"添加id ->i {i}")
            elif isinstance(v, dict):
                # logger.debug(f"dict_v{v}")
                self.gen_id(v, id_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        # logger.debug(f"dict_v{v}")
                        self.gen_id(v_cell, id_key)
'''
"ofd:Font": [
{
"@ID": 0,
"@FontName": "STSong",
"@FamilyName": "SimSun",
"@Serif": "true",
"@FixedWidth": "true",
"@Charset": "prc"
}
"ofd:Area": {
"ofd:PhysicalBox": "0 0 210 140"
},
'''
class ContentTemplate(TemplateBase):
    """Template for a page's Content.xml (the page body).

    After the base class allocates IDs, ``correlate_res_uuid`` rewrites every
    object that carries a ``res_uuid`` marker so that it points at the "@ID"
    previously allocated for that resource (font or image).
    """
    # "@Type": "Body",
    template_name = "Content"
    key_map = {"ImageObject": "ofd:ImageObject",
               "PathObject": "ofd:PathObject",
               "TextObject": "ofd:TextObject",
               "CGTransform": "ofd:CGTransform",
               "PhysicalBox": "ofd:PhysicalBox",
               }
    id_keys = ["ofd:Layer", "ofd:TextObject", "ofd:PathObject", "ofd:Clips", "ofd:ImageObject"]
    # object tag -> attribute that must reference a resource ID
    correlate_map = {"ofd:TextObject": "@Font",
                     "ofd:ImageObject": "@ResourceID"
                     }
    ofdjson = {
        "ofd:Page": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "ofd:Content": {
                "ofd:PageArea": {
                    "ofd:PhysicalBox": "0 0 210 140"
                },
                "ofd:Layer": {
                    "@ID": 0,
                    "@Type": "Foreground",
                    "ofd:TextObject": [{
                        "@ID": 0,
                        "@CTM": "7.054 0 0 7.054 0 134.026",
                        "@Boundary": "69 7 72 7.6749",
                        "@Font": "69",
                        "@Size": "6.7028",
                        "ofd:FillColor": {
                            "@ColorSpace": "4",
                            "@Value": "156 82 35"
                        },
                        "ofd:CGTransform": {
                            "@CodePosition": "0",
                            "@CodeCount": "10",
                            "@GlyphCount": "10",
                            "ofd:Glyphs": "18 10 11 42 60 53 24 11 42 61"
                        },
                        "ofd:TextCode": {
                            "@X": "13.925",
                            "@Y": "10",
                            "@DeltaX": "7 7 7 7 7 7 7 7 7",
                            "#text": "电⼦发票(普通发票)"
                        }
                    }],
                    "ofd:ImageObject": []
                }
            }}}
    def __init__(self,*args,**kwargs):
        super().__init__(*args, **kwargs)
        # resolve res_uuid markers into the IDs recorded by the resource templates
        for key, targe_key in self.correlate_map.items():
            self.correlate_res_uuid(self.final_json,key,targe_key)
    def correlate_res_uuid(self, ofdjson,key,targe_key):
        """Recursively replace ``res_uuid`` markers under *key* nodes with the
        "@ID" stored in ``self.id_obj.uuid_map``, writing it to the
        *targe_key* attribute (e.g. "@Font" / "@ResourceID")."""
        print("========uuid_map", self.id_obj.uuid_map)
        for k, v in ofdjson.items():
            if k == key:
                # BUGFIX: this branch previously popped "res_uuid" from the
                # undefined name `v_cell`, raising NameError whenever the
                # matched node was a single dict rather than a list.
                if isinstance(v, dict) and (res_uuid := v.pop("res_uuid", None)):
                    v[targe_key] = self.id_obj.uuid_map[res_uuid]
                    logger.debug(f'{targe_key} >>> {v[targe_key]} -- {res_uuid}')
                elif isinstance(v, list):
                    for v_cell in v:
                        if isinstance(v_cell, dict) and (res_uuid := v_cell.pop("res_uuid", None)):
                            v_cell[targe_key] = self.id_obj.uuid_map[res_uuid]
                            logger.debug(f'{targe_key} >>> {v_cell[targe_key]} -- {res_uuid}')
                        else:
                            print(f"v_cell {v_cell}")
            elif isinstance(v, dict):
                self.correlate_res_uuid(v, key, targe_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.correlate_res_uuid(v_cell, key, targe_key)
'''
"ofd:PathObject": [{
"@ID": 0,
"@CTM": "0.3527 0 0 -0.3527 0.35 141.43001",
"@Boundary": "-0.35 -0.35 212.33 141.78999",
"@LineWidth": "1",
"@MiterLimit": "10",
"@Stroke": "false",
"@Fill": "true",
"ofd:FillColor": {
"@ColorSpace": "4",
"@Value": "255 255 255"
},
"ofd:StrokeColor": {
"@ColorSpace": "4",
"@Value": "0 0 0"
},
"ofd:Clips": {
"ofd:Clip": {
"ofd:Area": {
"ofd:Path": {
"@ID": 0,
"@Boundary": "0.00766 -0.00763 600 400.00003",
"@Stroke": "false",
"@Fill": "true",
"ofd:AbbreviatedData": "M 0 0 L 600 0 L 600 400.00003 L 0 400.00003 C"
}
}
}
},
"ofd:AbbreviatedData": "M -1 401 L 601 401 L 601 -1 L -1 -1 C"
},],
"ofd:ImageObject": [{
"@ID": 0,
"@CTM": "19.7512 0 0 19.7512 0 0",
"@Boundary": "7.23035 7.40671 19.7512 19.7512",
"@ResourceID": "104"
}],
'''
class OFDStructure(object):
    """Assemble the template objects into an .ofd (zip) archive in memory.

    :param name: logical document name (currently unused in the output)
    :param ofd / document / document_res / public_res: template objects; any
        omitted one falls back to a default template sharing a single CurId
    :param content_res: list of per-page ContentTemplate objects
    :param res_static: static resource file name -> raw bytes, written under
        Doc_0/Res

    Fixes over the previous version: mutable default arguments replaced by
    None sentinels; the archive is built in memory instead of via a
    "test.ofd" file in the CWD (which leaked on error and raced concurrent
    calls); the builtin ``zip`` is no longer shadowed; directory creation is
    idempotent.
    """
    def __init__(self, name, ofd=None, document=None,
                 document_res=None, public_res=None,
                 content_res: list = None, res_static: dict = None):
        self.name = name
        # Build the shared ID allocator lazily — it is only needed when a
        # default template has to be constructed.
        id_obj = None
        def _shared_id_obj():
            nonlocal id_obj
            if id_obj is None:
                id_obj = CurId()
            return id_obj
        self.ofd = ofd if ofd else OFDTemplate(id_obj=_shared_id_obj())
        self.document = document if document else DocumentTemplate(id_obj=_shared_id_obj())
        self.document_res = document_res if document_res else DocumentResTemplate(id_obj=_shared_id_obj())
        self.public_res = public_res if public_res else PublicResTemplate(id_obj=_shared_id_obj())
        self.content_res = content_res if content_res else [ContentTemplate(id_obj=_shared_id_obj())]
        self.res_static = res_static if res_static else {}
    def __call__(self, test=False):
        """Write all parts into a temp directory, zip it, and return the bytes."""
        with tempfile.TemporaryDirectory() as t_dir:
            if test:
                # Debug mode: keep the intermediate tree in ./test for inspection.
                temp_dir = r"./test"
                os.makedirs(temp_dir, exist_ok=True)  # don't crash on a leftover dir
            else:
                temp_dir = t_dir
            # intermediate directory layout
            temp_dir_doc_0 = os.path.join(temp_dir, 'Doc_0')
            temp_dir_pages = os.path.join(temp_dir, 'Doc_0', "Pages")
            temp_dir_res = os.path.join(temp_dir, 'Doc_0', "Res")  # static resources
            for directory in (temp_dir_doc_0, temp_dir_pages, temp_dir_res):
                os.makedirs(directory, exist_ok=True)
            # OFD.xml (archive entry point)
            self.ofd.save(os.path.join(temp_dir, 'OFD.xml'))
            # refresh MaxUnitID, then write Document.xml
            self.document.update_max_unit_id()
            self.document.save(os.path.join(temp_dir_doc_0, 'Document.xml'))
            # DocumentRes.xml / PublicRes.xml
            self.document_res.save(os.path.join(temp_dir_doc_0, 'DocumentRes.xml'))
            self.public_res.save(os.path.join(temp_dir_doc_0, 'PublicRes.xml'))
            # one Content.xml per page
            for idx, page in enumerate(self.content_res):
                temp_dir_pages_idx = os.path.join(temp_dir_pages, f"Page_{idx}")
                os.makedirs(temp_dir_pages_idx, exist_ok=True)
                page.save(os.path.join(temp_dir_pages_idx, 'Content.xml'))
            # static resources (images, ...)
            for file_name, payload in self.res_static.items():
                with open(os.path.join(temp_dir_res, file_name), "wb") as f:
                    f.write(payload)
            # zip the tree into an in-memory OFD archive
            buffer = io.BytesIO()
            with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
                for path, dirnames, filenames in os.walk(temp_dir):
                    # strip the temp root so archive paths are relative
                    fpath = path.replace(temp_dir, '')
                    for filename in filenames:
                        archive.write(os.path.join(path, filename), os.path.join(fpath, filename))
            return buffer.getvalue()
if __name__ == "__main__":
    # Manual smoke test: build a minimal OFD with one image and two text objects.
    print("---------")
    # static resources (image bytes deliberately left empty here)
    img_path = r"F:\code\easyofd\test\test_img0.jpg"
    # with open(img_path, "rb") as f:
    #     content = f.read()
    content = b""
    res_static = {"Image_0.jpg": content}
    # build the template payloads
    font = [
        {
            "@FontName": "宋体",
            "@FamilyName": "宋体",
        }
    ]
    MultiMedia = [
        {
            "@Type": "Image",
            "ofd:MediaFile": "Image_0.jpg"
        }
    ]
    ImageObject = [{
        "@CTM": "200 0 0 140 0 0",
        "@Boundary": "0 0 200 140",
        "@ResourceID": "55"
    }]
    TextObject = [
        {
            "@Boundary": "50 5 100 20",
            "@Font": "2",
            "@Size": "5",
            "ofd:FillColor": {
                "@Value": "156 82 35",
                "@ColorSpace" : "1"
            },
            "ofd:TextCode": {
                "@X": "5",
                "@Y": "5",
                "@DeltaX": "7 7 7 7 7 7 7 7 7",
                "#text": "电⼦发票(普通发票)"
            }
        }, {
            "@Boundary": "0 0 100 100",
            "@Font": "2",
            "@Size": "10",
            "ofd:FillColor": {
                "@Value": "156 82 35"
            },
            "ofd:TextCode": {
                "@X": "0",
                "@Y": "0",
                "@DeltaX": "0",
                "#text": "电"
            }
        }
    ]
    # instantiate the templates — they all share one CurId allocator
    id_obj = CurId()
    print("id_obj实例化", id_obj)
    ofd = OFDTemplate(id_obj=id_obj)
    document = DocumentTemplate(id_obj=id_obj)
    public_res = PublicResTemplate(Font=font, id_obj=id_obj)
    document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
    # ImageObject=ImageObject
    content_res = ContentTemplate(CGTransform=[], PathObject=[], TextObject=TextObject, ImageObject=[], id_obj=id_obj)
    ofd_byte = OFDStructure("123",ofd=ofd, document=document,public_res=public_res,
                            document_res=document_res, content_res=[content_res], res_static=res_static)(test=True)
    with open("test.ofd", "wb") as f:
        content = f.write(ofd_byte)
import os
import re
import io
import json
import time
import copy
import string
import random
from uuid import uuid1
from decimal import Decimal
from collections import OrderedDict
# 第三方包
import fitz
from PIL import Image
# import pdfplumber
# NOTE(review): `__ALL__` is a misspelling of `__all__`, so this list has no
# effect on `from module import *`; rename once it is confirmed that
# `pdf_ocr` is actually defined in this module.
__ALL__ = ['pdf_ocr',"DPFParser"]
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also handles ``bytes`` (via str()) and ``Decimal``."""
    def default(self, obj):
        # bytes -> their str() form, e.g. b'x' -> "b'x'"
        if isinstance(obj, bytes):
            return str(obj)
        # Decimal -> plain float
        if isinstance(obj, Decimal):
            return float(obj)
        # anything else: defer to the base class (raises TypeError)
        return super().default(obj)
class DPFParser(object):
def __init__(self, ):
pass
def extract_text_with_details(self, pdf_bytes):
"""
提取PDF每页的文本及其位置、字体信息。
:param pdf_path: PDF文件路径
:return: 包含每页文本及其详细信息的列表
[[
]]
"""
details_list = []
pdf_stream = io.BytesIO(pdf_bytes)
# 使用fitz.open直接打开BytesIO对象
with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
res_uuid_map = {
"img": {},
"font": {},
"other": {}
} # 全局资源标识
for page_num in range(len(doc)):
page_details_list = [] # 页面内信息
page = doc.load_page(page_num)
rect = page.rect
width = rect.width
height = rect.height
if res_uuid_map["other"].get("page_size"):
res_uuid_map["other"]["page_size"][page_num] = [width,height]
else :
res_uuid_map["other"]["page_size"] = {page_num: [width, height]}
blocks = page.get_text("dict").get("blocks") # 获取文本块信息
image_list = page.get_images(full=True) # 获取页面上所有图片的详细信息
# print(blocks)
# 获取页面内文本信息
for block in blocks:
block_text = block.get("text", "")
block_rect = block["bbox"] # 文本块的边界框,格式为[x0, y0, x1, y1]
# 遍历块中的每一行
for line in block.get("lines", []):
line_text = line.get("spans", [{}])[0].get("text", "") # 单行文本
line_rect = line["bbox"] # 行的边界框
# 遍历行中的每一个跨度(span),获取字体信息
for span in line.get("spans", []):
span_text = span.get("text", "")
font_size = span.get("size") # 字体大小
font_name = span.get("font") # 字体名称
res_uuid = None
if font_name not in res_uuid_map["font"].values():
res_uuid = str(uuid1())
res_uuid_map["font"][res_uuid] = font_name
else:
keys = list(res_uuid_map["font"].keys())
vs = list(res_uuid_map["font"].values())
idx = vs.index(font_name)
res_uuid =keys[idx]
font_color = span.get("color") # 字体颜色,默认可能没有
span_rect = (
line_rect[0], line_rect[1], line_rect[2], line_rect[3]) # 使用行的边界框作为参考,具体到单个字符或词可能需要更复杂的处理
# 打印或存储信息
print(
f"Page: {page_num }, Text: '{span_text}', Font: {font_name}, Size: {font_size}, "
f"Color: {font_color}, Rect: {span_rect} ,res_uuid {res_uuid}")
# 存储信息到details_list中(根据需要调整存储格式)
page_details_list.append({
"page": page_num,
"text": span_text,
"font": font_name,
"res_uuid": res_uuid,
"size": font_size,
"color": font_color,
"bbox": list(span_rect),
"type": "text"
})
for image_index, img_info in enumerate(image_list):
# 解析图片信息
xref = img_info[0]
base_image = doc.extract_image(xref)
image_data = base_image["image"] # 图片数据
res_uuid = str(uuid1())
img_io = io.BytesIO(image_data)
res_uuid_map["img"][res_uuid] = img_io
image_type = base_image["ext"] # 图片类型
smask = base_image["smask"] # 图片类型
xres = base_image["xres"] # 图片类型
yres = base_image["yres"] # 图片类型
width = base_image["width"] # 图片宽度
height = base_image["height"] # 图片高度
# 计算坐标(左下角和右上角)
x0, y0, x1, y1 = xres, yres,xres+width,yres+height
print(
f"Page: {page_num}, image_type: '{image_type}',x0{x0}, y0{y0}, x1{x1}, y1{y1} ")
page_details_list.append({
"page": page_num,
"index": image_index,
"x0": x0,
"y0": y0,
"x1": x1,
"y1": y1,
"bbox": [x0,y0,width,height],
"width": width,
"height": height,
"res_uuid": res_uuid,
"image_type": image_type,
"type": "img"
})
details_list.append(page_details_list)
# print("details_list",details_list)
return details_list, res_uuid_map
def to_img(self, buffer_pdf):
    """Render every page of an in-memory PDF to a fitz pixmap.

    :param buffer_pdf: raw pdf bytes
    :return: list of fitz.Pixmap, one per page
    """
    doc = fitz.open(stream=buffer_pdf)
    # Default rendering is 792x612 @ 96 dpi; a 1.33333333 zoom per axis
    # yields roughly 1056x816 (2 would give 1584x1224).
    zoom = 1.33333333
    matrix = fitz.Matrix(zoom, zoom).prerotate(0)
    return [
        doc[page_index].get_pixmap(matrix=matrix, alpha=False)
        for page_index in range(doc.page_count)
    ]
def get_size(self):
    # Placeholder: size reporting is not implemented yet.
    pass
def coast_time(func):
    '''
    Decorator: measure and print the wall-clock time spent in *func*.
    The wrapped function's return value is passed through unchanged.
    '''
    from functools import wraps  # local import keeps this edit self-contained

    @wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def fun(*args, **kwargs):
        t = time.perf_counter()
        result = func(*args, **kwargs)
        print(f'function {func.__name__} coast time: {time.perf_counter() - t:.8f} s')
        return result
    return fun
class BaseInit:
    '''
    Base bookkeeping needed to parse one pdf: paths, output folders and
    short-id generation.
    '''
    def __init__(self, pdf_path, output_path):
        self.file_path = pdf_path
        self.output_path = output_path
        # File name / stem / extension. splitext also handles names without
        # an extension (the old slice-based stem returned '' in that case).
        self.file_name = os.path.basename(self.file_path)
        self.file_no_suffix, self.fileType = os.path.splitext(self.file_name)
        # 62-char alphabet used by genShortId (letters + digits).
        self.uuidChars = tuple(string.ascii_letters + string.digits)
        # Table cell delimiter / empty-cell placeholder.
        self.divide = ':'
        self.solid = ''
        # Minimum IoU ratio when matching lines against table regions.
        self.iou_rate = 0.001
        # Create the intermediate directories used throughout the run.
        self.init_file()

    def init_file(self):
        """Create the intermediate output folders and the result json path."""
        self.image_folder_path = os.path.join(self.output_path, 'pdf_img_save')
        self.json_folder_path = os.path.join(self.output_path, 'json')
        self.ocr_result_path = os.path.join(self.json_folder_path, self.file_no_suffix + '.json')
        # exist_ok avoids the check-then-create race of the old code
        for path in (self.image_folder_path, self.json_folder_path):
            os.makedirs(path, exist_ok=True)

    def genShortId(self, length=12):
        """Return a random alphanumeric id of *length* characters (length >= 8).

        The first 8 chars are derived from a uuid1; the remaining
        length - 8 chars are sampled from the uuid's hex digits.
        """
        hex32 = str(uuid1()).replace('-', '')
        head = ''
        for i in range(8):
            chunk = int(hex32[i * 4: i * 4 + 4], 16)
            head += str(self.uuidChars[chunk % 0x3E])  # 0x3E == 62 == len(alphabet)
        return head + ''.join(random.sample(hex32, length - 8))
class PageInfo(BaseInit):
    '''
    Class-level registry of the images and tables found on each page.
    NOTE: these dicts are class attributes, shared by all instances.
    '''
    __page_image = {}
    __page_table = {}

    @classmethod
    def add_image(cls, page_num, image):
        # setdefault replaces the get-then-assign dance (one lookup).
        cls.__page_image.setdefault(page_num, []).append(image)

    @classmethod
    def add_table(cls, page_num, table):
        cls.__page_table.setdefault(page_num, []).append(table)

    @classmethod
    def get_image(cls, page_num):
        return cls.__page_image.get(page_num, [])

    @classmethod
    def get_table(cls, page_num):
        return cls.__page_table.get(page_num, [])

    @classmethod
    def save_image(cls, output_path, file):
        '''
        Save all registered images to <output_path>/page_img_save/ as .jpg.
        :param output_path: base output directory
        :param file: source file name (extension stripped for the prefix)
        '''
        if not cls.__page_image:
            return  # nothing registered: keep the old behavior of creating no dir
        file = file.split('.')[0]
        img_dir = os.path.join(output_path, 'page_img_save')
        os.makedirs(img_dir, exist_ok=True)  # hoisted out of the loop; race-safe
        for images in cls.__page_image.values():
            for image in images:
                image_content = image['objContent']  # raw image bytes
                img_path = os.path.join(img_dir, file + '_' + image['name'] + '.jpg')
                with open(img_path, 'wb') as fp:
                    fp.write(image_content)
class ParseFile(PageInfo):
    """End-to-end parser: fitz lines + pdfplumber tables -> ocr-style json."""
    def __init__(self, pdf_path, output_path, table_type='v2', is_save=True):
        super().__init__(pdf_path, output_path)
        print('初始化 pdf 对象:{}'.format(self.file_path))
        # Whether to persist the parse result to disk.
        self.is_save = is_save
        # 'v2' merges table rows into the line list; anything else keeps them separate.
        self.table_type = table_type
        # v1 result list: lines and tables kept apart
        self.page_result_list = []
        # v2 result list: tables merged into the line list
        self.combine_page_result_list = []
    @coast_time
    def get_result(self):
        """Load the pdf, parse every page and return the per-page result list."""
        self.load_pdf()
        result = self.parse_pdf()
        # Keep the result on the instance for callers that hold the object.
        self.ocr_result = result
        print(f'解析完成:共 {len(result)} 页 表格类型: {self.table_type}')
        return result
    def load_pdf(self):
        """Open the pdf with fitz (PyMuPDF)."""
        self.fitz_doc = fitz.open(self.file_path, filetype='pdf')
        # NOTE(review): parse_pdf() reads self.pdfplum_doc_pages, but the
        # pdfplumber load below is commented out — parse_pdf raises
        # AttributeError as-is. Confirm whether this is intentional.
        # self.pdfplum_doc_pages = pdfplumber.open(self.file_path).pages
        # assert len(self.fitz_doc) == len(self.pdfplum_doc_pages)
    def parse_pdf(self):
        """Parse every page (lines, tables, images) and return the result list."""
        for page_no, fitz_doc in enumerate(self.fitz_doc):
            # debug helper: restrict to a single page
            # if page_no != 25:
            #     continue
            self.height = fitz_doc.get_text('dict')['height']
            self.width = fitz_doc.get_text('dict')['width']
            # Aggregate chars / lines / blocks parsed by fitz for this page
            line_list = self.group_block(page_no, fitz_doc)
            # Page tables. NOTE(review): self.pdfplum_doc_pages is only set by a
            # commented-out line in load_pdf — this raises AttributeError as-is.
            table_list = self.extract_table(page_no, self.pdfplum_doc_pages[page_no])
            # Compute row/column merge (span) info for each table
            table_list = list(CalcTableRL(table_list).run())
            # Page images from the class-level registry
            image_list = self.get_image(page_no)
            # Build the per-page result structure
            page_result = self.construct_final_result(line_list, page_no, image_list, table_list)
            if self.table_type == 'v2':
                # Merge tables into the line list (ocr-compatible format)
                combine_page_result_list = self.combine_table_v2(page_result)
                page_result = self.construct_final_result(combine_page_result_list, page_no, image_list, table_list)
            self.page_result_list.append(page_result)
            if page_no and page_no % 10 == 0:
                print(f'解析前 {page_no} 页完成')
        final_result_list = copy.deepcopy(self.page_result_list)
        # Convert to the ocr parsing format
        if self.table_type == 'v2':
            final_result_list = self.reform_ocr_result(final_result_list)
        # 2023/09/26: add contIndex for the downstream extraction model
        for page_num, page in enumerate(final_result_list):
            if not page.get('lineList'):
                # NOTE(review): `break` abandons all remaining pages on the first
                # page without lines — `continue` may have been intended.
                break
            contIndex = {}
            for line in page['lineList']:
                line_bak = dict(copy.copy(line))
                line_bak["objType_postpreprocess"] = f"{line_bak.get('objType','textLine')}_postpreprocess"
                contIndex[line_bak["lineId"]] = line_bak
            page["contIndex"] = contIndex
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        # Persist locally
        if self.is_save:
            self.save_result(final_result_list)
        for page_num, page in enumerate(final_result_list):
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        return final_result_list
def combine_table_v2(self, page_result):
lineList = page_result['lineList']
table_list = page_result['table_list']
# 先进行表格行、非表格行划分 减少后续操作的时间杂度
__notable_lines, __all_table_lines = self.filter_table_line(lineList, table_list)
notable_lines, all_table_lines = copy.deepcopy(__notable_lines), copy.deepcopy(__all_table_lines)
del __notable_lines, __all_table_lines, lineList
# 整合
combine_page_result_list = self.combine_table_with_line(notable_lines, all_table_lines, table_list)
return combine_page_result_list
    def filter_table_line(self, lineList, table_list):
        '''
        Split lineList into table / non-table lines. A 'table' placeholder
        string is inserted into __notable_lines where each table's rows
        belong so they can be spliced back in later.
        __notable_lines: lines outside any table (plus 'table' placeholders)
        __all_table_lines: one list of in-table lines per table
        '''
        __notable_lines = []
        __all_table_lines = []
        for table_info in table_list:
            table_bbox = table_info['objPos']
            # Lines that belong to the current table
            __sub_table_lines = []
            is_iter_table = False
            while lineList:
                line = lineList.pop(0)
                line_bbox = line['objPos']
                # Empty-table false positive: once a line's top y passes the
                # table's bottom y, no later line can belong to this table.
                table_y, line_y = table_bbox[3], line_bbox[1]
                if line_y >= table_y:
                    lineList.insert(0, line)
                    break
                iou = self.count_iou(table_bbox, line_bbox)
                # Line overlaps the table region
                if iou > 0:
                    __sub_table_lines.append(line)
                    # First line matched to this table
                    if not is_iter_table:
                        is_iter_table = True
                        # Insert the placeholder marker
                        __notable_lines.append('table')
                elif iou <= 0 and not is_iter_table:
                    __notable_lines.append(line)
                # No overlap after matches began: the table may have ended
                elif iou <= 0 and is_iter_table:
                    lineList.insert(0, line)
                    line_index, flag = self.more_judge(table_bbox, lineList)
                    if flag:
                        # Skip ahead to line_index and keep scanning this table
                        __notable_lines.extend(lineList[:line_index])
                        lineList = lineList[line_index:]
                    else:
                        break
            __all_table_lines.append(__sub_table_lines)
        # All tables processed: whatever remains is non-table content
        if lineList:
            __notable_lines.extend(lineList)
        return __notable_lines, __all_table_lines
def more_judge(self, table_bbox, lineList, max_judge=6):
'''
判断后续行列表是否还存在属于当前表格的行
对于表格、行界限不明显的额外判断 如: 页面分栏、表格不全
:return 是否存在 True | False
'''
# 往后多判断 max_judge 行
if len(lineList) < max_judge:
judge_lines = lineList
else:
judge_lines = lineList[:max_judge]
for index, line in enumerate(judge_lines):
line_bbox = line['objPos']
iou = self.count_iou(table_bbox, line_bbox)
if iou > 0:
return index, True
return index, False
    def combine_table_with_line(self, notable_lines, all_table_lines, table_list):
        '''
        Merge lines and their chars into the matching table rows/cells, then
        splice the merged rows back where the 'table' placeholder was inserted.
        '''
        for table_id, table in enumerate(table_list):
            new_table_lines = []
            for table_line in table['lineList']:
                is_iter_table = False
                table_line_bbox = table_line['objPos']
                # Match every candidate line of this table against the row
                for __line in all_table_lines[table_id]:
                    line = copy.deepcopy(__line)
                    line_bbox = line['objPos']
                    iou = self.count_iou(table_line_bbox, line_bbox)
                    # First match: replace the text line's content/bbox with the
                    # table row's; the line's other fields are kept.
                    if iou > self.iou_rate and not is_iter_table:
                        is_iter_table = True
                        line['objContent'] = table_line['objContent']
                        line['objPos'] = table_line['objPos']
                        line['objType'] = 'table'
                        line['tableId'] = table_id
                        self.combine_cell_with_span(table_line, line)
                        line['cells'] = table_line['cells']
                        new_table_lines.append(line)
                    elif iou > self.iou_rate and is_iter_table:
                        # Further matching lines only contribute their chars
                        self.combine_cell_with_span(table_line, line)
                    else:
                        pass
            if 'table' not in notable_lines or not new_table_lines:
                # FIX ERROR: 'table' is not in list
                # Handles a small table detected inside a larger one.
                # Possible bug: nested large tables can desync the number of
                # placeholders vs. row groups.
                continue
            # Replace the 'table' placeholder with the merged rows, flattened
            table_index = notable_lines.index('table')
            new_notable_lines = notable_lines[:table_index]
            new_notable_lines.extend(new_table_lines)
            notable_lines = new_notable_lines + notable_lines[table_index+1:]
        return notable_lines
def combine_cell_with_span(self,table_line , text_line):
'''
将表格的cell内加上对应span的chars信息:解决表格合并时cell有多行导致chars顺序错乱的问题
'''
del_list = []
for index, cell in enumerate(table_line['cells']):
if not cell.get('chars'):
cell['chars'] = []
cell_bbox = cell['objPos']
if cell_bbox is None:
del_list.append(index)
continue
for span in text_line['span']:
span_bbox = span['bbox']
iou = self.count_iou(cell_bbox, span_bbox)
if iou < self.iou_rate:
continue
# 为了解决一些 span 和 cell 长度不一致问题 将循环细分到每个字符chars
for char in span['chars']:
char_bbox = char['bbox']
iou = self.count_iou(cell_bbox, char_bbox)
if iou > self.iou_rate:
cell['chars'].append(char)
else:
pass
# 清除无效的span
if len(del_list):
for index, index_del in enumerate(del_list):
index_del -= index
del table_line['cells'][index_del]
    def group_block(self, page_num, fitz_doc):
        """
        Combine the two fitz extractions so every span carries its chars.
        Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpagedict
        :param page_num: page index (used for ids and the image registry)
        :param fitz_doc: a fitz page object
        :return: total_info — list of per-line dicts (see construct_line_info)
        """
        line_count = 0
        total_line_list = []
        # char_blocks: finest granularity is the individual character
        char_blocks = fitz_doc.get_text('rawdict')['blocks']
        # block_blocks: finest granularity is the span within a line
        block_blocks = fitz_doc.get_text('dict')['blocks']
        # Sort both block lists top-to-bottom, then left-to-right
        char_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
        block_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
        # Pair the two extractions up. NOTE(review): assumes both produce the
        # same number of blocks in the same order — confirm for edge cases.
        group_blocks = zip(block_blocks, char_blocks)
        for span_blocks, char_block in group_blocks:
            if span_blocks['type'] == 1:
                # Image block: register the embedded image and skip text handling
                img_attrs = self.deal_image(page_num, line_count, span_blocks)
                self.add_image(page_num, img_attrs)
                continue
            for line_index, line in enumerate(span_blocks['lines']):
                line['text'] = ''
                line['chars'] = []
                line['span'] = []
                # Merge each line here (and attach per-char info) to keep the
                # time complexity down
                for span_index, span in enumerate(line['spans']):
                    span['text'] = span['text'].replace(' ', '').strip()
                    if not span['text']:
                        continue
                    # Attach the rawdict chars to the matching dict-mode span
                    span_chars = char_block['lines'][line_index]['spans'][span_index]['chars']
                    span_chars = [char for char in span_chars if char['c'].strip()]
                    line['text'] += span['text']
                    line['chars'].extend(span_chars)
                    line['span'].append({'bbox': span['bbox'], 'chars': span_chars,'text': span['text']})
                if not line['text']:
                    continue
                # Build the per-line structure
                line_info = self.construct_line_info(line['text'], line['bbox'], line['span'], line['chars'],
                                                     line_count, page_num)
                total_line_list.append(line_info)
                line_count += 1
        return total_line_list
def extract_table(self, page_no, plum_page):
'''
提取页面所有表格
:param page_no:
:param plum_page:
:return:
'''
table_list = []
for table in plum_page.find_tables():
# 获取当前表格的边界定位
table_line_list = self.merge_table_row(table)
if not table_line_list:
continue
table_info = self.deal_table(page_no, table.bbox, table_line_list)
table_list.append(table_info)
# 将表格信息加入全局变量 | 此处有点有点冗余
self.add_table(page_no, table_info)
return table_list
def merge_table_row(self, table):
'''
表格cell 按行合并
:param table:
:return: [({line_text}, {line_bbox}), ...]
'''
table_line_list = []
for item, row in zip(table.extract(), table.rows):
# 表格每行预处理
table_line = self.divide.join([self.clear_text(txt) for txt in item])
# 判断当前行是否为空
__line = self.clear_text(table_line).replace(' ', '')
if not __line:
continue
table_line_list.append((table_line, row.bbox, zip(item, row.cells)))
return table_line_list
def clear_text(self, txt, retrans=False):
if retrans:
txt = txt.replace(self.solid, '').replace(self.divide, '')
else:
# 空列替换为占位符
txt = txt if txt else self.solid
return str(txt).replace('\n', '').replace(' ', '')
def deal_table(self, page_no, table_bbox, table_line_list):
'''
对表格做结构转换
:param page_no:
:param table_bbox:
:param table_line_list:
:return:
'''
table_first_line = self.clear_text(table_line_list[0][0], retrans=True)
table_id = '{0}_{1}_'.format(page_no, table_first_line) + self.genShortId()
lineList = [{
'objContent': line[0],
'objPos': line[1],
'cells': self.deal_table_cell(line[2])
} for line in table_line_list]
table_info = {
'tableId': table_id,
'name': table_id,
'objPos': table_bbox,
'lineList': lineList,
}
return table_info
def deal_table_cell(self, cells):
return [{"objContent": self.clear_text(text), "objPos": box} for text, box in cells]
def deal_image(self, page_num, name, img_attrs):
'''
对image做结构转换
:param page_num:
:param name:
:param img_attrs:
:return:
'''
image_id = '{0}_{1}_'.format(page_num, name) + self.genShortId()
img_info = {
'imageId': image_id,
'name': image_id, # 暂时以图片所在页面的行数命名
'objPos': img_attrs['bbox'],
'ext': img_attrs['ext'],
'objContent': img_attrs['image'],
'size': img_attrs['size']
}
return img_info
def deal_chars(self, line_num, lineId, chars):
'''
对chars做结构转换
:param line_num:
:param lineId:
:param chars:
:return:
'''
num_count = 0
char_list = []
for char in chars:
if not char['c'].strip():
continue
char_dict = {
'lineId': lineId,
'charId': 'char_' + str(line_num) + '_' + str(num_count) + '_' + self.genShortId(),
'objContent': char['c'],
'objPos': char['bbox']
}
char_list.append(char_dict)
num_count += 1
return char_list
def construct_line_info(self, text, rect, span, chars, count, pageNo, objType='textLine'):
'''
对每行做结构转换
# x, y, h, w = rect[0], rect[1], rect[3] - rect[1], rect[2] - rect[0]
'''
lineId = 'line_' + str(pageNo) + '_' + str(count) + '_' + self.genShortId()
chars = self.deal_chars(count, lineId, chars)
return OrderedDict({
'lineNo': count,
'lineId': lineId,
'objType': objType,
'objContent': re.sub(r'\s', '', text),
'chars': chars,
'objPos': rect,
'span': span
})
@staticmethod
def rect_format(bbox):
'''
数据坐标转换 x1, y1, x2, y2 >> y1, x1 h, w
:param rect: [x1, y1, x2, y2]
:return: [y, x, h, w]
'''
y, x, h, w = bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
return [y, x, h, w]
def count_iou(self, RecA, RecB):
'''
计算边框交并比
左上边界坐标为Ax0, Ay0, Bx0, By0
右下边界坐标为Ax1, Ay1, Bx1, By1
交集面积计算为:
M = min(Ax1, Bx1) - max(Ax0, Bx0)
H = min(Ay1, By1) - max(Ay0, By0)
# 当前表格的边界信息
left_x, top_y, right_x, botm_y: table_box_info[0], table_box_info[1], table_box_info[2], table_box_info[3]
'''
M = min(RecB[2], RecA[2]) - max(RecB[0], RecA[0])
H = min(RecB[3], RecA[3]) - max(RecB[1], RecA[1])
# 计算交集部分面积
interArea = max(0, M) * max(0, H)
# 计算两个边框的面积
RecA_Area = (RecA[2] - RecA[0]) * (RecA[3] - RecA[1])
RecB_Area = (RecB[2] - RecB[0]) * (RecB[3] - RecB[1])
# 计算IOU
iou = interArea / float(RecA_Area + RecB_Area - interArea)
return iou
def construct_final_result(self, line_list, pageNo, image_list=[], table_list=[]):
'''
每页转换为最终数据结构
:param line_list: ocr每行结果
:param pageNo: 页码
:param image_list:
:param table_list:
:return: type: Dict
'''
document_id = 'v1' + '_' + self.file_no_suffix + '_' + self.genShortId()
return OrderedDict({
'pageNo': pageNo,
'docID': document_id,
'page_info':{'size': [self.width, self.height]},
'lineList': line_list,
'image_list': image_list if image_list else [],
'table_list': table_list if table_list else []
})
def save_result(self, final_result_list):
'''
保存结果数据至本地
'''
if self.table_type == 'v2':
with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
json.dump(final_result_list, f, indent=4, ensure_ascii=False)
else:
with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
json.dump(self.page_result_list, f, cls=MyEncoder, indent=4, ensure_ascii=False)
    def reform_ocr_result(self, final_result_list):
        """
        Final post-processing into ocr format: renumber the lines, convert
        all bboxes to [y, x, h, w] and compute per-char offsets.
        :param final_result_list: merged local-parse / ocr results
        """
        for result_list in final_result_list:
            del result_list['image_list']
            del result_list['table_list']
            lineList = result_list['lineList']
            for num, line in enumerate(lineList):
                # Rewrite line number and line id with the new position
                line['lineNo'] = str(num)
                line_split = line['lineId'].split('_')
                line_split[-2] = str(num)
                line['lineId'] = '_'.join(line_split)
                # Convert coordinate formats
                obj_type = line['objType']
                # Per-char x/y offsets relative to the line's own origin
                offset_x_list, offset_y_list = self.coord_offset(line, obj_type)
                line['objPos'] = self.rect_format(line['objPos'])
                # NOTE(review): appends the x-offset list as a 5th objPos
                # element — presumably what the consumer expects; confirm.
                line['objPos'].append(offset_x_list)
                line['chars_offset'] = [offset_x_list, offset_y_list]
                if line.get('chars'):
                    del line['chars']
                if obj_type == 'table' and line.get('span'):
                    del line['span']
        return final_result_list
    def coord_offset(self, line, obj_type='textLine'):
        '''
        Compute each char's top-left offset relative to the line's top-left,
        converting span/cell/char bboxes to ocr format along the way.
        @obj_type: textLine | table
        :return: (offset_x_list, offset_y_list)
        '''
        offset_x_list = []
        offset_y_list = []
        line_x, line_y = line['objPos'][0], line['objPos'][1]
        if obj_type == 'textLine':
            for span in line['span']:
                self.all_rect_format(span)
                for char in span['chars']:
                    # Offsets use the char bbox before all_rect_format mutates it
                    char_x, char_y = char['bbox'][0], char['bbox'][1]
                    offset_x_list.append(char_x - line_x)
                    offset_y_list.append(char_y - line_y)
                    self.all_rect_format(char)
        else:
            # Table line: work on deep copies of the cells so the shared
            # table structures are not mutated in place.
            __cells = []
            for num, _cell in enumerate(line['cells']):
                cell = copy.deepcopy(_cell)
                self.all_rect_format(cell)
                for char in cell['chars']:
                    char_x, char_y = char['bbox'][0], char['bbox'][1]
                    offset_x_list.append(char_x - line_x)
                    offset_y_list.append(char_y - line_y)
                    self.all_rect_format(char)
                __cells.append(cell)
            line['cells'] = __cells
        return offset_x_list, offset_y_list
    def all_rect_format(self, obj):
        '''
        Convert one span / cell / char dict to the ocr structure in place:
        text -> objContent, bbox/objPos -> [y, x, h, w].
        '''
        if 'chars' in obj:
            # span or cell (carries a 'chars' list)
            if obj.get('text'):
                obj['objContent'] = obj['text']
                del obj['text']
            if obj.get('objPos'):
                obj['objPos'] = self.rect_format(obj['objPos'])
            elif obj.get('bbox'):
                obj['objPos'] = self.rect_format(obj['bbox'])
                del obj['bbox']
        else:
            # single char from fitz rawdict: {'c': ..., 'bbox': ...}
            obj['objContent'] = obj['c']
            obj['objPos'] = self.rect_format(obj['bbox'])
            del obj['c']
            del obj['bbox']
class CalcTableRL:
    '''
    Recover the table's implicit grid and compute row/column span info.
    Input: a single table dict, or a list of them, where every cell carries
    an 'objPos' bbox. Adds cell['col_start_end'] and cell['row_start_end'].
    '''
    def __init__(self, table_info):
        # Either one table dict or a list of table dicts.
        self.table_info = table_info

    def run(self):
        """Yield each table with row/column span info added."""
        # Normalize to a list so both input shapes share one code path
        # (the original duplicated the yield in two branches).
        tables = self.table_info if isinstance(self.table_info, list) else [self.table_info]
        for table_info in tables:
            yield self.add_table_property(table_info)

    def add_table_property(self, table_info):
        '''
        Annotate every cell with its grid position:
        cell['col_start_end'] = (col_start, col_end)
        cell['row_start_end'] = (row_start, row_end)
        '''
        # Deduplicated x / y coordinates of all cell corners
        set_x, set_y = self.collect_table_coord(table_info)
        # Sorted unique coordinates define the finest-grained virtual grid
        list_x, list_y = sorted(set_x), sorted(set_y)
        for line in table_info['lineList']:
            for cell in line['cells']:
                if cell['objPos'] is None:  # bugfix: was `== None`
                    continue
                x1, y1, x2, y2 = cell['objPos']
                # Grid indices of the cell's corner coordinates
                cell['col_start_end'] = (list_x.index(x1), list_x.index(x2))
                cell['row_start_end'] = (list_y.index(y1), list_y.index(y2))
        return table_info

    def collect_table_coord(self, table_info):
        '''
        Collect the deduplicated x and y coordinates of every cell bbox.
        :param table_info: single table dict
        :return: set(x), set(y)
        '''
        set_x = set()
        set_y = set()
        for line in table_info['lineList']:
            for cell in line['cells']:
                if cell['objPos'] is None:
                    continue
                x1, y1, x2, y2 = cell['objPos']
                set_x.update((x1, x2))
                set_y.update((y1, y2))
        return set_x, set_y
def pdf_ocr(pdf_path, output_path, table_type='v2', is_save=True):
    '''
    Thin wrapper around ParseFile for easy calling / multi-processing.
    Returns the ParseFile instance after parsing.
    '''
    parser = ParseFile(pdf_path, output_path, table_type, is_save)
    parser.get_result()
    return parser
# --------------------------- Test cases below -----------------------------------
@coast_time
def test_dir():
    """Parse every matching pdf under a fixed input directory."""
    input_root = r'E:\workplace\cjhx_test\创金和信\pdf2json\input\all_test'
    output_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\file_data\all_test'
    for dir_path, _subdirs, files in os.walk(input_root):
        for file in files:
            if 'test.pdf' not in file:
                continue
            pdf_ocr(os.path.join(dir_path, file), output_dir)
@coast_time
def test_single():
    """Parse one hard-coded pdf in v2 table mode."""
    file_path = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/20220913-浙江省贰号职业年金计划银华资产组合2022年二季度管理费用支付指令.pdf'
    output_dir = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/电子解析'
    pdf_ocr(file_path, output_dir, table_type='v2')
@coast_time
def test_thread():
    """Parse a directory of pdfs in parallel with a process pool (8 workers)."""
    from concurrent.futures import ProcessPoolExecutor
    pool = ProcessPoolExecutor(max_workers=8)
    output_dir = r'E:\workplace\daily_work\pdf2json\output\签字模板二'
    for dir_path, _subdirs, files in os.walk(r'E:\workplace\daily_work\pdf2json\input\签字模板二'):
        for file in files:
            future = pool.submit(pdf_ocr, os.path.join(dir_path, file), output_dir, table_type='v2')
            future.add_done_callback(print_callback)
    pool.shutdown()
def print_callback(ret):
    """Completion callback for pool futures; result printing is disabled."""
    # print('ret:', ret.result())
    pass
if __name__ == '__main__':
    # test_dir()
    # test_thread()
    # test_single()
    # Smoke test: render a local pdf to images via DPFParser (defined above).
    pdf_obj = DPFParser()
    with open(r"F:\code\easyofd\test\test.pdf","rb") as f:
        pdf_bytes = f.read()
    img_list = pdf_obj.to_img(pdf_bytes)
    pil_img_list = []  # NOTE(review): never appended to — appears unused
    for _img in img_list:
        print(_img.width,_img.height)
        img = Image.frombytes("RGB", [_img.width, _img.height], _img.samples)
        print(type(img))
        # NOTE(review): same filename every iteration — only the last page survives.
        img.save('output_image.png')
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return 'data_source', falling back to the legacy 'file_source' key."""
    source = jso.get("data_source")
    if source is not None:
        return source
    return jso.get("file_source")
def get_data_type(jso: dict):
    """Return 'data_type', falling back to the legacy 'file_type' key."""
    dtype = jso.get("data_type")
    if dtype is not None:
        return dtype
    return jso.get("file_type")
def get_bookid(jso: dict):
    """Return 'bookid', falling back to the legacy 'original_file_id' key."""
    identifier = jso.get("bookid")
    if identifier is not None:
        return identifier
    return jso.get("original_file_id")
def exception_handler(jso: dict, e):
    """Log *e*, mark *jso* as dropped with reason Exception, and return it."""
    logger.exception(e)
    jso.update({
        "_need_drop": True,
        "_drop_reason": DropReason.Exception,
        "_exception": f"ERROR: {e}",
    })
    return jso
def get_bookname(jso: dict):
    """Compose '<data_source>/<file_id>' as the book name."""
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    return "{}/{}".format(data_source, file_id)
def spark_json_extractor(jso: dict) -> dict:
    """Extract the pdf type and layout model list from a spark json record."""
    extracted = {}
    extracted["_pdf_type"] = jso["_pdf_type"]
    extracted["model_list"] = jso["doc_layout_result"]
    return extracted
import html
def decode_html_entities(text):
    """Convert HTML entities (e.g. &amp;, &nbsp;, &#33;) to literal characters."""
    return html.unescape(text)
# Sample text containing HTML entities
text = "这是一个&ast;示例&nbsp;文本,包含&nbsp;HTML&nbsp;转义字符。&#33;"
# Decode the HTML entities in the sample and print the result
decoded_text = decode_html_entities(text)
print(decoded_text)
import os
from pathlib import Path
import click
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    """CLI entry point: parse one pdf file or every *.pdf in a directory."""
    # Enable the bundled model in full mode before parsing.
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        # Read the pdf bytes from disk via the project reader abstraction.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        # Parse one document; errors are logged, never raised to the shell.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        # Directory input: parse every top-level *.pdf
        for doc_path in Path(path).glob('*.pdf'):
            parse_doc(doc_path)
    else:
        parse_doc(path)
# Allow running this module directly as the CLI entry point.
if __name__ == '__main__':
    cli()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment