Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...
Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
826086d2 · zhougaofeng · 57aaa1cf · 57aaa1cf · 57aaa1cf · 57aaa1cf
Commit 826086d2 authored Nov 12, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/__init__.py
+++ b/magic_pdf/__init__.py
--- a/magic_pdf/__pycache__/__init__.cpython-310.pyc
+++ b/magic_pdf/__pycache__/__init__.cpython-310.pyc
--- a/magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc
+++ b/magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc
--- a/magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc
+++ b/magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc
--- a/magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc
+++ b/magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc
--- a/magic_pdf/__pycache__/user_api.cpython-310.pyc
+++ b/magic_pdf/__pycache__/user_api.cpython-310.pyc
--- a/magic_pdf/config.ini
+++ b/magic_pdf/config.ini
-[server]
-pdf_server = http://0.0.0.0:4090
-
-ocr_server = http://0.0.0.0:4080
--- a/magic_pdf/dict2md/__init__.py
+++ b/magic_pdf/dict2md/__init__.py
--- a/magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc
+++ b/magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc
--- a/magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc
+++ b/magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc
--- a/magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc
+++ b/magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc
--- a/magic_pdf/dict2md/mkcontent.py
+++ b/magic_pdf/dict2md/mkcontent.py
-import math
-from loguru import logger
-
-from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
-from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.ocr_content_type import ContentType
-
-TYPE_INLINE_EQUATION = ContentType.InlineEquation
-TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
-UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
-
-
-@DeprecationWarning
-def mk_nlp_markdown_1(para_dict: dict):
-    """
-    对排序后的bboxes拼接内容
-    """
-    content_lst = []
-    for _, page_info in para_dict.items():
-        para_blocks = page_info.get("para_blocks")
-        if not para_blocks:
-            continue
-
-        for block in para_blocks:
-            item = block["paras"]
-            for _, p in item.items():
-                para_text = p["para_text"]
-                is_title = p["is_para_title"]
-                title_level = p['para_title_level']
-                md_title_prefix = "#"*title_level
-                if is_title:
-                    content_lst.append(f"{md_title_prefix} {para_text}")
-                else:
-                    content_lst.append(para_text)
-
-    content_text = "\n\n".join(content_lst)
-
-    return content_text
-
-
-
-# 找到目标字符串在段落中的索引
-def __find_index(paragraph, target):
-    index = paragraph.find(target)
-    if index != -1:
-        return index
-    else:
-        return None
-
-
-def __insert_string(paragraph, target, postion):
-    new_paragraph = paragraph[:postion] + target + paragraph[postion:] 
-    return new_paragraph
-
-
-def __insert_after(content, image_content, target):
-    """
-    在content中找到target，将image_content插入到target后面
-    """
-    index = content.find(target)
-    if index != -1:
-        content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
-    else:
-        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
-    return content
-
-def __insert_before(content, image_content, target):
-    """
-    在content中找到target，将image_content插入到target前面
-    """
-    index = content.find(target)
-    if index != -1:
-        content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
-    else:
-        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
-    return content
-
-
-@DeprecationWarning
-def mk_mm_markdown_1(para_dict: dict):
-    """拼装多模态markdown"""
-    content_lst = []
-    for _, page_info in para_dict.items():
-        page_lst = [] # 一个page内的段落列表
-        para_blocks = page_info.get("para_blocks")
-        pymu_raw_blocks = page_info.get("preproc_blocks")
-        
-        all_page_images = []
-        all_page_images.extend(page_info.get("images",[]))
-        all_page_images.extend(page_info.get("image_backup", []) )
-        all_page_images.extend(page_info.get("tables",[]))
-        all_page_images.extend(page_info.get("table_backup",[]) )
-        
-        if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
-            for img in all_page_images:
-                page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
-            page_md = "\n\n".join(page_lst)
-            
-        else:
-            for block in para_blocks:
-                item = block["paras"]
-                for _, p in item.items():
-                    para_text = p["para_text"]
-                    is_title = p["is_para_title"]
-                    title_level = p['para_title_level']
-                    md_title_prefix = "#"*title_level
-                    if is_title:
-                        page_lst.append(f"{md_title_prefix} {para_text}")
-                    else:
-                        page_lst.append(para_text)
-                        
-            """拼装成一个页面的文本"""
-            page_md = "\n\n".join(page_lst)
-            """插入图片"""
-            for img in all_page_images:
-                imgbox = img['bbox']
-                img_content = f"![]({img['image_path']})"
-                # 先看在哪个block内
-                for block in pymu_raw_blocks:
-                    bbox = block['bbox']
-                    if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
-                        for l in block['lines']:
-                            line_box = l['bbox']
-                            if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的，插入line前面
-                                line_txt = "".join([s['text'] for s in l['spans']])
-                                page_md = __insert_before(page_md, img_content, line_txt)
-                                break
-                            break
-                        else:# 在行与行之间
-                            # 找到图片x0,y0与line的x0,y0最近的line
-                            min_distance = 100000
-                            min_line = None
-                            for l in block['lines']:
-                                line_box = l['bbox']
-                                distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
-                                if distance < min_distance:
-                                    min_distance = distance
-                                    min_line = l
-                            if min_line:
-                                line_txt = "".join([s['text'] for s in min_line['spans']])
-                                img_h = imgbox[3] - imgbox[1]
-                                if min_distance<img_h: # 文字在图片前面
-                                    page_md = __insert_after(page_md, img_content, line_txt)
-                                else:
-                                    page_md = __insert_before(page_md, img_content, line_txt)
-                            else:
-                                logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
-                else:# 应当在两个block之间
-                    # 找到上方最近的block，如果上方没有就找大下方最近的block
-                    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
-                    if top_txt_block:
-                        line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
-                        page_md = __insert_after(page_md, img_content, line_txt)
-                    else:
-                        bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
-                        if bottom_txt_block:
-                            line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
-                            page_md = __insert_before(page_md, img_content, line_txt)
-                        else:
-                            logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
-                    
-        content_lst.append(page_md)
-                    
-    """拼装成全部页面的文本"""
-    content_text = "\n\n".join(content_lst)
-
-    return content_text
-
-
-def __insert_after_para(text, type, element, content_list):
-    """
-    在content_list中找到text，将image_path作为一个新的node插入到text后面
-    """
-    for i, c in enumerate(content_list):
-        content_type = c.get("type")
-        if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
-            if type == "image":
-                content_node = {
-                    "type": "image",
-                    "img_path": element.get("image_path"),
-                    "img_alt": "",
-                    "img_title": "",
-                    "img_caption": "",
-                }
-            elif type == "table":
-                content_node = {
-                    "type": "table",
-                    "img_path": element.get("image_path"),
-                    "table_latex": element.get("text"),
-                    "table_title": "",
-                    "table_caption": "",
-                    "table_quality": element.get("quality"),
-                }
-            content_list.insert(i+1, content_node)
-            break
-    else:
-        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
-    
-
-
-def __insert_before_para(text, type, element, content_list):
-    """
-    在content_list中找到text，将image_path作为一个新的node插入到text前面
-    """
-    for i, c in enumerate(content_list):
-        content_type = c.get("type")
-        if content_type in  UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
-            if type == "image":
-                content_node = {
-                    "type": "image",
-                    "img_path": element.get("image_path"),
-                    "img_alt": "",
-                    "img_title": "",
-                    "img_caption": "",
-                }
-            elif type == "table":
-                content_node = {
-                    "type": "table",
-                    "img_path": element.get("image_path"),
-                    "table_latex": element.get("text"),
-                    "table_title": "",
-                    "table_caption": "",
-                    "table_quality": element.get("quality"),
-                }
-            content_list.insert(i, content_node)
-            break
-    else:
-        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
-         
-
-def mk_universal_format(pdf_info_list: list, img_buket_path):
-    """
-    构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
-    """
-    content_lst = []
-    for page_info in pdf_info_list:
-        page_lst = [] # 一个page内的段落列表
-        para_blocks = page_info.get("para_blocks")
-        pymu_raw_blocks = page_info.get("preproc_blocks")
-        
-        all_page_images = []
-        all_page_images.extend(page_info.get("images",[]))
-        all_page_images.extend(page_info.get("image_backup", []) )
-        # all_page_images.extend(page_info.get("tables",[]))
-        # all_page_images.extend(page_info.get("table_backup",[]) )
-        all_page_tables = []
-        all_page_tables.extend(page_info.get("tables", []))
-
-        if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
-            for img in all_page_images:
-                content_node = {
-                    "type": "image",
-                    "img_path": join_path(img_buket_path, img['image_path']),
-                    "img_alt":"",
-                    "img_title":"",
-                    "img_caption":""
-                }
-                page_lst.append(content_node) # TODO 图片顺序
-            for table in all_page_tables:
-                content_node = {
-                    "type": "table",
-                    "img_path": join_path(img_buket_path, table['image_path']),
-                    "table_latex": table.get("text"),
-                    "table_title": "",
-                    "table_caption": "",
-                    "table_quality": table.get("quality"),
-                }
-                page_lst.append(content_node) # TODO 图片顺序
-        else:
-            for block in para_blocks:
-                item = block["paras"]
-                for _, p in item.items():
-                    font_type = p['para_font_type']# 对于文本来说，要么是普通文本，要么是个行间公式
-                    if font_type == TYPE_INTERLINE_EQUATION:
-                        content_node = {
-                            "type": "equation",
-                            "latex": p["para_text"]
-                        }
-                        page_lst.append(content_node)
-                    else:
-                        para_text = p["para_text"]
-                        is_title = p["is_para_title"]
-                        title_level = p['para_title_level']
-                        
-                        if is_title:
-                            content_node = {
-                                "type": f"h{title_level}",
-                                "text": para_text
-                            }
-                            page_lst.append(content_node)
-                        else:
-                            content_node = {
-                                "type": "text",
-                                "text": para_text
-                            }
-                            page_lst.append(content_node)
-                            
-        content_lst.extend(page_lst)
-        
-        """插入图片"""
-        for img in all_page_images:
-            insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
-
-        """插入表格"""
-        for table in all_page_tables:
-            insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
-    # end for
-    return content_lst
-
-
-def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
-    element_bbox = element['bbox']
-    # 先看在哪个block内
-    for block in pymu_raw_blocks:
-        bbox = block['bbox']
-        if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
-            3] + 1:  # 确定在这个大的block内，然后进入逐行比较距离
-            for l in block['lines']:
-                line_box = l['bbox']
-                if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
-                    3] + 1:  # 在line内的，插入line前面
-                    line_txt = "".join([s['text'] for s in l['spans']])
-                    __insert_before_para(line_txt, type, element, content_lst)
-                    break
-                break
-            else:  # 在行与行之间
-                # 找到图片x0,y0与line的x0,y0最近的line
-                min_distance = 100000
-                min_line = None
-                for l in block['lines']:
-                    line_box = l['bbox']
-                    distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
-                    if distance < min_distance:
-                        min_distance = distance
-                        min_line = l
-                if min_line:
-                    line_txt = "".join([s['text'] for s in min_line['spans']])
-                    img_h = element_bbox[3] - element_bbox[1]
-                    if min_distance < img_h:  # 文字在图片前面
-                        __insert_after_para(line_txt, type, element, content_lst)
-                    else:
-                        __insert_before_para(line_txt, type, element, content_lst)
-                    break
-                else:
-                    logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
-    else:  # 应当在两个block之间
-        # 找到上方最近的block，如果上方没有就找大下方最近的block
-        top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
-        if top_txt_block:
-            line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
-            __insert_after_para(line_txt, type, element, content_lst)
-        else:
-            bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
-            if bottom_txt_block:
-                line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
-                __insert_before_para(line_txt, type, element, content_lst)
-            else:  # TODO ，图片可能独占一列，这种情况上下是没有图片的
-                logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
-
-
-def mk_mm_markdown(content_list):
-    """
-    基于同一格式的内容列表，构造markdown，含图片
-    """
-    content_md = []
-    for c in content_list:
-        content_type = c.get("type")
-        if content_type == "text":
-            content_md.append(c.get("text"))
-        elif content_type == "equation":
-            content = c.get("latex")
-            if content.startswith("$$") and content.endswith("$$"):
-                content_md.append(content)
-            else:
-                content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
-        elif content_type in UNI_FORMAT_TEXT_TYPE:
-            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
-        elif content_type == "image":
-            content_md.append(f"![]({c.get('img_path')})")
-    return "\n\n".join(content_md)
-
-def mk_nlp_markdown(content_list):
-    """
-    基于同一格式的内容列表，构造markdown，不含图片
-    """
-    content_md = []
-    for c in content_list:
-        content_type = c.get("type")
-        if content_type == "text":
-            content_md.append(c.get("text"))
-        elif content_type == "equation":
-            content_md.append(f"$$\n{c.get('latex')}\n$$")
-        elif content_type == "table":
-            content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
-        elif content_type in UNI_FORMAT_TEXT_TYPE:
-            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
-    return "\n\n".join(content_md)
\ No newline at end of file
--- a/magic_pdf/dict2md/ocr_client.py
+++ b/magic_pdf/dict2md/ocr_client.py
-import configparser
-import os
-import json
-import requests
-from loguru import logger
-import argparse
-import time
-from PIL import Image
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--config_path',
-        default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
-    )
-    parser.add_argument(
-        '--image_path',
-        default='/home/wanglch/projects/Qwen2-VL/20240920-163701.png',
-    )
-    parser.add_argument(
-        '--text',
-        default="描述你在图片中看到的内容",
-    )
-    args = parser.parse_args()
-    return args
-
-
-def parse_text(text):
-    lines = text.split("\n")
-    lines = [line for line in lines if line.strip() != ""]  # 去除空行
-    count = 0
-    parsed_lines = []
-
-    for i, line in enumerate(lines):
-        if "```" in line:
-            count += 1
-            items = line.split("`")
-            if count % 2 == 1:
-                # 开始代码块
-                parsed_lines.append(f'<pre><code class="language-{items[-1]}">')
-            else:
-                # 结束代码块
-                parsed_lines.append(f"</code></pre>")
-        else:
-            if i > 0 and count % 2 == 1:
-                # 转义代码块内的特殊字符
-                line = line.replace("`", r"\`")
-                line = line.replace("<", "&lt;")
-                line = line.replace(">", "&gt;")
-                line = line.replace(" ", "&nbsp;")
-                line = line.replace("*", "&ast;")
-                line = line.replace("_", "&lowbar;")
-                line = line.replace("-", "&#45;")
-                line = line.replace(".", "&#46;")
-                line = line.replace("!", "&#33;")
-                line = line.replace("(", "&#40;")
-                line = line.replace(")", "&#41;")
-                line = line.replace("$", "&#36;")
-            # 使用空格连接行
-            if parsed_lines:
-                parsed_lines[-1] += " " + line
-            else:
-                parsed_lines.append(line)
-
-    text = "".join(parsed_lines)
-    return text
-
-
-def unparse_text(parsed_text):
-    in_code_block = False
-    lines = parsed_text.split("\n")
-    unparsed_lines = []
-
-    for line in lines:
-        if "<pre><code" in line:
-            in_code_block = True
-            # 移除开始标签
-            line = line.split(">", 1)[1]
-        elif "</code></pre>" in line:
-            in_code_block = False
-            # 移除结束标签
-            line = line.rsplit("<", 1)[0]
-
-        # 反转 HTML 实体
-        line = line.replace("&lt;", "<")
-        line = line.replace("&gt;", ">")
-        line = line.replace("&nbsp;", " ")
-        line = line.replace("&ast;", "*")
-        line = line.replace("&lowbar;", "_")
-        line = line.replace("&#45;", "-")
-        line = line.replace("&#46;", ".")
-        line = line.replace("&#33;", "!")
-        line = line.replace("&#40;", "(")
-        line = line.replace("&#41;", ")")
-        line = line.replace("&#36;", "$")
-
-        # 如果在代码块内，还原反斜杠转义
-        if in_code_block:
-            line = line.replace(r"\`", "`")
-
-        unparsed_lines.append(line)
-
-    # 合并所有行
-    unparsed_text = "\n".join(unparsed_lines)
-    return unparsed_text
-
-
-def compress_image(image_path, max_size=(1024, 1024)):
-    img = Image.open(image_path)
-    width, height = img.size
-    aspect_ratio = width / height
-
-    if width > max_size[0] or height > max_size[1]:
-        if width > height:
-            new_width = max_size[0]
-            new_height = int(new_width / aspect_ratio)
-        else:
-            new_height = max_size[1]
-            new_width = int(new_height * aspect_ratio)
-
-        img = img.resize((new_width, new_height), Image.LANCZOS)
-        img.save(image_path, optimize=True, quality=80)
-
-
-class PredictClient:
-    def __init__(self, api_url):
-        self.api_url = api_url
-
-    def check_health(self):
-        health_check_url = f'{self.api_url}/health'
-        try:
-            response = requests.get(health_check_url)
-            if response.status_code == 200:
-                logger.info("Server is healthy and ready to process requests.")
-                return True
-            else:
-                logger.error(f'Server health check failed with status code:{response.status_code}')
-                return False
-        except requests.exceptions.RequestException as e:
-            logger.error(f'Health check request failed:{e}')
-            return False
-
-
-    def predict(self, image_path: str, text: str):
-        payload = {
-            "image_path": image_path,
-            "text": text
-        }
-        headers = {'Content-Type': 'application/json'}
-        response = requests.post(f"{self.api_url}/predict", json=payload, headers=headers)
-
-        if response.status_code == 200:
-            result = response.json()
-            return result.get('Generated Text', '')
-        else:
-            raise Exception(f"Predict API request failed with status code {response.status_code}")
-
-
-def main():
-    args = parse_args()
-
-    config = configparser.ConfigParser()
-    config.read(args.config_path)
-    ocr_server = config.get('server', 'ocr_server')
-    client = PredictClient(ocr_server)
-    try:
-        start_time = time.time()  # 记录开始时间
-        # 压缩图片
-        #compress_image(args.image_path)
-
-        generated_text = client.predict(args.image_path, parse_text(args.text))
-        end_time = time.time()  # 记录结束时间
-        elapsed_time = end_time - start_time  # 计算运行时间
-
-        if generated_text:
-            clean_text = unparse_text(generated_text)  # 解析生成的文本
-            logger.info(f"Image Path: {args.image_path}")
-            logger.info(f"Generated Text: {clean_text}")
-            logger.info(f"耗时为: {elapsed_time}秒")  # 打印运行时间
-        else:
-            logger.warning("Received empty generated text.")
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error while making request to predict service: {e}")
-    except Exception as e:
-        logger.error(f"Unexpected error occurred: {e}")
-
-
-if __name__ == "__main__":
-    main()
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
--- a/magic_pdf/dict2md/ocr_server.py
+++ b/magic_pdf/dict2md/ocr_server.py
-# Copyright (c) Alibaba Cloud.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree
-import configparser
-import copy
-import re
-import gc
-import time
-
-import torch
-from argparse import ArgumentParser
-from threading import Thread
-from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer
-
-from fastapi import FastAPI
-from pydantic import BaseModel
-from typing import Optional
-from loguru import logger
-
-app = FastAPI()
-
-DEFAULT_CKPT_PATH = '/home/practice/model/Qwen2-VL-7B-Instruct'
-REVISION = 'v1.0.4'
-BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
-PUNCTUATION = "！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
-
-logger.add("parse.log", rotation="10 MB", level="INFO",
-           format="{time} {level} {message}", encoding='utf-8', enqueue=True)
-
-def _get_args():
-    parser = ArgumentParser()
-
-    parser.add_argument('-c', '--checkpoint_path', type=str, default=DEFAULT_CKPT_PATH,
-                        help='Checkpoint name or path, default to %(default)r')
-    parser.add_argument('--cpu_only', action='store_true', help='Run demo with CPU only')
-    parser.add_argument('--flash_attn2', action='store_true', default=False,
-                        help='Enable flash_attention_2 when loading the model.')
-    parser.add_argument('--share', action='store_true', default=False,
-                        help='Create a publicly shareable link for the interface.')
-    parser.add_argument('--inbrowser', action='store_true', default=False,
-                        help='Automatically launch the interface in a new tab on the default browser.')
-    parser.add_argument('--dcu_id', type=str, default='0', help='Specify the GPU ID to load the model onto.')
-    parser.add_argument(
-        '--config_path',
-        default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
-        )
-    args = parser.parse_args()
-    return args
-
-
-def _load_model_processor(args):
-    if args.cpu_only:
-        device_map = 'cpu'
-    else:
-        if args.dcu_id is not None:
-            device_map = {'': f'cuda:{args.dcu_id}'}
-            print('使用DCU推理:', f'cuda:{args.dcu_id}')
-        else:
-            device_map = 'auto'
-
-    if args.flash_attn2:
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            args.checkpoint_path,
-            torch_dtype=torch.float16,
-            attn_implementation='flash_attention_2',
-            device_map=device_map
-        )
-    else:
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            args.checkpoint_path,
-            torch_dtype=torch.float16,
-            device_map=device_map
-        )
-
-    processor = AutoProcessor.from_pretrained(args.checkpoint_path)
-    return model, processor
-
-
-def _parse_text(text):
-    lines = text.split("\n")
-    lines = [line for line in lines if line.strip() != ""]  # 去除空行
-    count = 0
-    parsed_lines = []
-
-    for i, line in enumerate(lines):
-        if "```" in line:
-            count += 1
-            items = line.split("`")
-            if count % 2 == 1:
-                # 开始代码块
-                parsed_lines.append(f'<pre><code class="language-{items[-1]}">')
-            else:
-                # 结束代码块
-                parsed_lines.append(f"</code></pre>")
-        else:
-            if i > 0 and count % 2 == 1:
-                # 转义代码块内的特殊字符
-                line = line.replace("`", r"\`")
-                line = line.replace("<", "&lt;")
-                line = line.replace(">", "&gt;")
-                line = line.replace(" ", "&nbsp;")
-                line = line.replace("*", "&ast;")
-                line = line.replace("_", "&lowbar;")
-                line = line.replace("-", "&#45;")
-                line = line.replace(".", "&#46;")
-                line = line.replace("!", "&#33;")
-                line = line.replace("(", "&#40;")
-                line = line.replace(")", "&#41;")
-                line = line.replace("$", "&#36;")
-            # 使用空格连接行
-            if parsed_lines:
-                parsed_lines[-1] += " " + line
-            else:
-                parsed_lines.append(line)
-
-    text = "".join(parsed_lines)
-    return text
-
-
-def _remove_image_special(text):
-    text = text.replace('<ref>', '').replace('</ref>', '')
-    return re.sub(r'<box>.*?(</box>|$)', '', text)
-
-
-def _is_video_file(filename):
-    video_extensions = ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg']
-    return any(filename.lower().endswith(ext) for ext in video_extensions)
-
-
-def _transform_messages(original_messages):
-    transformed_messages = []
-    for message in original_messages:
-        new_content = []
-        for item in message['content']:
-            if 'image' in item:
-                new_item = {'type': 'image', 'image': item['image']}
-            elif 'text' in item:
-                new_item = {'type': 'text', 'text': item['text']}
-            elif 'video' in item:
-                new_item = {'type': 'video', 'video': item['video']}
-            else:
-                continue
-            new_content.append(new_item)
-
-        new_message = {'role': message['role'], 'content': new_content}
-        transformed_messages.append(new_message)
-
-    return transformed_messages
-
-
-def _gc():
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-
-
-def call_local_model(model, processor, messages):
-    messages = _transform_messages(messages)
-
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt')
-    inputs = inputs.to(model.device)
-
-    tokenizer = processor.tokenizer
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-
-    gen_kwargs = {'max_new_tokens': 512, 'streamer': streamer, **inputs}
-
-    thread = Thread(target=model.generate, kwargs=gen_kwargs)
-    thread.start()
-
-    generated_text = ''
-    for new_text in streamer:
-        generated_text += new_text
-        yield _parse_text(generated_text)
-
-
-def create_predict_fn(model, processor):
-    def predict(_chatbot, task_history):
-        chat_query = _chatbot[-1][0]
-        query = task_history[-1][0]
-        if len(chat_query) == 0:
-            _chatbot.pop()
-            task_history.pop()
-            return _chatbot
-        print('User: ' + _parse_text(query))
-        history_cp = copy.deepcopy(task_history)
-        full_response = ''
-        messages = []
-        content = []
-        for q, a in history_cp:
-            if isinstance(q, (tuple, list)):
-                if _is_video_file(q[0]):
-                    content.append({'video': f'file://{q[0]}'})
-                else:
-                    content.append({'image': f'file://{q[0]}'})
-            else:
-                content.append({'text': q})
-                messages.append({'role': 'user', 'content': content})
-                messages.append({'role': 'assistant', 'content': [{'text': a}]})
-                content = []
-        messages.pop()
-
-        for response in call_local_model(model, processor, messages):
-            _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
-
-            yield _chatbot
-            full_response = _parse_text(response)
-
-        task_history[-1] = (query, full_response)
-        print('Qwen-VL-Chat: ' + _parse_text(full_response))
-        yield _chatbot
-
-    return predict
-
-
-# 启用加载模型
-args = _get_args()
-model, processor = _load_model_processor(args)
-
-
-class Item(BaseModel):
-    image_path: str
-    text: str
-
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
-
-
-@app.post("/predict")
-async def predict(item: Item):
-    messages = [
-        {
-            'role': 'user',
-            'content': [
-                {'image': item.image_path},
-                {'text': item.text}
-            ]
-        }
-    ]
-    start = time.time()
-    generated_text = ''
-    for response in call_local_model(model, processor, messages):
-        generated_text = _parse_text(response)
-
-    _gc()
-    end = time.time()
-    logger.info(f'【{item.image_path}】解析的结果是：{generated_text},耗时为：{end-start}')
-    return {"Generated Text": generated_text}
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    args = _get_args()
-    config = configparser.ConfigParser()
-    config.read(args.config_path)
-    # host = config.get('server', 'ocr_host')
-    host, port = config.get('server', 'ocr_server').split('://')[1].split(':')[0], int(
-        config.get('server', 'ocr_server').split('://')[1].split(':')[1])
-    # port = int(config.get('server', 'ocr_port'))
-    uvicorn.run(app, host=host, port=port)
-
-
-
--- a/magic_pdf/filter/__init__.py
+++ b/magic_pdf/filter/__init__.py
--- a/magic_pdf/filter/pdf_classify_by_type.py
+++ b/magic_pdf/filter/pdf_classify_by_type.py
--- a/magic_pdf/filter/pdf_meta_scan.py
+++ b/magic_pdf/filter/pdf_meta_scan.py
-"""
-输入： s3路径，每行一个
-输出： pdf文件元信息，包括每一页上的所有图片的长宽高，bbox位置
-"""
-import sys
-import click
-
-from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
-from magic_pdf.libs.commons import fitz
-from loguru import logger
-from collections import Counter
-
-from magic_pdf.libs.drop_reason import DropReason
-from magic_pdf.libs.language import detect_lang
-from magic_pdf.libs.pdf_check import detect_invalid_chars
-
-scan_max_page = 50
-junk_limit_min = 10
-
-
-def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
-    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
-                               result]
-    page_area = int(page_width_pts) * int(page_height_pts)
-    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
-    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
-    return max_image_area_per_page
-
-
-def process_image(page, junk_img_bojids=[]):
-    page_result = []  # 存每个页面里的多张图四元组信息
-    items = page.get_images()
-    dedup = set()
-    for img in items:
-        # 这里返回的是图片在page上的实际展示的大小。返回一个数组，每个元素第一部分是
-        img_bojid = img[0]  # 在pdf文件中是全局唯一的，如果这个图反复出现在pdf里那么就可能是垃圾信息，例如水印、页眉页脚等
-        if img_bojid in junk_img_bojids:  # 如果是垃圾图像，就跳过
-            continue
-        recs = page.get_image_rects(img, transform=True)
-        if recs:
-            rec = recs[0][0]
-            x0, y0, x1, y1 = map(int, rec)
-            width = x1 - x0
-            height = y1 - y0
-            if (x0, y0, x1, y1, img_bojid) in dedup:  # 这里面会出现一些重复的bbox，无需重复出现，需要去掉
-                continue
-            if not all([width, height]):  # 长和宽任何一个都不能是0，否则这个图片不可见，没有实际意义
-                continue
-            dedup.add((x0, y0, x1, y1, img_bojid))
-            page_result.append([x0, y0, x1, y1, img_bojid])
-    return page_result
-
-
-def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
-    """
-    返回每个页面里的图片的四元组，每个页面多个图片。
-    :param doc:
-    :return:
-    """
-    # 使用 Counter 计数 img_bojid 的出现次数
-    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
-    # 找出出现次数超过 len(doc) 半数的 img_bojid
-
-    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # 对一些页数比较少的进行豁免
-
-    junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
-
-    #todo 加个判断，用前十页就行，这些垃圾图片需要满足两个条件，不止出现的次数要足够多，而且图片占书页面积的比例要足够大，且图与图大小都差不多
-    #有两种扫描版，一种文字版，这里可能会有误判
-    #扫描版1：每页都有所有扫描页图片，特点是图占比大，每页展示1张
-    #扫描版2，每页存储的扫描页图片数量递增，特点是图占比大，每页展示1张，需要清空junklist跑前50页图片信息用于分类判断
-    #文字版1.每页存储所有图片，特点是图片占页面比例不大，每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数，如果符合需要清空junklist
-    imgs_len_list = [len(page.get_images()) for page in doc]
-
-    special_limit_pages = 10
-
-    # 统一用前十页结果做判断
-    result = []
-    break_loop = False
-    for i, page in enumerate(doc):
-        if break_loop:
-            break
-        if i >= special_limit_pages:
-            break
-        page_result = process_image(page)  # 这里不传junk_img_bojids，拿前十页所有图片信息用于后续分析
-        result.append(page_result)
-        for item in result:
-            if not any(item):  # 如果任何一页没有图片，说明是个文字版，需要判断是否为特殊文字版
-                if max(imgs_len_list) == min(imgs_len_list) and max(
-                        imgs_len_list) >= junk_limit_min:  # 如果是特殊文字版，就把junklist置空并break
-                    junk_img_bojids = []
-                else:  # 不是特殊文字版，是个普通文字版，但是存在垃圾图片，不置空junklist
-                    pass
-                break_loop = True
-                break
-    if not break_loop:
-        # 获取前80%的元素
-        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
-        # 检查前80%的元素是否都相等
-        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
-
-            # # 如果前10页跑完都有图，根据每页图片数量是否相等判断是否需要清除junklist
-            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
-
-            #前10页都有图，且每页数量一致，需要检测图片大小占页面的比例判断是否需要清除junklist
-            max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
-            if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # 前10页不全是大图，说明可能是个文字版pdf，把垃圾图片list置空
-                junk_img_bojids = []
-            else:  # 前10页都有图，而且80%都是大图，且每页图片数量一致并都很多，说明是扫描版1，不需要清空junklist
-                pass
-        else:  # 每页图片数量不一致，需要清掉junklist全量跑前50页图片
-            junk_img_bojids = []
-
-    #正式进入取前50页图片的信息流程
-    result = []
-    for i, page in enumerate(doc):
-        if i >= scan_max_page:
-            break
-        page_result = process_image(page, junk_img_bojids)
-        # logger.info(f"page {i} img_len: {len(page_result)}")
-        result.append(page_result)
-
-    return result, junk_img_bojids
-
-
-def get_pdf_page_size_pts(doc: fitz.Document):
-    page_cnt = len(doc)
-    l: int = min(page_cnt, 50)
-    #把所有宽度和高度塞到两个list 分别取中位数（中间遇到了个在纵页里塞横页的pdf，导致宽高互换了）
-    page_width_list = []
-    page_height_list = []
-    for i in range(l):
-        page = doc[i]
-        page_rect = page.rect
-        page_width_list.append(page_rect.width)
-        page_height_list.append(page_rect.height)
-
-    page_width_list.sort()
-    page_height_list.sort()
-
-    median_width = page_width_list[len(page_width_list) // 2]
-    median_height = page_height_list[len(page_height_list) // 2]
-
-    return median_width, median_height
-
-
-def get_pdf_textlen_per_page(doc: fitz.Document):
-    text_len_lst = []
-    for page in doc:
-        # 拿包含img和text的所有blocks
-        # text_block = page.get_text("blocks")
-        # 拿所有text的blocks
-        # text_block = page.get_text("words")
-        # text_block_len = sum([len(t[4]) for t in text_block])
-        #拿所有text的str
-        text_block = page.get_text("text")
-        text_block_len = len(text_block)
-        # logger.info(f"page {page.number} text_block_len: {text_block_len}")
-        text_len_lst.append(text_block_len)
-
-    return text_len_lst
-
-
-def get_pdf_text_layout_per_page(doc: fitz.Document):
-    """
-    根据PDF文档的每一页文本布局，判断该页的文本布局是横向、纵向还是未知。
-
-    Args:
-        doc (fitz.Document): PDF文档对象。
-
-    Returns:
-        List[str]: 每一页的文本布局（横向、纵向、未知）。
-
-    """
-    text_layout_list = []
-
-    for page_id, page in enumerate(doc):
-        if page_id >= scan_max_page:
-            break
-        # 创建每一页的纵向和横向的文本行数计数器
-        vertical_count = 0
-        horizontal_count = 0
-        text_dict = page.get_text("dict")
-        if "blocks" in text_dict:
-            for block in text_dict["blocks"]:
-                if 'lines' in block:
-                    for line in block["lines"]:
-                        # 获取line的bbox顶点坐标
-                        x0, y0, x1, y1 = line['bbox']
-                        # 计算bbox的宽高
-                        width = x1 - x0
-                        height = y1 - y0
-                        # 计算bbox的面积
-                        area = width * height
-                        font_sizes = []
-                        for span in line['spans']:
-                            if 'size' in span:
-                                font_sizes.append(span['size'])
-                        if len(font_sizes) > 0:
-                            average_font_size = sum(font_sizes) / len(font_sizes)
-                        else:
-                            average_font_size = 10  # 有的line拿不到font_size，先定一个阈值100
-                        if area <= average_font_size ** 2:  # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
-                            continue
-                        else:
-                            if 'wmode' in line:  # 通过wmode判断文本方向
-                                if line['wmode'] == 1:  # 判断是否为竖向文本
-                                    vertical_count += 1
-                                elif line['wmode'] == 0:  # 判断是否为横向文本
-                                    horizontal_count += 1
-                        #     if 'dir' in line:  # 通过旋转角度计算判断文本方向
-                        #         # 获取行的 "dir" 值
-                        #         dir_value = line['dir']
-                        #         cosine, sine = dir_value
-                        #         # 计算角度
-                        #         angle = math.degrees(math.acos(cosine))
-                        #
-                        #         # 判断是否为横向文本
-                        #         if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01:
-                        #             # line_text = ' '.join(span['text'] for span in line['spans'])
-                        #             # print('This line is horizontal:', line_text)
-                        #             horizontal_count += 1
-                        #         # 判断是否为纵向文本
-                        #         elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01:
-                        #             # line_text = ' '.join(span['text'] for span in line['spans'])
-                        #             # print('This line is vertical:', line_text)
-                        #             vertical_count += 1
-        # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
-        # 判断每一页的文本布局
-        if vertical_count == 0 and horizontal_count == 0:  # 该页没有文本，无法判断
-            text_layout_list.append("unknow")
-            continue
-        else:
-            if vertical_count > horizontal_count:  # 该页的文本纵向行数大于横向的
-                text_layout_list.append("vertical")
-            else:  # 该页的文本横向行数大于纵向的
-                text_layout_list.append("horizontal")
-        # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
-    return text_layout_list
-
-
-'''定义一个自定义异常用来抛出单页svg太多的pdf'''
-
-
-class PageSvgsTooManyError(Exception):
-    def __init__(self, message="Page SVGs are too many"):
-        self.message = message
-        super().__init__(self.message)
-
-
-def get_svgs_per_page(doc: fitz.Document):
-    svgs_len_list = []
-    for page_id, page in enumerate(doc):
-        # svgs = page.get_drawings()
-        svgs = page.get_cdrawings()  # 切换成get_cdrawings，效率更高
-        len_svgs = len(svgs)
-        if len_svgs >= 3000:
-            raise PageSvgsTooManyError()
-        else:
-            svgs_len_list.append(len_svgs)
-        # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
-    return svgs_len_list
-
-
-def get_imgs_per_page(doc: fitz.Document):
-    imgs_len_list = []
-    for page_id, page in enumerate(doc):
-        imgs = page.get_images()
-        imgs_len_list.append(len(imgs))
-        # logger.info(f"page_id: {page}, imgs_len: {len(imgs)}")
-
-    return imgs_len_list
-
-
-def get_language(doc: fitz.Document):
-    """
-    获取PDF文档的语言。
-    Args:
-        doc (fitz.Document): PDF文档对象。
-    Returns:
-        str: 文档语言，如 "en-US"。
-    """
-    language_lst = []
-    for page_id, page in enumerate(doc):
-        if page_id >= scan_max_page:
-            break
-        # 拿所有text的str
-        text_block = page.get_text("text")
-        page_language = detect_lang(text_block)
-        language_lst.append(page_language)
-
-        # logger.info(f"page_id: {page_id}, page_language: {page_language}")
-
-    # 统计text_language_list中每种语言的个数
-    count_dict = Counter(language_lst)
-    # 输出text_language_list中出现的次数最多的语言
-    language = max(count_dict, key=count_dict.get)
-    return language
-
-
-def check_invalid_chars(pdf_bytes):
-    """
-    乱码检测
-    """
-    return detect_invalid_chars(pdf_bytes)
-
-
-def pdf_meta_scan(pdf_bytes: bytes):
-    """
-    :param s3_pdf_path:
-    :param pdf_bytes: pdf文件的二进制数据
-    几个维度来评价：是否加密，是否需要密码，纸张大小，总页数，是否文字可提取
-    """
-    doc = fitz.open("pdf", pdf_bytes)
-    is_needs_password = doc.needs_pass
-    is_encrypted = doc.is_encrypted
-    total_page = len(doc)
-    if total_page == 0:
-        logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
-        result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
-        return result
-    else:
-        page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
-        # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
-
-        # svgs_per_page = get_svgs_per_page(doc)
-        # logger.info(f"svgs_per_page: {svgs_per_page}")
-        imgs_per_page = get_imgs_per_page(doc)
-        # logger.info(f"imgs_per_page: {imgs_per_page}")
-
-        image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
-        # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
-        text_len_per_page = get_pdf_textlen_per_page(doc)
-        # logger.info(f"text_len_per_page: {text_len_per_page}")
-        text_layout_per_page = get_pdf_text_layout_per_page(doc)
-        # logger.info(f"text_layout_per_page: {text_layout_per_page}")
-        text_language = get_language(doc)
-        # logger.info(f"text_language: {text_language}")
-        invalid_chars = check_invalid_chars(pdf_bytes)
-        # logger.info(f"invalid_chars: {invalid_chars}")
-
-        # 最后输出一条json
-        res = {
-            "is_needs_password": is_needs_password,
-            "is_encrypted": is_encrypted,
-            "total_page": total_page,
-            "page_width_pts": int(page_width_pts),
-            "page_height_pts": int(page_height_pts),
-            "image_info_per_page": image_info_per_page,
-            "text_len_per_page": text_len_per_page,
-            "text_layout_per_page": text_layout_per_page,
-            "text_language": text_language,
-            # "svgs_per_page": svgs_per_page,
-            "imgs_per_page": imgs_per_page,  # 增加每页img数量list
-            "junk_img_bojids": junk_img_bojids,  # 增加垃圾图片的bojid list
-            "invalid_chars": invalid_chars,
-            "metadata": doc.metadata
-        }
-        # logger.info(json.dumps(res, ensure_ascii=False))
-        return res
-
-
-@click.command()
-@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
-@click.option('--s3-profile', help='s3上的profile')
-def main(s3_pdf_path: str, s3_profile: str):
-    """
-
-    """
-    try:
-        file_content = read_file(s3_pdf_path, s3_profile)
-        pdf_meta_scan(file_content)
-    except Exception as e:
-        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
-        logger.exception(e)
-
-
-if __name__ == '__main__':
-    main()
-    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
-    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
-    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
-    # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
-    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
-    # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
-    # doc = fitz.open("pdf", file_content)
-    # text_layout_lst = get_pdf_text_layout_per_page(doc)
-    # print(text_layout_lst)
--- a/magic_pdf/integrations/__init__.py
+++ b/magic_pdf/integrations/__init__.py
--- a/magic_pdf/integrations/rag/__init__.py
+++ b/magic_pdf/integrations/rag/__init__.py