Commit 826086d2 authored by zhougaofeng

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, 
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
parent 57aaa1cf
magic_pdf/config.ini
[server]
pdf_server = http://0.0.0.0:4090
ocr_server = http://0.0.0.0:4080
magic_pdf/dict2md/mkcontent.py
import math
from loguru import logger
from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
@DeprecationWarning
def mk_nlp_markdown_1(para_dict: dict):
"""
对排序后的bboxes拼接内容
"""
content_lst = []
for _, page_info in para_dict.items():
para_blocks = page_info.get("para_blocks")
if not para_blocks:
continue
for block in para_blocks:
item = block["paras"]
for _, p in item.items():
para_text = p["para_text"]
is_title = p["is_para_title"]
title_level = p['para_title_level']
md_title_prefix = "#"*title_level
if is_title:
content_lst.append(f"{md_title_prefix} {para_text}")
else:
content_lst.append(para_text)
content_text = "\n\n".join(content_lst)
return content_text
# Find the index of the target string within the paragraph
def __find_index(paragraph, target):
index = paragraph.find(target)
if index != -1:
return index
else:
return None
def __insert_string(paragraph, target, position):
    new_paragraph = paragraph[:position] + target + paragraph[position:]
return new_paragraph
def __insert_after(content, image_content, target):
"""
在content中找到target,将image_content插入到target后面
"""
index = content.find(target)
if index != -1:
content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
else:
logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
return content
def __insert_before(content, image_content, target):
"""
在content中找到target,将image_content插入到target前面
"""
index = content.find(target)
if index != -1:
content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
else:
logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
return content
@DeprecationWarning
def mk_mm_markdown_1(para_dict: dict):
"""拼装多模态markdown"""
content_lst = []
for _, page_info in para_dict.items():
        page_lst = []  # list of paragraphs on one page
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
all_page_images = []
all_page_images.extend(page_info.get("images",[]))
all_page_images.extend(page_info.get("image_backup", []) )
all_page_images.extend(page_info.get("tables",[]))
all_page_images.extend(page_info.get("table_backup",[]) )
        if not para_blocks or not pymu_raw_blocks:  # page consists of images only
for img in all_page_images:
page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
page_md = "\n\n".join(page_lst)
else:
for block in para_blocks:
item = block["paras"]
for _, p in item.items():
para_text = p["para_text"]
is_title = p["is_para_title"]
title_level = p['para_title_level']
md_title_prefix = "#"*title_level
if is_title:
page_lst.append(f"{md_title_prefix} {para_text}")
else:
page_lst.append(para_text)
"""拼装成一个页面的文本"""
page_md = "\n\n".join(page_lst)
"""插入图片"""
for img in all_page_images:
imgbox = img['bbox']
img_content = f"![]({img['image_path']})"
                # First, determine which block the image falls into
for block in pymu_raw_blocks:
bbox = block['bbox']
                    if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:  # confirmed to be inside this block
for l in block['lines']:
line_box = l['bbox']
                            if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1:  # inside this line: insert before the line
line_txt = "".join([s['text'] for s in l['spans']])
page_md = __insert_before(page_md, img_content, line_txt)
break
break
                    else:  # the image sits between lines
                        # find the line whose (x0, y0) is closest to the image's (x0, y0)
min_distance = 100000
min_line = None
for l in block['lines']:
line_box = l['bbox']
distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
if distance < min_distance:
min_distance = distance
min_line = l
if min_line:
line_txt = "".join([s['text'] for s in min_line['spans']])
img_h = imgbox[3] - imgbox[1]
                            if min_distance < img_h:  # the text comes before the image
page_md = __insert_after(page_md, img_content, line_txt)
else:
page_md = __insert_before(page_md, img_content, line_txt)
else:
logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
                else:  # the image should sit between two blocks
                    # find the nearest block above; if there is none, fall back to the nearest block below
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
if top_txt_block:
line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
page_md = __insert_after(page_md, img_content, line_txt)
else:
bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
if bottom_txt_block:
line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
page_md = __insert_before(page_md, img_content, line_txt)
else:
logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
content_lst.append(page_md)
"""拼装成全部页面的文本"""
content_text = "\n\n".join(content_lst)
return content_text
def __insert_after_para(text, type, element, content_list):
"""
在content_list中找到text,将image_path作为一个新的node插入到text后面
"""
for i, c in enumerate(content_list):
content_type = c.get("type")
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
if type == "image":
content_node = {
"type": "image",
"img_path": element.get("image_path"),
"img_alt": "",
"img_title": "",
"img_caption": "",
}
elif type == "table":
content_node = {
"type": "table",
"img_path": element.get("image_path"),
"table_latex": element.get("text"),
"table_title": "",
"table_caption": "",
"table_quality": element.get("quality"),
}
content_list.insert(i+1, content_node)
break
else:
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def __insert_before_para(text, type, element, content_list):
"""
在content_list中找到text,将image_path作为一个新的node插入到text前面
"""
for i, c in enumerate(content_list):
content_type = c.get("type")
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
if type == "image":
content_node = {
"type": "image",
"img_path": element.get("image_path"),
"img_alt": "",
"img_title": "",
"img_caption": "",
}
elif type == "table":
content_node = {
"type": "table",
"img_path": element.get("image_path"),
"table_latex": element.get("text"),
"table_title": "",
"table_caption": "",
"table_quality": element.get("quality"),
}
content_list.insert(i, content_node)
break
else:
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def mk_universal_format(pdf_info_list: list, img_buket_path):
"""
    Build the unified content format. Spec: https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst = []
for page_info in pdf_info_list:
        page_lst = []  # list of paragraphs on one page
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
all_page_images = []
all_page_images.extend(page_info.get("images",[]))
all_page_images.extend(page_info.get("image_backup", []) )
# all_page_images.extend(page_info.get("tables",[]))
# all_page_images.extend(page_info.get("table_backup",[]) )
all_page_tables = []
all_page_tables.extend(page_info.get("tables", []))
        if not para_blocks or not pymu_raw_blocks:  # page consists of images only
for img in all_page_images:
content_node = {
"type": "image",
"img_path": join_path(img_buket_path, img['image_path']),
"img_alt":"",
"img_title":"",
"img_caption":""
}
                page_lst.append(content_node)  # TODO: image ordering
for table in all_page_tables:
content_node = {
"type": "table",
"img_path": join_path(img_buket_path, table['image_path']),
"table_latex": table.get("text"),
"table_title": "",
"table_caption": "",
"table_quality": table.get("quality"),
}
                page_lst.append(content_node)  # TODO: image ordering
else:
for block in para_blocks:
item = block["paras"]
for _, p in item.items():
                    font_type = p['para_font_type']  # text is either ordinary text or an interline equation
if font_type == TYPE_INTERLINE_EQUATION:
content_node = {
"type": "equation",
"latex": p["para_text"]
}
page_lst.append(content_node)
else:
para_text = p["para_text"]
is_title = p["is_para_title"]
title_level = p['para_title_level']
if is_title:
content_node = {
"type": f"h{title_level}",
"text": para_text
}
page_lst.append(content_node)
else:
content_node = {
"type": "text",
"text": para_text
}
page_lst.append(content_node)
content_lst.extend(page_lst)
"""插入图片"""
for img in all_page_images:
insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
"""插入表格"""
for table in all_page_tables:
insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
# end for
return content_lst
def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
element_bbox = element['bbox']
    # First, determine which block the element falls into
for block in pymu_raw_blocks:
bbox = block['bbox']
        if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
                3] + 1:  # confirmed inside this larger block; now compare distances against each line
for l in block['lines']:
line_box = l['bbox']
                if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
                        3] + 1:  # inside this line: insert before the line
line_txt = "".join([s['text'] for s in l['spans']])
__insert_before_para(line_txt, type, element, content_lst)
break
break
        else:  # the element sits between lines
            # find the line whose (x0, y0) is closest to the element's (x0, y0)
min_distance = 100000
min_line = None
for l in block['lines']:
line_box = l['bbox']
distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
if distance < min_distance:
min_distance = distance
min_line = l
if min_line:
line_txt = "".join([s['text'] for s in min_line['spans']])
img_h = element_bbox[3] - element_bbox[1]
                if min_distance < img_h:  # the text comes before the element
__insert_after_para(line_txt, type, element, content_lst)
else:
__insert_before_para(line_txt, type, element, content_lst)
break
else:
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
    else:  # the element should sit between two blocks
        # find the nearest block above; if there is none, fall back to the nearest block below
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
if top_txt_block:
line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
__insert_after_para(line_txt, type, element, content_lst)
else:
bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
if bottom_txt_block:
line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
__insert_before_para(line_txt, type, element, content_lst)
            else:  # TODO: the image may occupy a full column by itself, in which case there is no text above or below it
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
def mk_mm_markdown(content_list):
"""
    Build markdown, including images, from the unified-format content list.
"""
content_md = []
for c in content_list:
content_type = c.get("type")
if content_type == "text":
content_md.append(c.get("text"))
elif content_type == "equation":
content = c.get("latex")
if content.startswith("$$") and content.endswith("$$"):
content_md.append(content)
else:
content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
elif content_type in UNI_FORMAT_TEXT_TYPE:
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
elif content_type == "image":
content_md.append(f"![]({c.get('img_path')})")
return "\n\n".join(content_md)
def mk_nlp_markdown(content_list):
"""
    Build markdown, excluding images, from the unified-format content list.
"""
content_md = []
for c in content_list:
content_type = c.get("type")
if content_type == "text":
content_md.append(c.get("text"))
elif content_type == "equation":
content_md.append(f"$$\n{c.get('latex')}\n$$")
elif content_type == "table":
content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
elif content_type in UNI_FORMAT_TEXT_TYPE:
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
return "\n\n".join(content_md)
\ No newline at end of file
magic_pdf/dict2md/ocr_client.py
import configparser
import os
import json
import requests
from loguru import logger
import argparse
import time
from PIL import Image
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
)
parser.add_argument(
'--image_path',
default='/home/wanglch/projects/Qwen2-VL/20240920-163701.png',
)
parser.add_argument(
'--text',
default="描述你在图片中看到的内容",
)
args = parser.parse_args()
return args
def parse_text(text):
lines = text.split("\n")
    lines = [line for line in lines if line.strip() != ""]  # drop empty lines
count = 0
parsed_lines = []
for i, line in enumerate(lines):
if "```" in line:
count += 1
items = line.split("`")
if count % 2 == 1:
                # opening code block
parsed_lines.append(f'<pre><code class="language-{items[-1]}">')
else:
                # closing code block
parsed_lines.append(f"</code></pre>")
else:
if i > 0 and count % 2 == 1:
                # escape special characters inside the code block
line = line.replace("`", r"\`")
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
line = line.replace(" ", "&nbsp;")
line = line.replace("*", "&ast;")
line = line.replace("_", "&lowbar;")
line = line.replace("-", "&#45;")
line = line.replace(".", "&#46;")
line = line.replace("!", "&#33;")
line = line.replace("(", "&#40;")
line = line.replace(")", "&#41;")
line = line.replace("$", "&#36;")
            # join lines with spaces
if parsed_lines:
parsed_lines[-1] += " " + line
else:
parsed_lines.append(line)
text = "".join(parsed_lines)
return text
def unparse_text(parsed_text):
in_code_block = False
lines = parsed_text.split("\n")
unparsed_lines = []
for line in lines:
if "<pre><code" in line:
in_code_block = True
            # strip the opening tag
line = line.split(">", 1)[1]
elif "</code></pre>" in line:
in_code_block = False
            # strip the closing tag
line = line.rsplit("<", 1)[0]
        # revert HTML entities
line = line.replace("&lt;", "<")
line = line.replace("&gt;", ">")
line = line.replace("&nbsp;", " ")
line = line.replace("&ast;", "*")
line = line.replace("&lowbar;", "_")
line = line.replace("&#45;", "-")
line = line.replace("&#46;", ".")
line = line.replace("&#33;", "!")
line = line.replace("&#40;", "(")
line = line.replace("&#41;", ")")
line = line.replace("&#36;", "$")
        # inside a code block, undo the backslash escaping of backticks
if in_code_block:
line = line.replace(r"\`", "`")
unparsed_lines.append(line)
    # join all lines back together
unparsed_text = "\n".join(unparsed_lines)
return unparsed_text
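# A rough sketch of the intended round trip (illustrative only): parse_text turns
# fenced ``` blocks into <pre><code class="language-..."> markup and escapes the
# characters inside them; unparse_text strips those tags and reverts the HTML
# entities. Because parse_text joins ordinary (non-code) lines with spaces, the
# round trip is not exactly lossless for plain prose:
#
#     html = parse_text("```python\nprint('hi')\n```")
#     text = unparse_text(html)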
def compress_image(image_path, max_size=(1024, 1024)):
img = Image.open(image_path)
width, height = img.size
aspect_ratio = width / height
if width > max_size[0] or height > max_size[1]:
if width > height:
new_width = max_size[0]
new_height = int(new_width / aspect_ratio)
else:
new_height = max_size[1]
new_width = int(new_height * aspect_ratio)
img = img.resize((new_width, new_height), Image.LANCZOS)
img.save(image_path, optimize=True, quality=80)
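# Example of the resize rule (illustrative numbers): a 2048 x 1024 image exceeds the
# 1024 x 1024 limit; since width > height, it is scaled to 1024 x 512 (aspect ratio
# preserved) and re-saved in place with quality=80. Images already within the limit
# are re-saved without resizing.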
class PredictClient:
def __init__(self, api_url):
self.api_url = api_url
def check_health(self):
health_check_url = f'{self.api_url}/health'
try:
response = requests.get(health_check_url)
if response.status_code == 200:
logger.info("Server is healthy and ready to process requests.")
return True
else:
logger.error(f'Server health check failed with status code:{response.status_code}')
return False
except requests.exceptions.RequestException as e:
logger.error(f'Health check request failed:{e}')
return False
def predict(self, image_path: str, text: str):
payload = {
"image_path": image_path,
"text": text
}
headers = {'Content-Type': 'application/json'}
response = requests.post(f"{self.api_url}/predict", json=payload, headers=headers)
if response.status_code == 200:
result = response.json()
return result.get('Generated Text', '')
else:
raise Exception(f"Predict API request failed with status code {response.status_code}")
def main():
args = parse_args()
config = configparser.ConfigParser()
config.read(args.config_path)
ocr_server = config.get('server', 'ocr_server')
client = PredictClient(ocr_server)
try:
        start_time = time.time()  # record the start time
        # compress the image
        # compress_image(args.image_path)
generated_text = client.predict(args.image_path, parse_text(args.text))
        end_time = time.time()  # record the end time
        elapsed_time = end_time - start_time  # compute the elapsed time
if generated_text:
            clean_text = unparse_text(generated_text)  # un-escape the generated text
logger.info(f"Image Path: {args.image_path}")
logger.info(f"Generated Text: {clean_text}")
logger.info(f"耗时为: {elapsed_time}秒") # 打印运行时间
else:
logger.warning("Received empty generated text.")
except requests.exceptions.RequestException as e:
logger.error(f"Error while making request to predict service: {e}")
except Exception as e:
logger.error(f"Unexpected error occurred: {e}")
if __name__ == "__main__":
main()
magic_pdf/dict2md/ocr_mkcontent.py
import configparser
import re
import time
import wordninja
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# import pypandoc
# vllm:
#from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# plain, non-vLLM client
from magic_pdf.dict2md.ocr_client import PredictClient, compress_image
client = None
status = None
def __is_hyphen_at_line_end(line):
"""
Check if a line ends with one or more letters followed by a hyphen.
Args:
line (str): The line of text to check.
Returns:
bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
"""
# Use regex to check if the line ends with one or more letters followed by a hyphen
return bool(re.search(r'[A-Za-z]+-\s*$', line))
def split_long_words(text):
segments = text.split(' ')
for i in range(len(segments)):
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
for j in range(len(words)):
if len(words[j]) > 10:
words[j] = ' '.join(wordninja.split(words[j]))
segments[i] = ''.join(words)
return ' '.join(segments)
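# A minimal illustration (assuming wordninja's default English word list): any
# whitespace-separated token longer than 10 characters is re-segmented into
# dictionary words, so a glued token like "longwordsegmentation" would come back
# roughly as "long word segmentation", while shorter tokens are left untouched.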
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown = []
for page_info in pdf_info_list:
paras_of_layout = page_info.get('para_blocks')
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown = []
for page_info in pdf_info_dict:
paras_of_layout = page_info.get('para_blocks')
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp')
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
img_buket_path):
markdown_with_para_and_pagination = []
page_no = 0
for page_info in pdf_info_dict:
paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout:
continue
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
markdown_with_para_and_pagination.append({
'page_no':
page_no,
'md_content':
'\n\n'.join(page_markdown)
})
page_no += 1
return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
page_markdown = []
for paras in paras_of_layout:
for para in paras:
para_text = ''
for line in para:
for span in line['spans']:
span_type = span.get('type')
content = ''
language = ''
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
                        if language == 'en':  # only re-segment long English words; segmenting Chinese would lose text
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
#content = pypandoc.convert_text(content, to='plain', format='latex')
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
#content = pypandoc.convert_text(content, to='plain', format='latex')
elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
elif mode == 'nlp':
pass
if content != '':
                        if language == 'en':  # in English context, separate pieces of content with a space
                            para_text += content + ' '
                        else:  # in Chinese context, no separator is needed
                            para_text += content
if para_text.strip() == '':
continue
else:
page_markdown.append(para_text.strip() + ' ')
return page_markdown
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode,
img_buket_path=''):
page_markdown = []
for para_block in paras_of_layout:
para_text = ''
para_type = para_block['type']
if para_type == BlockType.Text:
para_text = str(merge_para_with_text(para_block)).strip()
        # handle titles
elif para_type == BlockType.Title:
para_text = f'{merge_para_with_text(para_block)}'
elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Image:
if mode == 'nlp':
continue
elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: assemble the image body
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
para_text += f"\n----------------这是ocr图片内容({join_path(img_buket_path, span['image_path'])})------------------ \n"
                for block in para_block['blocks']:  # 2nd: append the image caption
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block)
                for block in para_block['blocks']:  # 3rd: append the image footnote
if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block)
        # table blocks
elif para_type == BlockType.Table:
if mode == 'nlp':
continue
elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: append the table caption
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block)
                for block in para_block['blocks']:  # 2nd: assemble the table body
if block['type'] == BlockType.TableBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Table:
# if processed by table model
if span.get('latex', ''):
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
elif span.get('html', ''):
para_text += f"\n\n{span['html']}\n\n"
else:
para_text += span['image_path']
# # 处理图片
# # para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
# if status:
# # text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
# # start = time.time()
# # image_path = join_path(img_buket_path, span['image_path'])
# # compress_image(image_path)
# # generated_text = client.predict(image_path, text)
# # end = time.time()
# # logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}')
# para_text += span['image_path']
# else:
# para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
                for block in para_block['blocks']:  # 3rd: append the table footnote
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
if para_text.strip() == '':
continue
else:
page_markdown.append(para_text.strip() + ' ')
return page_markdown
def merge_para_with_text(para_block):
def detect_language(text):
en_pattern = r'[a-zA-Z]+'
en_matches = re.findall(en_pattern, text)
en_length = sum(len(match) for match in en_matches)
if len(text) > 0:
if en_length / len(text) >= 0.5:
return 'en'
else:
return 'unknown'
else:
return 'empty'
para_text = ''
for line in para_block['lines']:
line_text = ''
line_lang = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for span in line['spans']:
span_type = span['type']
content = ''
if span_type == ContentType.Text:
content = span['content']
# language = detect_lang(content)
language = detect_language(content)
                if language == 'en':  # only re-segment long English words; segmenting Chinese would lose text
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
content = f" ${span['content']}$ "
#content = pypandoc.convert_text(content, to='plain', format='latex')
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
#content = pypandoc.convert_text(content, to='plain', format='latex')
if content != '':
langs = ['zh', 'ja', 'ko']
                if line_lang in langs:  # some documents put one character per span; per-span language detection is unreliable there, so judge by the whole line
                    para_text += content  # Chinese/Japanese/Korean context: no separator between pieces of content
                elif line_lang == 'en':
                    # if the preceding content ends with a hyphen, do not append a trailing space
if __is_hyphen_at_line_end(content):
para_text += content[:-1]
else:
para_text += content + ' '
else:
                    para_text += content + ' '  # Western-language context: separate pieces of content with a space
return para_text
def para_to_standard_format(para, img_buket_path):
para_content = {}
if len(para) == 1:
para_content = line_to_standard_format(para[0], img_buket_path)
elif len(para) > 1:
para_text = ''
inline_equation_num = 0
for line in para:
for span in line['spans']:
language = ''
span_type = span.get('type')
content = ''
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
                    if language == 'en':  # only re-segment long English words; segmenting Chinese would lose text
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
#content = pypandoc.convert_text(content, to='plain', format='latex')
inline_equation_num += 1
                if language == 'en':  # in English context, separate pieces of content with a space
                    para_text += content + ' '
                else:  # in Chinese context, no separator is needed
                    para_text += content
para_content = {
'type': 'text',
'text': para_text,
'inline_equation_num': inline_equation_num,
}
return para_content
def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
para_type = para_block['type']
if para_type == BlockType.Text:
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block),
'page_idx': page_idx,
}
elif para_type == BlockType.Title:
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block),
'text_level': 1,
'page_idx': page_idx,
}
elif para_type == BlockType.InterlineEquation:
para_content = {
'type': 'equation',
'text': merge_para_with_text(para_block),
'text_format': 'latex',
'page_idx': page_idx,
}
elif para_type == BlockType.Image:
para_content = {'type': 'image', 'page_idx': page_idx}
for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody:
para_content['img_path'] = join_path(
img_buket_path,
block['lines'][0]['spans'][0]['image_path'])
if block['type'] == BlockType.ImageCaption:
para_content['img_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block)
elif para_type == BlockType.Table:
para_content = {'type': 'table', 'page_idx': page_idx}
for block in para_block['blocks']:
if block['type'] == BlockType.TableBody:
if block["lines"][0]["spans"][0].get('latex', ''):
para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
elif block["lines"][0]["spans"][0].get('html', ''):
para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
if block['type'] == BlockType.TableCaption:
para_content['table_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.TableFootnote:
para_content['table_footnote'] = merge_para_with_text(block)
return para_content
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
content_list = []
    for page_idx, page_info in enumerate(pdf_info_dict):
paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout:
continue
for para_block in paras_of_layout:
            para_content = para_to_standard_format_v2(para_block,
                                                      img_buket_path, page_idx)
content_list.append(para_content)
return content_list
def line_to_standard_format(line, img_buket_path):
line_text = ''
inline_equation_num = 0
for span in line['spans']:
if not span.get('content'):
if not span.get('image_path'):
continue
else:
if span['type'] == ContentType.Image:
content = {
'type': 'image',
'img_path': join_path(img_buket_path,
span['image_path']),
}
return content
elif span['type'] == ContentType.Table:
content = {
'type': 'table',
'img_path': join_path(img_buket_path,
span['image_path']),
}
return content
else:
if span['type'] == ContentType.InterlineEquation:
interline_equation = span['content']
content = {
'type': 'equation',
'latex': f'$$\n{interline_equation}\n$$'
}
return content
elif span['type'] == ContentType.InlineEquation:
inline_equation = span['content']
line_text += f'${inline_equation}$'
inline_equation_num += 1
elif span['type'] == ContentType.Text:
text_content = ocr_escape_special_markdown_char(
                    span['content'])  # escape special markdown characters
line_text += text_content
content = {
'type': 'text',
'text': line_text,
'inline_equation_num': inline_equation_num,
}
return content
def ocr_mk_mm_standard_format(pdf_info_dict: list, img_buket_path: str = ''):
    """content_list entry fields:
    type      string  image/text/table/equation (interline equations become standalone entries; inline equations are merged into the text).
    latex     string  LaTeX source text.
    text      string  plain-text content.
    md        string  markdown-formatted content.
    img_path  string  s3://full/path/to/img.jpg
    """
content_list = []
for page_info in pdf_info_dict:
blocks = page_info.get('preproc_blocks')
if not blocks:
continue
for block in blocks:
for line in block['lines']:
                content = line_to_standard_format(line, img_buket_path)
content_list.append(content)
return content_list
def union_make(ocr_status:str,
config_path: str,
pdf_info_dict: list,
make_mode: str,
drop_mode: str,
img_buket_path: str = ''):
output_content = []
# global client
# global status
# config = configparser.ConfigParser()
# config.read(config_path)
# url = config.get('server', 'ocr_server')
# logger.info(f'ocr_server:{url}')
# # client = PredictClient(url)
# status = ocr_status
for page_info in pdf_info_dict:
if page_info.get('need_drop', False):
drop_reason = page_info.get('drop_reason')
if drop_mode == DropMode.NONE:
pass
elif drop_mode == DropMode.WHOLE_PDF:
raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
f'drop_reason is {drop_reason}'))
elif drop_mode == DropMode.SINGLE_PAGE:
logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
f'drop_reason is {drop_reason}'))
continue
else:
raise Exception('drop_mode can not be null')
paras_of_layout = page_info.get('para_blocks')
#logger.info(f'paras_of_layout:\n{paras_of_layout}')
page_idx = page_info.get('page_idx')
if not paras_of_layout:
continue
if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp')
output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout:
para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx)
output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content)
elif make_mode == MakeMode.STANDARD_FORMAT:
return output_content
magic_pdf/dict2md/ocr_server.py
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree
import configparser
import copy
import re
import gc
import time
import torch
from argparse import ArgumentParser
from threading import Thread
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
from loguru import logger
app = FastAPI()
DEFAULT_CKPT_PATH = '/home/practice/model/Qwen2-VL-7B-Instruct'
REVISION = 'v1.0.4'
BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
logger.add("parse.log", rotation="10 MB", level="INFO",
format="{time} {level} {message}", encoding='utf-8', enqueue=True)
def _get_args():
parser = ArgumentParser()
parser.add_argument('-c', '--checkpoint_path', type=str, default=DEFAULT_CKPT_PATH,
help='Checkpoint name or path, default to %(default)r')
parser.add_argument('--cpu_only', action='store_true', help='Run demo with CPU only')
parser.add_argument('--flash_attn2', action='store_true', default=False,
help='Enable flash_attention_2 when loading the model.')
parser.add_argument('--share', action='store_true', default=False,
help='Create a publicly shareable link for the interface.')
parser.add_argument('--inbrowser', action='store_true', default=False,
help='Automatically launch the interface in a new tab on the default browser.')
parser.add_argument('--dcu_id', type=str, default='0', help='Specify the GPU ID to load the model onto.')
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
)
args = parser.parse_args()
return args
def _load_model_processor(args):
if args.cpu_only:
device_map = 'cpu'
else:
if args.dcu_id is not None:
device_map = {'': f'cuda:{args.dcu_id}'}
            print('Using DCU for inference:', f'cuda:{args.dcu_id}')
else:
device_map = 'auto'
if args.flash_attn2:
model = Qwen2VLForConditionalGeneration.from_pretrained(
args.checkpoint_path,
torch_dtype=torch.float16,
attn_implementation='flash_attention_2',
device_map=device_map
)
else:
model = Qwen2VLForConditionalGeneration.from_pretrained(
args.checkpoint_path,
torch_dtype=torch.float16,
device_map=device_map
)
processor = AutoProcessor.from_pretrained(args.checkpoint_path)
return model, processor
def _parse_text(text):
lines = text.split("\n")
    lines = [line for line in lines if line.strip() != ""]  # drop empty lines
count = 0
parsed_lines = []
for i, line in enumerate(lines):
if "```" in line:
count += 1
items = line.split("`")
if count % 2 == 1:
                # opening code block
parsed_lines.append(f'<pre><code class="language-{items[-1]}">')
else:
                # closing code block
parsed_lines.append(f"</code></pre>")
else:
if i > 0 and count % 2 == 1:
                # escape special characters inside the code block
line = line.replace("`", r"\`")
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
line = line.replace(" ", "&nbsp;")
line = line.replace("*", "&ast;")
line = line.replace("_", "&lowbar;")
line = line.replace("-", "&#45;")
line = line.replace(".", "&#46;")
line = line.replace("!", "&#33;")
line = line.replace("(", "&#40;")
line = line.replace(")", "&#41;")
line = line.replace("$", "&#36;")
            # join lines with spaces
if parsed_lines:
parsed_lines[-1] += " " + line
else:
parsed_lines.append(line)
text = "".join(parsed_lines)
return text
def _remove_image_special(text):
text = text.replace('<ref>', '').replace('</ref>', '')
return re.sub(r'<box>.*?(</box>|$)', '', text)
def _is_video_file(filename):
video_extensions = ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg']
return any(filename.lower().endswith(ext) for ext in video_extensions)
def _transform_messages(original_messages):
transformed_messages = []
for message in original_messages:
new_content = []
for item in message['content']:
if 'image' in item:
new_item = {'type': 'image', 'image': item['image']}
elif 'text' in item:
new_item = {'type': 'text', 'text': item['text']}
elif 'video' in item:
new_item = {'type': 'video', 'video': item['video']}
else:
continue
new_content.append(new_item)
new_message = {'role': message['role'], 'content': new_content}
transformed_messages.append(new_message)
return transformed_messages
def _gc():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def call_local_model(model, processor, messages):
messages = _transform_messages(messages)
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt')
inputs = inputs.to(model.device)
tokenizer = processor.tokenizer
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs = {'max_new_tokens': 512, 'streamer': streamer, **inputs}
thread = Thread(target=model.generate, kwargs=gen_kwargs)
thread.start()
generated_text = ''
for new_text in streamer:
generated_text += new_text
yield _parse_text(generated_text)
def create_predict_fn(model, processor):
def predict(_chatbot, task_history):
chat_query = _chatbot[-1][0]
query = task_history[-1][0]
if len(chat_query) == 0:
_chatbot.pop()
task_history.pop()
return _chatbot
print('User: ' + _parse_text(query))
history_cp = copy.deepcopy(task_history)
full_response = ''
messages = []
content = []
for q, a in history_cp:
if isinstance(q, (tuple, list)):
if _is_video_file(q[0]):
content.append({'video': f'file://{q[0]}'})
else:
content.append({'image': f'file://{q[0]}'})
else:
content.append({'text': q})
messages.append({'role': 'user', 'content': content})
messages.append({'role': 'assistant', 'content': [{'text': a}]})
content = []
messages.pop()
for response in call_local_model(model, processor, messages):
_chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
yield _chatbot
full_response = _parse_text(response)
task_history[-1] = (query, full_response)
print('Qwen-VL-Chat: ' + _parse_text(full_response))
yield _chatbot
return predict
# Load the model at import time
args = _get_args()
model, processor = _load_model_processor(args)
class Item(BaseModel):
image_path: str
text: str
@app.get("/health")
async def health_check():
return {"status": "healthy"}
@app.post("/predict")
async def predict(item: Item):
messages = [
{
'role': 'user',
'content': [
{'image': item.image_path},
{'text': item.text}
]
}
]
start = time.time()
generated_text = ''
for response in call_local_model(model, processor, messages):
generated_text = _parse_text(response)
_gc()
end = time.time()
    logger.info(f'Parsed result for [{item.image_path}]: {generated_text}, elapsed: {end - start}s')
return {"Generated Text": generated_text}
if __name__ == "__main__":
import uvicorn
args = _get_args()
config = configparser.ConfigParser()
config.read(args.config_path)
# host = config.get('server', 'ocr_host')
    ocr_server_url = config.get('server', 'ocr_server')
    host, port = ocr_server_url.split('://')[1].split(':')[0], int(ocr_server_url.split('://')[1].split(':')[1])
# port = int(config.get('server', 'ocr_port'))
uvicorn.run(app, host=host, port=port)
"""
根据利用meta_scan得到的结果,对pdf是否为文字版进行分类。
定义标准:
一、什么pdf会是文字pdf,只要满足以下任意一条
1. 随机抽取N页,如果有任何一页文字数目大于100
2. 只要存在一个页面,图片的数量为0
二、什么是扫描版pdf,只要满足以下任意一条
1. ~~80%页面上的最大图大小一样并且面积超过页面面积0.6~~
2. 大部分页面上文字的长度都是相等的。
"""
import json
import sys
from collections import Counter
import click
import numpy as np
from loguru import logger
from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 100
TEXT_LEN_SAMPLE_RATIO = 0.1  # sample 10% of the pages for text-length statistics
# A stitching scheme: merge the split page images of certain scanned PDFs back into one full-page image
def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
    # First use a set to drop images whose bboxes are duplicates
image_list_result = []
for page_images in image_list:
page_result = []
dedup = set()
for img in page_images:
x0, y0, x1, y1, img_bojid = img
            if (x0, y0, x1, y1) in dedup:  # duplicate bboxes show up here and only need to appear once, so drop the repeats
continue
else:
dedup.add((x0, y0, x1, y1))
page_result.append([x0, y0, x1, y1, img_bojid])
image_list_result.append(page_result)
    # Next, merge the images on each page that can be stitched together
merged_images = []
for page_images in image_list_result:
if not page_images:
continue
        # first sort the page's images top-to-bottom, then left-to-right
page_images.sort(key=lambda img: (img[1], img[0]))
merged = [page_images[0]]
for img in page_images[1:]:
x0, y0, x1, y1, imgid = img
last_img = merged[-1]
last_x0, last_y0, last_x1, last_y1, last_imgid = last_img
            # a single image covering at least 90% of the page width or height is a precondition for stitching
            full_width = abs(x1 - x0) >= page_width * 0.9
            full_height = abs(y1 - y0) >= page_height * 0.9
            # if the width qualifies, check whether the images can be stitched vertically
            if full_width:
                # vertical stitching requires the left/right edges to differ by at most max_offset, and the gap between the first image's bottom edge and the second image's top edge to be at most max_gap
                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
                    last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
            # if the height qualifies, check whether the images can be stitched horizontally
            if full_height:
                # horizontal stitching requires the top/bottom edges to differ by at most max_offset, and the gap between the first image's right edge and the second image's left edge to be at most max_gap
                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
                    last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
# Check if the image can be merged with the last image
if (full_width and close1) or (full_height and close2):
# Merge the image with the last image
merged[-1] = [min(x0, last_x0), min(y0, last_y0),
max(x1, last_x1), max(y1, last_y1), imgid]
else:
# Add the image as a new image
merged.append(img)
merged_images.append(merged)
return merged_images
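# A minimal sketch of the stitching rule (hypothetical coordinates): on a 600 x 800
# page, two strips [0, 0, 595, 400, 1] and [0, 401, 595, 800, 2] each span >= 90% of
# the page width, their left/right edges differ by <= max_offset, and the vertical
# gap between them is <= max_gap, so they are merged into one bbox [0, 0, 595, 800].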
def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list):
"""
80%页面上的最大图大小一样并且面积超过页面面积0.6则返回False,否则返回True
:param pdf_path:
:param total_page:
:param page_width:
:param page_height:
:param img_sz_list:
:return:
"""
# # 只要有一页没有图片,那么就是文字pdf。但是同时还需要满足一个条件就是这个页面上同时不能有文字。发现过一些扫描版pdf,上面有一些空白页面,既没有图片也没有文字。
# if any([len(img_sz) == 0 for img_sz in img_sz_list]): # 含有不含图片的页面
# # 现在找到这些页面的index
# empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0]
# # 然后检查这些页面上是否有文字
# text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0]
# if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD: # 没有图片,但是有文字,说明可能是个文字版,如果没有文字则无法判断,留给下一步,现在要求这页文字量超过一定阈值
# return True
    # Use objid to drop images that repeat many times; these are hidden transparent layers whose telltale sign is that they share the same id
    # First count how many times each id occurs
objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
    # Then drop the ids that repeat too often
    if total_page >= scan_max_page:  # the new meta_scan only scans the first scan_max_page pages, so cap total_page at scan_max_page when it is larger
        total_page = scan_max_page
    repeat_threshold = 2  # set the bad_image threshold to 2
    # repeat_threshold = min(2, total_page)  # when total_page is 1 this makes repeat_threshold 1, misclassifying every img as a bad_img
bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
# bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])]
# text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in bad_image_page_idx and text_len > 0]
    # Special case: a text PDF where every page is covered by one oversized transparent image ("oversized" meaning it covers more than 90% of the page area)
# fake_image_ids = [objid for objid in bad_image_objid if
# any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for
# x0, y0, x1, y1, _ in images])] # 原来的代码,any里面恒为true了,原因???
# fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images
# if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9]
# if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值
# return True
    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
                   img_sz_list]  # filter out the repeated images
    # Some scanned PDFs split one page image into many pieces; stitch them back together before measuring
    img_sz_list = merge_images(img_sz_list, page_width, page_height)
    # Compute the area of the largest image on each page, then its ratio to the page area
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
img_sz_list]
page_area = page_width * page_height
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
    if len(max_image_area_per_page) >= 0.5 * total_page:  # threshold lowered from 0.8 to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
        # this check presupposes that the repeated images (hidden transparent layers sharing the same id) have already been removed
return False
else:
return True
def classify_by_text_len(text_len_list: list, total_page: int):
"""
随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
查看页面上的文字长度,如果有任何一个页面的文字长度大于TEXT_LEN_THRESHOLD,那么就是文字pdf
:param total_page:
:param text_len_list:
:return:
"""
    select_page_cnt = int(total_page * TEXT_LEN_SAMPLE_RATIO)  # sample 10% of the pages
if select_page_cnt < 5:
select_page_cnt = total_page
# # 排除头尾各10页
# if total_page > 20: # 如果总页数大于20
# page_range = list(range(10, total_page - 10)) # 从第11页到倒数第11页
# else:
# page_range = list(range(total_page)) # 否则选择所有页面
# page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False)
    # Excluding the first and last 10 pages is awkward for PDFs with only 21-22 pages: if the one or two sampled middle pages happen to contain no text, misclassification is likely. With the avg_words rule in place this rule can be dropped.
page_num = np.random.choice(total_page, select_page_cnt, replace=False)
text_len_lst = [text_len_list[i] for i in page_num]
is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
return is_text_pdf
def classify_by_avg_words(text_len_list: list):
"""
补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
主要是各种图集
:param text_len_list:
:return:
"""
sum_words = sum(text_len_list)
count_of_numbers = len(text_len_list)
if count_of_numbers == 0:
is_text_pdf = False
else:
avg_words = round(sum_words / count_of_numbers)
if avg_words > AVG_TEXT_LEN_THRESHOLD:
is_text_pdf = True
else:
is_text_pdf = False
return is_text_pdf
def classify_by_img_num(img_sz_list: list, img_num_list: list):
"""
补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
这种pdf的 metasca 扫描结果的特点是 img_sz_list 内全是空元素,img_num_list中每一页的数量都很大且相同
:param img_sz_list:
:param img_num_list:
:return:
"""
    # count the non-empty elements in img_sz_list
    count_img_sz_list_not_none = sum(1 for item in img_sz_list if item)
    # take the top 80% of the elements
    top_eighty_percent = get_top_percent_list(img_num_list, 0.8)
    # at most one non-empty element in img_sz_list, the top 80% of img_num_list all equal, and the maximum at least junk_limit_min
    if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
        # use the max and min values to check whether everything in the list is equal
# min_imgs = min(img_num_list)
# max_imgs = max(img_num_list)
#
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
        return False  # if this condition holds, the PDF is definitely not a text PDF
    else:
        return True  # otherwise it may be a text PDF; let the other rules decide
def classify_by_text_layout(text_layout_per_page: list):
"""
判断文本布局是否以竖排为主。
Args:
text_layout_per_page (list): 文本布局列表,列表中的每个元素表示一页的文本布局,
值为'vertical'表示竖排,值为'horizontal'表示横排。
Returns:
bool: 若文本布局以竖排为主,则返回False;否则返回True。
"""
    # count the vertically laid-out pages in text_layout_per_page
    count_vertical = sum(1 for item in text_layout_per_page if item == 'vertical')
    # count the horizontally laid-out pages in text_layout_per_page
    count_horizontal = sum(1 for item in text_layout_per_page if item == 'horizontal')
    # compute the proportion of vertically laid-out pages
    known_layout_cnt = count_vertical + count_horizontal
    if known_layout_cnt != 0:
        ratio = count_vertical / known_layout_cnt
        if ratio >= 0.5:  # threshold set to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
            return False  # predominantly vertical layout: treat as not a text PDF
        else:
            return True  # predominantly horizontal layout: treat as a text PDF
    else:
        return False  # layout unknown: default to not a text PDF
def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
"""
判断一页是否由细长条组成,有两个条件:
1. 图片的宽或高达到页面宽或高的90%,且长边需要是窄边长度的数倍以上
2. 整个页面所有的图片有80%以上满足条件1
Args:
page_width (float): 页面宽度
page_height (float): 页面高度
img_sz_list (list): 图片尺寸列表,每个元素为一个元组,表示图片的矩形区域和尺寸,形如(x0, y0, x1, y1, size),其中(x0, y0)为矩形区域的左上角坐标,(x1, y1)为矩形区域的右下角坐标,size为图片的尺寸
Returns:
bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
"""
def is_narrow_strip(img):
x0, y0, x1, y1, _ = img
width, height = x1 - x0, y1 - y0
return any([
# 图片宽度大于等于页面宽度的90%,且宽度大于等于高度4倍
width >= page_width * 0.9 and width >= height * 4,
# 图片高度大于等于页面高度的90%,且高度大于等于宽度4倍
height >= page_height * 0.9 and height >= width * 4,
])
    # number of pages made of narrow strips
    narrow_strip_pages_count = 0
    # iterate over all pages
    for page_img_list in img_sz_list:
        # skip empty pages
        if not page_img_list:
            continue
        # total number of images on the page
        total_images = len(page_img_list)
        # number of narrow-strip images on the page
        narrow_strip_images_count = 0
        for img in page_img_list:
            if is_narrow_strip(img):
                narrow_strip_images_count += 1
        # skip pages with fewer than 5 narrow-strip images
        if narrow_strip_images_count < 5:
            continue
        else:
            # if narrow strips make up at least 80% of the page's images, count this page
            if narrow_strip_images_count / total_images >= 0.8:
                narrow_strip_pages_count += 1
    # proportion of pages made of narrow strips
    narrow_strip_pages_ratio = narrow_strip_pages_count / len(img_sz_list)
return narrow_strip_pages_ratio < 0.5
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
text_layout_list: list, invalid_chars: bool):
"""
    Image and page dimensions here are in pts.
    :param total_page:
    :param page_width:
    :param page_height:
    :param img_sz_list:
    :param text_len_list:
    :param img_num_list:
    :param text_layout_list:
    :param invalid_chars:
    :return:
"""
results = {
'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list),
'by_text_len': classify_by_text_len(text_len_list, total_page),
'by_avg_words': classify_by_avg_words(text_len_list),
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
'by_invalid_chars': invalid_chars,
}
if all(results.values()):
return True, results
elif not any(results.values()):
return False, results
else:
        logger.warning(
            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
            f" by_invalid_chars: {results['by_invalid_chars']}"
        )  # this case helps quickly spot unusual PDFs and refine the classification rules accordingly
return False, results
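# A worked sketch (hypothetical inputs): for a 10-page PDF where every sampled page
# has more than 100 characters of text, no page is dominated by one near-full-page
# image, the layout is horizontal, the image counts look normal, and there are no
# invalid characters, every sub-check returns True and classify(...) returns
# (True, results). If the checks disagree (e.g. long text but a full-page image on
# most pages), the mixed result is logged and the PDF is conservatively treated as
# a non-text PDF (False).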
@click.command()
@click.option("--json-file", type=str, help="pdf信息")
def main(json_file):
if json_file is None:
print("json_file is None", file=sys.stderr)
exit(0)
try:
with open(json_file, "r") as f:
for l in f:
if l.strip() == "":
continue
o = json.loads(l)
total_page = o["total_page"]
page_width = o["page_width_pts"]
page_height = o["page_height_pts"]
img_sz_list = o["image_info_per_page"]
text_len_list = o['text_len_per_page']
text_layout_list = o['text_layout_per_page']
pdf_path = o['pdf_path']
is_encrypted = o['is_encrypted']
is_needs_password = o['is_needs_password']
                if is_encrypted or total_page == 0 or is_needs_password:  # skip encrypted, password-protected, and zero-page PDFs
continue
tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
o['is_text_pdf'] = tag
print(json.dumps(o, ensure_ascii=False))
except Exception as e:
print("ERROR: ", e, file=sys.stderr)
if __name__ == "__main__":
main()
# false = False
# true = True
# null = None
# o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]]
,[[0,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368
,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1
386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,513,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],
[[0,0,368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,2085]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,513,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,0,
368,513,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,2433]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54
,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}}
# o = json.loads(json.dumps(o))
# total_page = o["total_page"]
# page_width = o["page_width_pts"]
# page_height = o["page_height_pts"]
# img_sz_list = o["image_info_per_page"]
# text_len_list = o['text_len_per_page']
# pdf_path = o['pdf_path']
# is_encrypted = o['is_encrypted']
# is_needs_password = o['is_needs_password']
# if is_encrypted or total_page == 0 or is_needs_password:  # skip encrypted, password-protected, or zero-page PDFs
#     print("encrypted")
# exit(0)
# tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list)
# o['is_text_pdf'] = tag
# print(json.dumps(o, ensure_ascii=False))
"""
输入: s3路径,每行一个
输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
"""
import sys
import click
from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
from magic_pdf.libs.commons import fitz
from loguru import logger
from collections import Counter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars
scan_max_page = 50
junk_limit_min = 10
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
result]
page_area = int(page_width_pts) * int(page_height_pts)
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
return max_image_area_per_page
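# Worked example of the filter above (a sketch only, using the page size from the sample metadata):
# on a 368 x 513 pt page (area 188784 pt^2), an image spanning [0, 0, 368, 400] covers
# 147200 / 188784 ≈ 0.78 of the page, so it passes the > 0.6 cutoff and counts as a "large image" page.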
def process_image(page, junk_img_bojids=[]):
    page_result = []  # holds, for this page, one [x0, y0, x1, y1, img_bojid] entry per image
items = page.get_images()
dedup = set()
for img in items:
        # get_image_rects below returns the size at which the image is actually displayed on the page:
        # a list whose elements each start with the display rectangle
        img_bojid = img[0]  # globally unique within the PDF file; an image that shows up repeatedly is likely junk, e.g. a watermark or a header/footer
        if img_bojid in junk_img_bojids:  # skip junk images
continue
recs = page.get_image_rects(img, transform=True)
if recs:
rec = recs[0][0]
x0, y0, x1, y1 = map(int, rec)
width = x1 - x0
height = y1 - y0
            if (x0, y0, x1, y1, img_bojid) in dedup:  # duplicate bboxes can show up here; skip the repeats
continue
            if not all([width, height]):  # neither width nor height may be 0, otherwise the image is invisible and meaningless
continue
dedup.add((x0, y0, x1, y1, img_bojid))
page_result.append([x0, y0, x1, y1, img_bojid])
return page_result
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> tuple:
    """
    Return, for every page, the [x0, y0, x1, y1, img_bojid] info of its images (a page may contain several),
    together with the list of junk image object ids.
    :param doc: the PDF document object
    :return: (image info per page, junk img_bojid list)
    """
    # Use a Counter to tally how often each img_bojid occurs
    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
    # Find img_bojids that occur on at least half of the pages
    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with very few pages
    junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
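    # Worked example (a sketch, not the output of a real run): for a 300-page document,
    # junk_limit = max(300 * 0.5, 10) = 150, so any image object referenced on 150 or more pages
    # (typically a watermark or a header/footer graphic) is flagged as junk.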
    # TODO: add a check using only the first ten pages; junk images must not only occur often enough,
    #   but also cover a large enough share of the page area, with all images roughly the same size
    # There are two kinds of scanned PDFs and one kind of text PDF, so misclassification is possible here:
    # Scanned type 1: every page stores all of the scan images; images dominate the page, one image is displayed per page
    # Scanned type 2: the number of stored scan images grows page by page; images dominate, one is displayed per page,
    #   and the junk list must be cleared so the first 50 pages can be rescanned for classification
    # Text type 1: every page stores all images, but they cover only a small share of the page; a page may display 0 or more than 1.
    #   Such PDFs need the first 10 pages sampled for image size and count; if they match, the junk list must be cleared
imgs_len_list = [len(page.get_images()) for page in doc]
special_limit_pages = 10
    # Always base the decision on the first ten pages
result = []
break_loop = False
for i, page in enumerate(doc):
if break_loop:
break
if i >= special_limit_pages:
break
        page_result = process_image(page)  # junk_img_bojids is deliberately not passed: collect every image on the first ten pages for the analysis below
result.append(page_result)
    for item in result:
        if not any(item):  # this page has no images, so it looks like a text PDF; check whether it is the special text case
            if max(imgs_len_list) == min(imgs_len_list) and max(
                    imgs_len_list) >= junk_limit_min:  # special text case: clear the junk list and break
                junk_img_bojids = []
            else:  # not the special case, just an ordinary text PDF that happens to contain junk images; keep the junk list
                pass
            break_loop = True
            break
if not break_loop:
        # Take the top 80% of the per-page image counts
        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
        # Check whether that top 80% are all equal
        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
            # # If all of the first 10 pages carry images, decide from the per-page image counts whether the junk list needs clearing
            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
            # All of the first 10 pages carry images with matching counts; check how much of the page the images cover to decide whether to clear the junk list
            max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
            if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are dominated by a large image, so this is probably a text PDF; clear the junk image list
                junk_img_bojids = []
            else:  # the first 10 pages all carry images, 80% of them are large, and per-page counts are equal and high: scanned type 1, keep the junk list
                pass
        else:  # per-page image counts differ; clear the junk list and rescan the first 50 pages in full
            junk_img_bojids = []
    # Now run the real pass that collects image info for the first 50 pages
result = []
for i, page in enumerate(doc):
if i >= scan_max_page:
break
page_result = process_image(page, junk_img_bojids)
# logger.info(f"page {i} img_len: {len(page_result)}")
result.append(page_result)
return result, junk_img_bojids
def get_pdf_page_size_pts(doc: fitz.Document):
page_cnt = len(doc)
l: int = min(page_cnt, 50)
    # Put all widths and heights into two lists and take the median of each (we once hit a PDF with landscape pages inside a portrait document, which swapped width and height)
page_width_list = []
page_height_list = []
for i in range(l):
page = doc[i]
page_rect = page.rect
page_width_list.append(page_rect.width)
page_height_list.append(page_rect.height)
page_width_list.sort()
page_height_list.sort()
median_width = page_width_list[len(page_width_list) // 2]
median_height = page_height_list[len(page_height_list) // 2]
return median_width, median_height
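# A small sketch of why the median is used (hypothetical numbers): if 48 portrait pages measure
# 368 x 513 pts and 2 rotated pages measure 513 x 368 pts, the sorted width list is mostly 368s,
# so the medians still come out as (368, 513) instead of being skewed by the stray landscape pages.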
def get_pdf_textlen_per_page(doc: fitz.Document):
text_len_lst = []
for page in doc:
        # get all blocks, both image and text
        # text_block = page.get_text("blocks")
        # get all text blocks
        # text_block = page.get_text("words")
        # text_block_len = sum([len(t[4]) for t in text_block])
        # get all of the page's text as one string
text_block = page.get_text("text")
text_block_len = len(text_block)
# logger.info(f"page {page.number} text_block_len: {text_block_len}")
text_len_lst.append(text_block_len)
return text_len_lst
def get_pdf_text_layout_per_page(doc: fitz.Document):
"""
根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
List[str]: 每一页的文本布局(横向、纵向、未知)。
"""
text_layout_list = []
for page_id, page in enumerate(doc):
if page_id >= scan_max_page:
break
        # counters for the number of vertical and horizontal text lines on this page
vertical_count = 0
horizontal_count = 0
text_dict = page.get_text("dict")
if "blocks" in text_dict:
for block in text_dict["blocks"]:
if 'lines' in block:
for line in block["lines"]:
                        # get the corner coordinates of the line's bbox
                        x0, y0, x1, y1 = line['bbox']
                        # compute the bbox width and height
                        width = x1 - x0
                        height = y1 - y0
                        # compute the bbox area
area = width * height
font_sizes = []
for span in line['spans']:
if 'size' in span:
font_sizes.append(span['size'])
if len(font_sizes) > 0:
average_font_size = sum(font_sizes) / len(font_sizes)
else:
                            average_font_size = 10  # some lines carry no font size; fall back to a default of 10
                        if area <= average_font_size ** 2:  # a bbox no larger than the squared average font size is a single character, whose orientation cannot be determined
continue
else:
                            if 'wmode' in line:  # use wmode to determine the text direction
                                if line['wmode'] == 1:  # vertical text
                                    vertical_count += 1
                                elif line['wmode'] == 0:  # horizontal text
                                    horizontal_count += 1
                            # if 'dir' in line:  # determine the text direction from the rotation angle
                            #     # get the line's "dir" value
                            #     dir_value = line['dir']
                            #     cosine, sine = dir_value
                            #     # compute the angle
                            #     angle = math.degrees(math.acos(cosine))
                            #
                            #     # horizontal text?
                            #     if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01:
                            #         # line_text = ' '.join(span['text'] for span in line['spans'])
                            #         # print('This line is horizontal:', line_text)
                            #         horizontal_count += 1
                            #     # vertical text?
                            #     elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01:
                            #         # line_text = ' '.join(span['text'] for span in line['spans'])
                            #         # print('This line is vertical:', line_text)
                            #         vertical_count += 1
# print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
        # decide the text layout of this page
        if vertical_count == 0 and horizontal_count == 0:  # the page has no text, so the layout cannot be determined
            text_layout_list.append("unknow")
            continue
        else:
            if vertical_count > horizontal_count:  # more vertical text lines than horizontal ones
                text_layout_list.append("vertical")
            else:  # at least as many horizontal text lines as vertical ones
                text_layout_list.append("horizontal")
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
return text_layout_list
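# Example of the per-page vote (a sketch): on a traditionally typeset vertical Chinese page most lines carry
# wmode == 1, so vertical_count ends up larger than horizontal_count and the page is labelled "vertical";
# a page with no countable lines is labelled "unknow".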
'''Custom exception raised for PDFs where a single page contains too many SVG drawings'''
class PageSvgsTooManyError(Exception):
def __init__(self, message="Page SVGs are too many"):
self.message = message
super().__init__(self.message)
def get_svgs_per_page(doc: fitz.Document):
svgs_len_list = []
for page_id, page in enumerate(doc):
        # svgs = page.get_drawings()
        svgs = page.get_cdrawings()  # switched to get_cdrawings, which is more efficient
len_svgs = len(svgs)
if len_svgs >= 3000:
raise PageSvgsTooManyError()
else:
svgs_len_list.append(len_svgs)
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
return svgs_len_list
def get_imgs_per_page(doc: fitz.Document):
imgs_len_list = []
for page_id, page in enumerate(doc):
imgs = page.get_images()
imgs_len_list.append(len(imgs))
# logger.info(f"page_id: {page}, imgs_len: {len(imgs)}")
return imgs_len_list
def get_language(doc: fitz.Document):
"""
获取PDF文档的语言。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
str: 文档语言,如 "en-US"。
"""
language_lst = []
for page_id, page in enumerate(doc):
if page_id >= scan_max_page:
break
        # get all of the page's text as one string
text_block = page.get_text("text")
page_language = detect_lang(text_block)
language_lst.append(page_language)
# logger.info(f"page_id: {page_id}, page_language: {page_language}")
    # count how many pages were detected as each language
    count_dict = Counter(language_lst)
    # return the most frequent language
language = max(count_dict, key=count_dict.get)
return language
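# Example of the language vote (a sketch): if the scanned pages are detected as ["en", "en", "zh", "en"],
# Counter yields {"en": 3, "zh": 1} and max(count_dict, key=count_dict.get) returns "en".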
def check_invalid_chars(pdf_bytes):
"""
乱码检测
"""
return detect_invalid_chars(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes):
"""
:param s3_pdf_path:
:param pdf_bytes: pdf文件的二进制数据
几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取
"""
doc = fitz.open("pdf", pdf_bytes)
is_needs_password = doc.needs_pass
is_encrypted = doc.is_encrypted
total_page = len(doc)
if total_page == 0:
logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
return result
else:
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
# logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
# svgs_per_page = get_svgs_per_page(doc)
# logger.info(f"svgs_per_page: {svgs_per_page}")
imgs_per_page = get_imgs_per_page(doc)
# logger.info(f"imgs_per_page: {imgs_per_page}")
image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page = get_pdf_textlen_per_page(doc)
# logger.info(f"text_len_per_page: {text_len_per_page}")
text_layout_per_page = get_pdf_text_layout_per_page(doc)
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")
invalid_chars = check_invalid_chars(pdf_bytes)
# logger.info(f"invalid_chars: {invalid_chars}")
        # finally assemble one JSON record
res = {
"is_needs_password": is_needs_password,
"is_encrypted": is_encrypted,
"total_page": total_page,
"page_width_pts": int(page_width_pts),
"page_height_pts": int(page_height_pts),
"image_info_per_page": image_info_per_page,
"text_len_per_page": text_len_per_page,
"text_layout_per_page": text_layout_per_page,
"text_language": text_language,
# "svgs_per_page": svgs_per_page,
"imgs_per_page": imgs_per_page, # 增加每页img数量list
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
"invalid_chars": invalid_chars,
"metadata": doc.metadata
}
# logger.info(json.dumps(res, ensure_ascii=False))
return res
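# A minimal local-file sketch (the file name is hypothetical, and `json` would need to be imported for the dump):
# with open("sample.pdf", "rb") as f:
#     meta = pdf_meta_scan(f.read())
# print(json.dumps(meta, ensure_ascii=False))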
@click.command()
@click.option('--s3-pdf-path', help='path of the PDF file on S3')
@click.option('--s3-profile', help='S3 profile used to read the file')
def main(s3_pdf_path: str, s3_profile: str):
"""
"""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(file_content)
except Exception as e:
print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
logger.exception(e)
if __name__ == '__main__':
main()
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
# "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
# file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
# doc = fitz.open("pdf", file_content)
# text_layout_lst = get_pdf_text_layout_per_page(doc)
# print(text_layout_lst)