Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json,...

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json, requirements.txt, requirements-docker.txt, requirements-qa.txt, update_version.py, setup.py, magic_pdf/__init__.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/tmp.py, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, 
magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, 
magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/pdf_client.py, magic_pdf/tools/common.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/cli.py, magic_pdf/tools/pdf_server.py files

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json,...
Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json, requirements.txt, requirements-docker.txt, requirements-qa.txt, update_version.py, setup.py, magic_pdf/__init__.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/tmp.py, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, 
magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, 
magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/pdf_client.py, magic_pdf/tools/common.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/cli.py, magic_pdf/tools/pdf_server.py files
c9171d1f · zhougaofeng · 748e3b56 · c9171d1f · c9171d1f · c9171d1f
Commit c9171d1f authored Oct 22, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
+import datetime
+import json
+import os, re, configparser
+import subprocess
+import time
+import boto3
+from loguru import logger
+from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+import fitz # 1.23.9中已经切换到rebase
+# import fitz_old as fitz  # 使用1.23.9之前的pymupdf库
+def get_delta_time(input_time):
+    return round(time.time() - input_time, 2)
+def join_path(*args):
+    return '/'.join(str(s).rstrip('/') for s in args)
+# Global error-log path, defined here so that the demo code can reference the same location
+error_log_path = "s3://llm-pdf-text/err_logs/"
+# json_dump_path = "s3://pdf_books_temp/json_dump/" # this path is only for temporary local testing and must not be committed to main
+json_dump_path = "s3://llm-pdf-text/json_dump/"
+# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # the base library should not contain such concrete paths; define them in business code
+def get_top_percent_list(num_list, percent):
+    """
+    获取列表中前百分之多少的元素
+    :param num_list:
+    :param percent:
+    :return:
+    """
+    if len(num_list) == 0:
+        top_percent_list = []
+    else:
+        # 对imgs_len_list排序
+        sorted_imgs_len_list = sorted(num_list, reverse=True)
+        # 计算 percent 的索引
+        top_percent_index = int(len(sorted_imgs_len_list) * percent)
+        # 取前80%的元素
+        top_percent_list = sorted_imgs_len_list[:top_percent_index]
+    return top_percent_list
+def formatted_time(time_stamp):
+    dt_object = datetime.datetime.fromtimestamp(time_stamp)
+    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
+    return output_time
+def mymax(alist: list):
+    if len(alist) == 0:
+        return 0  # 空是0， 0*0也是0大小q
+    else:
+        return max(alist)
+def parse_aws_param(profile):
+    if isinstance(profile, str):
+        # 解析配置文件
+        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
+        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
+        config = configparser.ConfigParser()
+        config.read(credentials_file)
+        config.read(config_file)
+        # 获取 AWS 账户相关信息
+        ak = config.get(profile, "aws_access_key_id")
+        sk = config.get(profile, "aws_secret_access_key")
+        if profile == "default":
+            s3_str = config.get(f"{profile}", "s3")
+        else:
+            s3_str = config.get(f"profile {profile}", "s3")
+        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+        if end_match:
+            endpoint = end_match.group(1)
+        else:
+            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
+        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+        if style_match:
+            addressing_style = style_match.group(1)
+        else:
+            addressing_style = "path"
+    elif isinstance(profile, dict):
+        ak = profile["ak"]
+        sk = profile["sk"]
+        endpoint = profile["endpoint"]
+        addressing_style = "auto"
+    return ak, sk, endpoint, addressing_style
+def parse_bucket_key(s3_full_path: str):
+    """
+    输入 s3://bucket/path/to/my/file.txt
+    输出 bucket, path/to/my/file.txt
+    """
+    s3_full_path = s3_full_path.strip()
+    if s3_full_path.startswith("s3://"):
+        s3_full_path = s3_full_path[5:]
+    if s3_full_path.startswith("/"):
+        s3_full_path = s3_full_path[1:]
+    bucket, key = s3_full_path.split("/", 1)
+    return bucket, key
+def read_file(pdf_path: str, s3_profile):
+    if pdf_path.startswith("s3://"):
+        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
+        bucket_name, bucket_key = parse_bucket_key(pdf_path)
+        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
+        file_content = res["Body"].read()
+        return file_content
+    else:
+        with open(pdf_path, "rb") as f:
+            return f.read()
+def get_docx_model_output(pdf_model_output, page_id):
+    model_output_json = pdf_model_output[page_id]
+    return model_output_json
+def list_dir(dir_path:str, s3_profile:str):
+    """
+    列出dir_path下的所有文件
+    """
+    ret = []
+    if dir_path.startswith("s3"):
+        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
+        bucket, path = s3info[0][0], s3info[0][1]
+        try:
+            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                                            config=Config(s3={'addressing_style': addressing_style}))
+            def list_obj_scluster():
+                marker = None
+                while True:
+                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
+                    if marker:
+                        list_kwargs['Marker'] = marker
+                    response = cli.list_objects(**list_kwargs)
+                    contents = response.get("Contents", [])
+                    yield from contents
+                    if not response.get("IsTruncated") or len(contents)==0:
+                        break
+                    marker = contents[-1]['Key']
+            for info in list_obj_scluster():
+                file_path = info['Key']
+                #size = info['Size']
+                if path!="":
+                    afile = file_path[len(path):]
+                    if afile.endswith(".json"):
+                        ret.append(f"s3://{bucket}/{file_path}")
+            return ret
+        except Exception as e:
+            logger.exception(e)
+            exit(-1)
+    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
+        for root, dirs, files in os.walk(dir_path):
+            for file in files:
+                if file.endswith(".json"):
+                    ret.append(join_path(root, file))
+        ret.sort()
+        return ret
+def get_img_s3_client(save_path:str, image_s3_config:str):
+    """
+    """
+    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
+        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
+        img_s3_client = boto3.client(
+            service_name="s3",
+            aws_access_key_id=ak,
+            aws_secret_access_key=sk,
+            endpoint_url=end_point,
+            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
+        )
+    else:
+        img_s3_client = None
+    return img_s3_client
+if __name__=="__main__":
+    # Manual check: list the .json files under a sample S3 prefix and print them
+    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
+    s3_profile = "langchao"
+    ret = list_dir(s3_path, s3_profile)
+    print(ret)
\ No newline at end of file
--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
+"""
+Return the S3 (AK, SK, endpoint) triple that corresponds to a bucket name.
+"""
+import json
+import os
+from loguru import logger
+from magic_pdf.libs.commons import parse_bucket_key
+# Name of the configuration file, defined as a constant
+CONFIG_FILE_NAME = "magic-pdf.json"
+def read_config():
+    home_dir = os.path.expanduser("~")
+    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
+    if not os.path.exists(config_file):
+        raise FileNotFoundError(f"{config_file} not found")
+    with open(config_file, "r", encoding="utf-8") as f:
+        config = json.load(f)
+    return config
+def get_s3_config(bucket_name: str):
+    """
+    ~/magic-pdf.json 读出来
+    """
+    config = read_config()
+    bucket_info = config.get("bucket_info")
+    if bucket_name not in bucket_info:
+        access_key, secret_key, storage_endpoint = bucket_info["[default]"]
+    else:
+        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
+    if access_key is None or secret_key is None or storage_endpoint is None:
+        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
+    # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
+    return access_key, secret_key, storage_endpoint
+def get_s3_config_dict(path: str):
+    access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
+    return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
+def get_bucket_name(path):
+    bucket, key = parse_bucket_key(path)
+    return bucket
+def get_local_models_dir():
+    config = read_config()
+    models_dir = config.get("models-dir")
+    if models_dir is None:
+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
+        return "/tmp/models"
+    else:
+        return models_dir
+def get_device():
+    config = read_config()
+    device = config.get("device-mode")
+    if device is None:
+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
+        return "cpu"
+    else:
+        return device
+def get_table_recog_config():
+    config = read_config()
+    table_config = config.get("table-config")
+    if table_config is None:
+        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
+        return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
+    else:
+        return table_config
+if __name__ == "__main__":
+    # Manual check: look up the settings configured for the "llm-raw" bucket
+    ak, sk, endpoint = get_s3_config("llm-raw")
--- a/magic_pdf/libs/convert_utils.py
+++ b/magic_pdf/libs/convert_utils.py
+def dict_to_list(input_dict):
+    items_list = []
+    for _, item in input_dict.items():
+        items_list.append(item)
+    return items_list
--- a/magic_pdf/libs/coordinate_transform.py
+++ b/magic_pdf/libs/coordinate_transform.py
+def get_scale_ratio(model_page_info, page):
+    pix = page.get_pixmap(dpi=72)
+    pymu_width = int(pix.w)
+    pymu_height = int(pix.h)
+    width_from_json = model_page_info['page_info']['width']
+    height_from_json = model_page_info['page_info']['height']
+    horizontal_scale_ratio = width_from_json / pymu_width
+    vertical_scale_ratio = height_from_json / pymu_height
+    return horizontal_scale_ratio, vertical_scale_ratio
--- a/magic_pdf/libs/detect_language_from_model.py
+++ b/magic_pdf/libs/detect_language_from_model.py
+from collections import Counter
+from magic_pdf.libs.language import detect_lang
+def get_language_from_model(model_list: list):
+    language_lst = []
+    for ocr_page_info in model_list:
+        page_text = ""
+        layout_dets = ocr_page_info["layout_dets"]
+        for layout_det in layout_dets:
+            category_id = layout_det["category_id"]
+            allow_category_id_list = [15]
+            if category_id in allow_category_id_list:
+                page_text += layout_det["text"]
+        page_language = detect_lang(page_text)
+        language_lst.append(page_language)
+    # 统计text_language_list中每种语言的个数
+    count_dict = Counter(language_lst)
+    # 输出text_language_list中出现的次数最多的语言
+    language = max(count_dict, key=count_dict.get)
+    return language
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
+from magic_pdf.libs.commons import fitz  # PyMuPDF
+from magic_pdf.libs.Constants import CROSS_PAGE
+from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
+from magic_pdf.model.magic_model import MagicModel
+def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
+    new_rgb = []
+    for item in rgb_config:
+        item = float(item) / 255
+        new_rgb.append(item)
+    page_data = bbox_list[i]
+    for bbox in page_data:
+        x0, y0, x1, y1 = bbox
+        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+        if fill_config:
+            page.draw_rect(
+                rect_coords,
+                color=None,
+                fill=new_rgb,
+                fill_opacity=0.3,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+        else:
+            page.draw_rect(
+                rect_coords,
+                color=new_rgb,
+                fill=None,
+                fill_opacity=1,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
+    new_rgb = []
+    for item in rgb_config:
+        item = float(item) / 255
+        new_rgb.append(item)
+    page_data = bbox_list[i]
+    for j, bbox in enumerate(page_data):
+        x0, y0, x1, y1 = bbox
+        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+        if fill_config:
+            page.draw_rect(
+                rect_coords,
+                color=None,
+                fill=new_rgb,
+                fill_opacity=0.3,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+        else:
+            page.draw_rect(
+                rect_coords,
+                color=new_rgb,
+                fill=None,
+                fill_opacity=1,
+                width=0.5,
+                overlay=True,
+            )  # Draw the rectangle
+        page.insert_text(
+            (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
+        )  # Insert the index in the top left corner of the rectangle
+def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
+    layout_bbox_list = []
+    dropped_bbox_list = []
+    tables_list, tables_body_list = [], []
+    tables_caption_list, tables_footnote_list = [], []
+    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
+    imgs_footnote_list = []
+    titles_list = []
+    texts_list = []
+    interequations_list = []
+    for page in pdf_info:
+        page_layout_list = []
+        page_dropped_list = []
+        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
+        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
+        titles = []
+        texts = []
+        interequations = []
+        for layout in page['layout_bboxes']:
+            page_layout_list.append(layout['layout_bbox'])
+        layout_bbox_list.append(page_layout_list)
+        for dropped_bbox in page['discarded_blocks']:
+            page_dropped_list.append(dropped_bbox['bbox'])
+        dropped_bbox_list.append(page_dropped_list)
+        for block in page['para_blocks']:
+            bbox = block['bbox']
+            if block['type'] == BlockType.Table:
+                tables.append(bbox)
+                for nested_block in block['blocks']:
+                    bbox = nested_block['bbox']
+                    if nested_block['type'] == BlockType.TableBody:
+                        tables_body.append(bbox)
+                    elif nested_block['type'] == BlockType.TableCaption:
+                        tables_caption.append(bbox)
+                    elif nested_block['type'] == BlockType.TableFootnote:
+                        tables_footnote.append(bbox)
+            elif block['type'] == BlockType.Image:
+                imgs.append(bbox)
+                for nested_block in block['blocks']:
+                    bbox = nested_block['bbox']
+                    if nested_block['type'] == BlockType.ImageBody:
+                        imgs_body.append(bbox)
+                    elif nested_block['type'] == BlockType.ImageCaption:
+                        imgs_caption.append(bbox)
+                    elif nested_block['type'] == BlockType.ImageFootnote:
+                        imgs_footnote.append(bbox)
+            elif block['type'] == BlockType.Title:
+                titles.append(bbox)
+            elif block['type'] == BlockType.Text:
+                texts.append(bbox)
+            elif block['type'] == BlockType.InterlineEquation:
+                interequations.append(bbox)
+        tables_list.append(tables)
+        tables_body_list.append(tables_body)
+        tables_caption_list.append(tables_caption)
+        tables_footnote_list.append(tables_footnote)
+        imgs_list.append(imgs)
+        imgs_body_list.append(imgs_body)
+        imgs_caption_list.append(imgs_caption)
+        imgs_footnote_list.append(imgs_footnote)
+        titles_list.append(titles)
+        texts_list.append(texts)
+        interequations_list.append(interequations)
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    for i, page in enumerate(pdf_docs):
+        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
+        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
+                                 True)
+        draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
+                                 True)  # color !
+        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
+                                 True)
+        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
+                                 True)
+        draw_bbox_without_number(i, tables_footnote_list, page,
+                                 [229, 255, 204], True)
+        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
+        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
+        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
+                                 True)
+        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
+                              True),
+        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
+        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
+        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
+                                 True)
+    # Save the PDF
+    pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
+def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
+    text_list = []
+    inline_equation_list = []
+    interline_equation_list = []
+    image_list = []
+    table_list = []
+    dropped_list = []
+    next_page_text_list = []
+    next_page_inline_equation_list = []
+    def get_span_info(span):
+        if span['type'] == ContentType.Text:
+            if span.get(CROSS_PAGE, False):
+                next_page_text_list.append(span['bbox'])
+            else:
+                page_text_list.append(span['bbox'])
+        elif span['type'] == ContentType.InlineEquation:
+            if span.get(CROSS_PAGE, False):
+                next_page_inline_equation_list.append(span['bbox'])
+            else:
+                page_inline_equation_list.append(span['bbox'])
+        elif span['type'] == ContentType.InterlineEquation:
+            page_interline_equation_list.append(span['bbox'])
+        elif span['type'] == ContentType.Image:
+            page_image_list.append(span['bbox'])
+        elif span['type'] == ContentType.Table:
+            page_table_list.append(span['bbox'])
+    for page in pdf_info:
+        page_text_list = []
+        page_inline_equation_list = []
+        page_interline_equation_list = []
+        page_image_list = []
+        page_table_list = []
+        page_dropped_list = []
+        # 将跨页的span放到移动到下一页的列表中
+        if len(next_page_text_list) > 0:
+            page_text_list.extend(next_page_text_list)
+            next_page_text_list.clear()
+        if len(next_page_inline_equation_list) > 0:
+            page_inline_equation_list.extend(next_page_inline_equation_list)
+            next_page_inline_equation_list.clear()
+        # 构造dropped_list
+        for block in page['discarded_blocks']:
+            if block['type'] == BlockType.Discarded:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        page_dropped_list.append(span['bbox'])
+        dropped_list.append(page_dropped_list)
+        # 构造其余useful_list
+        for block in page['para_blocks']:
+            if block['type'] in [
+                    BlockType.Text,
+                    BlockType.Title,
+                    BlockType.InterlineEquation,
+            ]:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        get_span_info(span)
+            elif block['type'] in [BlockType.Image, BlockType.Table]:
+                for sub_block in block['blocks']:
+                    for line in sub_block['lines']:
+                        for span in line['spans']:
+                            get_span_info(span)
+        text_list.append(page_text_list)
+        inline_equation_list.append(page_inline_equation_list)
+        interline_equation_list.append(page_interline_equation_list)
+        image_list.append(page_image_list)
+        table_list.append(page_table_list)
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    for i, page in enumerate(pdf_docs):
+        # 获取当前页面的数据
+        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
+        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
+                                 False)
+        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
+                                 False)
+        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
+        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
+        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
+    # Save the PDF
+    pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
+def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
+    dropped_bbox_list = []
+    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
+    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
+    titles_list = []
+    texts_list = []
+    interequations_list = []
+    pdf_docs = fitz.open('pdf', pdf_bytes)
+    magic_model = MagicModel(model_list, pdf_docs)
+    for i in range(len(model_list)):
+        page_dropped_list = []
+        tables_body, tables_caption, tables_footnote = [], [], []
+        imgs_body, imgs_caption, imgs_footnote = [], [], []
+        titles = []
+        texts = []
+        interequations = []
+        page_info = magic_model.get_model_list(i)
+        layout_dets = page_info['layout_dets']
+        for layout_det in layout_dets:
+            bbox = layout_det['bbox']
+            if layout_det['category_id'] == CategoryId.Text:
+                texts.append(bbox)
+            elif layout_det['category_id'] == CategoryId.Title:
+                titles.append(bbox)
+            elif layout_det['category_id'] == CategoryId.TableBody:
+                tables_body.append(bbox)
+            elif layout_det['category_id'] == CategoryId.TableCaption:
+                tables_caption.append(bbox)
+            elif layout_det['category_id'] == CategoryId.TableFootnote:
+                tables_footnote.append(bbox)
+            elif layout_det['category_id'] == CategoryId.ImageBody:
+                imgs_body.append(bbox)
+            elif layout_det['category_id'] == CategoryId.ImageCaption:
+                imgs_caption.append(bbox)
+            elif layout_det[
+                    'category_id'] == CategoryId.InterlineEquation_YOLO:
+                interequations.append(bbox)
+            elif layout_det['category_id'] == CategoryId.Abandon:
+                page_dropped_list.append(bbox)
+            elif layout_det['category_id'] == CategoryId.ImageFootnote:
+                imgs_footnote.append(bbox)
+        tables_body_list.append(tables_body)
+        tables_caption_list.append(tables_caption)
+        tables_footnote_list.append(tables_footnote)
+        imgs_body_list.append(imgs_body)
+        imgs_caption_list.append(imgs_caption)
+        titles_list.append(titles)
+        texts_list.append(texts)
+        interequations_list.append(interequations)
+        dropped_bbox_list.append(page_dropped_list)
+        imgs_footnote_list.append(imgs_footnote)
+    for i, page in enumerate(pdf_docs):
+        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
+                              True)  # color !
+        draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
+        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
+                              True)
+        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
+                              True)
+        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
+        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
+                              True)
+        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
+                              True)
+        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
+        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
+        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
+    # Save the PDF
+    pdf_docs.save(f'{out_path}/{filename}_model.pdf')
--- a/magic_pdf/libs/drop_reason.py
+++ b/magic_pdf/libs/drop_reason.py
+class DropReason:
+    """String constants naming the reasons a PDF is dropped from processing."""
+    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # text blocks overlap horizontally, so the reading order cannot be determined accurately
+    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # blocks that must be kept overlap horizontally
+    COMPLICATED_LAYOUT = "complicated_layout" # complicated layout, not supported yet
+    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # layouts with more than 2 columns are not supported yet
+    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # PDF contains coloured blocks, which change the reading order; text blocks with a background colour are not supported yet
+    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # contains special images; too expensive to compute, so dropped
+    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # special SVG graphics; too expensive to compute, so dropped
+    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # computational load exceeds capacity; too costly with the current method
+    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # layout analysis failed
+    Exception = "_exception" # an exception occurred during parsing
+    ENCRYPTED = "encrypted" # the PDF is encrypted
+    EMPTY_PDF = "total_page=0" # the PDF has zero pages
+    NOT_IS_TEXT_PDF = "not_is_text_pdf" # not a text PDF, cannot be parsed directly
+    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # paragraphs cannot be separated clearly
+    TITLE_DETECTION_FAILED = "title_detection_failed" # title detection failed
+    TITLE_LEVEL_FAILED = "title_level_failed" # failed to determine title levels (e.g. level-1, level-2, level-3 headings)
+    PARA_SPLIT_FAILED = "para_split_failed" # paragraph recognition failed
+    PARA_MERGE_FAILED = "para_merge_failed" # paragraph merging failed
+    NOT_ALLOW_LANGUAGE = "not_allow_language" # unsupported language
+    SPECIAL_PDF = "special_pdf"
+    PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # text columns cannot be determined precisely
+    CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # the page layout cannot be analysed
+    NEGATIVE_BBOX_AREA = "negative_bbox_area" # scaling produced a bbox with negative area
+    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # overlapping blocks cannot be separated
\ No newline at end of file
--- a/magic_pdf/libs/drop_tag.py
+++ b/magic_pdf/libs/drop_tag.py
+# Module-level drop-tag string constants.
+COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
+PAGE_NO = "page-no" # page number
+CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # text inside the header/footer area
+VERTICAL_TEXT = 'vertical-text' # vertical text
+ROTATE_TEXT = 'rotate-text' # rotated text
+EMPTY_SIDE_BLOCK = 'empty-side-block' # blank block at the page edge with no content
+ON_IMAGE_TEXT = 'on-image-text' # text lying on an image
+ON_TABLE_TEXT = 'on-table-text' # text lying on a table
+class DropTag:
+    """String constants used as drop tags."""
+    PAGE_NUMBER = "page_no"
+    HEADER = "header"
+    FOOTER = "footer"
+    FOOTNOTE = "footnote"
+    NOT_IN_LAYOUT = "not_in_layout"
+    SPAN_OVERLAP = "span_overlap"
+    BLOCK_OVERLAP = "block_overlap"
--- a/magic_pdf/libs/hash_utils.py
+++ b/magic_pdf/libs/hash_utils.py
+import hashlib
+def compute_md5(file_bytes):
+    hasher = hashlib.md5()
+    hasher.update(file_bytes)
+    return hasher.hexdigest().upper()
+def compute_sha256(input_string):
+    hasher = hashlib.sha256()
+    # 在Python3中，需要将字符串转化为字节对象才能被哈希函数处理
+    input_bytes = input_string.encode('utf-8')
+    hasher.update(input_bytes)
+    return hasher.hexdigest()
--- a/magic_pdf/libs/json_compressor.py
+++ b/magic_pdf/libs/json_compressor.py
+import json
+import brotli
+import base64
+class JsonCompressor:
+    @staticmethod
+    def compress_json(data):
+        """
+        Compress a json object and encode it with base64
+        """
+        json_str = json.dumps(data)
+        json_bytes = json_str.encode('utf-8')
+        compressed = brotli.compress(json_bytes, quality=6)
+        compressed_str = base64.b64encode(compressed).decode('utf-8')  # convert bytes to string
+        return compressed_str
+    @staticmethod
+    def decompress_json(compressed_str):
+        """
+        Decode the base64 string and decompress the json object
+        """
+        compressed = base64.b64decode(compressed_str.encode('utf-8'))  # convert string to bytes
+        decompressed_bytes = brotli.decompress(compressed)
+        json_str = decompressed_bytes.decode('utf-8')
+        data = json.loads(json_str)
+        return data
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
+import os
+import unicodedata
+# When FTLANG_CACHE is not already set, point it at the bundled
+# <package root>/resources/fasttext-langdetect directory.  This runs before
+# the fast_langdetect import that follows.
+if not os.getenv("FTLANG_CACHE"):
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)
+    # parent of the directory containing this file
+    root_dir = os.path.dirname(current_dir)
+    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
+    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
+    # print(os.getenv("FTLANG_CACHE"))
+from fast_langdetect import detect_language
+def detect_lang(text: str) -> str:
+    if len(text) == 0:
+        return ""
+    try:
+        lang_upper = detect_language(text)
+    except:
+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
+        lang_upper = detect_language(html_no_ctrl_chars)
+    try:
+        lang = lang_upper.lower()
+    except:
+        lang = ""
+    return lang
+if __name__ == '__main__':
+    print(os.getenv("FTLANG_CACHE"))
+    print(detect_lang("This is a test."))
+    print(detect_lang("<html>This is a test</html>"))
+    print(detect_lang("这个是中文测试。"))
+    print(detect_lang("<html>这个是中文测试。</html>"))
--- a/magic_pdf/libs/local_math.py
+++ b/magic_pdf/libs/local_math.py
+def float_gt(a, b):
+    if 0.0001 >= abs(a -b):
+        return False
+    return a > b
+def float_equal(a, b):
+    if 0.0001 >= abs(a-b):
+        return True
+    return False
\ No newline at end of file
--- a/magic_pdf/libs/markdown_utils.py
+++ b/magic_pdf/libs/markdown_utils.py
+import re
+def escape_special_markdown_char(pymu_blocks):
+    """
+    转义正文里对markdown语法有特殊意义的字符
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for blk in pymu_blocks:
+        for line in blk['lines']:
+            for span in line['spans']:
+                for char in special_chars:
+                    span_text = span['text']
+                    span_type = span.get("_type", None)
+                    if span_type in ['inline-equation', 'interline-equation']:
+                        continue
+                    elif span_text:
+                        span['text'] = span['text'].replace(char, "\\" + char)
+    return pymu_blocks
+def ocr_escape_special_markdown_char(content):
+    """
+    转义正文里对markdown语法有特殊意义的字符
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for char in special_chars:
+        content = content.replace(char, "\\" + char)
+    return content
--- a/magic_pdf/libs/nlp_utils.py
+++ b/magic_pdf/libs/nlp_utils.py
+import re
+from os import path
+from collections import Counter
+from loguru import logger
+# from langdetect import detect
+import spacy
+import en_core_web_sm
+import zh_core_web_sm
+from magic_pdf.libs.language import detect_lang
+class NLPModels:
+    """
+    How to upload local models to s3:
+        - config aws cli:
+            doc\SETUP-CLI.md
+            doc\setup_cli.sh
+            app\config\__init__.py
+        - $ cd {local_dir_storing_models}
+        - $ ls models
+            en_core_web_sm-3.7.1/
+            zh_core_web_sm-3.7.0/
+        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
+        - $ aws s3 --profile=p_project_norm ls  s3://llm-infra/models/
+            PRE en_core_web_sm-3.7.1/
+            PRE zh_core_web_sm-3.7.0/
+    """
+    def __init__(self):
+        # if OS is windows, set "TMP_DIR" to "D:/tmp"
+        home_dir = path.expanduser("~")
+        self.default_local_path = path.join(home_dir, ".nlp_models")
+        self.default_shared_path = "/share/pdf_processor/nlp_models"
+        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
+        self.default_s3_path = "s3://llm-infra/models"
+        self.nlp_models = self.nlp_models = {
+            "en_core_web_sm": {
+                "type": "spacy",
+                "version": "3.7.1",
+            },
+            "en_core_web_md": {
+                "type": "spacy",
+                "version": "3.7.1",
+            },
+            "en_core_web_lg": {
+                "type": "spacy",
+                "version": "3.7.1",
+            },
+            "zh_core_web_sm": {
+                "type": "spacy",
+                "version": "3.7.0",
+            },
+            "zh_core_web_md": {
+                "type": "spacy",
+                "version": "3.7.0",
+            },
+            "zh_core_web_lg": {
+                "type": "spacy",
+                "version": "3.7.0",
+            },
+        }
+        self.en_core_web_sm_model = en_core_web_sm.load()
+        self.zh_core_web_sm_model = zh_core_web_sm.load()
+    def load_model(self, model_name, model_type, model_version):
+        if (
+            model_name in self.nlp_models
+            and self.nlp_models[model_name]["type"] == model_type
+            and self.nlp_models[model_name]["version"] == model_version
+        ):
+            return spacy.load(model_name) if spacy.util.is_package(model_name) else None
+        else:
+            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
+            return None
+    def detect_language(self, text, use_langdetect=False):
+        if len(text) == 0:
+            return None
+        if use_langdetect:
+            # print("use_langdetect")
+            # print(detect_lang(text))
+            # return detect_lang(text)
+            if detect_lang(text) == "zh":
+                return "zh"
+            else:
+                return "en"
+        if not use_langdetect:
+            en_count = len(re.findall(r"[a-zA-Z]", text))
+            cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
+            if en_count > cn_count:
+                return "en"
+            if cn_count > en_count:
+                return "zh"
+    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
+        """
+        Detect entity categories using NLP models and return the most frequent entity types.
+        Parameters
+        ----------
+        text : str
+            Text to be processed.
+        Returns
+        -------
+        str
+            The most frequent entity type.
+        """
+        lang = self.detect_language(text, use_langdetect=True)
+        if lang == "en":
+            nlp_model = self.en_core_web_sm_model
+        elif lang == "zh":
+            nlp_model = self.zh_core_web_sm_model
+        else:
+            # logger.error(f"Unsupported language: {lang}")
+            return {}
+        # Splitting text into smaller parts
+        text_parts = re.split(r"[,;，；、\s & |]+", text)
+        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
+        text_combined = " ".join(text_parts)
+        try:
+            doc = nlp_model(text_combined)
+            entity_counts = Counter([ent.label_ for ent in doc.ents])
+            word_counts_in_entities = Counter()
+            for ent in doc.ents:
+                word_counts_in_entities[ent.label_] += len(ent.text.split())
+            total_words_in_entities = sum(word_counts_in_entities.values())
+            total_words = len([token for token in doc if not token.is_punct])
+            if total_words_in_entities == 0 or total_words == 0:
+                return None
+            entity_percentage = total_words_in_entities / total_words
+            if entity_percentage < 0.5:
+                return None
+            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
+            entity_percentage = word_count / total_words_in_entities
+            if entity_percentage >= threshold:
+                return most_common_entity
+            else:
+                return None
+        except Exception as e:
+            logger.error(f"Error in entity detection: {e}")
+            return None
+def __main__():
+    nlpModel = NLPModels()
+    test_strings = [
+        "张三",
+        "张三, 李四，王五; 赵六",
+        "John Doe",
+        "Jane Smith",
+        "Lee, John",
+        "John Doe, Jane Smith; Alice Johnson，Bob Lee",
+        "孙七, Michael Jordan；赵八",
+        "David Smith  Michael O'Connor; Kevin ßáçøñ",
+        "李雷·韩梅梅, 张三·李四",
+        "Charles Robert Darwin, Isaac Newton",
+        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
+        "John Doe, Jane Smith; Alice Johnson",
+        "张三, 李四，王五; 赵六",
+        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
+        "Rachel Mills  &  William Barry  &  Susanne B. Haga",
+        "Claire Chabut* and Jean-François Bussières",
+        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
+        "Changchun",
+        "china",
+        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
+        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
+        "Synergistic Effect of Supported Nickel Catalyst with",
+        "Intumescent Flame-Retardants on Flame Retardancy",
+        "and Thermal Stability of Polypropylene",
+    ]
+    for test in test_strings:
+        print()
+        print(f"Original String: {test}")
+        result = nlpModel.detect_entity_catgr_using_nlp(test)
+        print(f"Detected entities: {result}")
+if __name__ == "__main__":
+    __main__()
--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
+class ContentType:
+    """String values compared against a span's ``type`` field."""
+    Image = 'image'
+    Table = 'table'
+    Text = 'text'
+    InlineEquation = 'inline_equation'
+    InterlineEquation = 'interline_equation'
+class BlockType:
+    """String values compared against a block's ``type`` field."""
+    Image = 'image'
+    ImageBody = 'image_body'
+    ImageCaption = 'image_caption'
+    ImageFootnote = 'image_footnote'
+    Table = 'table'
+    TableBody = 'table_body'
+    TableCaption = 'table_caption'
+    TableFootnote = 'table_footnote'
+    Text = 'text'
+    Title = 'title'
+    InterlineEquation = 'interline_equation'
+    Footnote = 'footnote'
+    Discarded = 'discarded'
+class CategoryId:
+    """Integer ids compared against the ``category_id`` of a model layout detection."""
+    Title = 0
+    Text = 1
+    Abandon = 2
+    ImageBody = 3
+    ImageCaption = 4
+    TableBody = 5
+    TableCaption = 6
+    TableFootnote = 7
+    InterlineEquation_Layout = 8
+    InlineEquation = 13
+    InterlineEquation_YOLO = 14
+    OcrText = 15
+    ImageFootnote = 101
--- a/magic_pdf/libs/path_utils.py
+++ b/magic_pdf/libs/path_utils.py
+def remove_non_official_s3_args(s3path):
+    """
+    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
+    """
+    arr = s3path.split("?")
+    return arr[0]
+def parse_s3path(s3path: str):
+    # from s3pathlib import S3Path
+    # p = S3Path(remove_non_official_s3_args(s3path))
+    # return p.bucket, p.key
+    s3path = remove_non_official_s3_args(s3path).strip()
+    if s3path.startswith(('s3://', 's3a://')):
+        prefix, path = s3path.split('://', 1)
+        bucket_name, key = path.split('/', 1)
+        return bucket_name, key
+    elif s3path.startswith('/'):
+        raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
+    else:
+        raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
+def parse_s3_range_params(s3path: str):
+    """
+    example: s3://abc/xxxx.json?bytes=0,81350 ==> [0, 81350]
+    """
+    arr = s3path.split("?bytes=")
+    if len(arr) == 1:
+        return None
+    return arr[1].split(",")
--- a/magic_pdf/libs/pdf_check.py
+++ b/magic_pdf/libs/pdf_check.py
+from io import BytesIO
+import re
+import fitz
+import numpy as np
+from loguru import logger
+from pdfminer.high_level import extract_text
+def calculate_sample_count(total_page: int):
+    """
+    根据总页数和采样率计算采样页面的数量。
+    """
+    select_page_cnt = min(10, total_page)
+    return select_page_cnt
+def extract_pages(src_pdf_bytes: bytes):
+    pdf_docs = fitz.open("pdf", src_pdf_bytes)
+    total_page = len(pdf_docs)
+    if total_page == 0:
+        # 如果PDF没有页面，直接返回空文档
+        logger.warning("PDF is empty, return empty document")
+        return fitz.Document()
+    select_page_cnt = calculate_sample_count(total_page)
+    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
+    sample_docs = fitz.Document()
+    try:
+        for index in page_num:
+            sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
+    except Exception as e:
+        logger.exception(e)
+    return sample_docs
+def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+    """"
+    检测PDF中是否包含非法字符
+    """
+    '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
+    sample_docs = extract_pages(src_pdf_bytes)
+    sample_pdf_bytes = sample_docs.tobytes()
+    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
+    text = extract_text(sample_pdf_file_like_object)
+    text = text.replace("\n", "")
+    # logger.info(text)
+    '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
+    cid_pattern = re.compile(r'\(cid:\d+\)')
+    matches = cid_pattern.findall(text)
+    cid_count = len(matches)
+    cid_len = sum(len(match) for match in matches)
+    text_len = len(text)
+    if text_len == 0:
+        cid_chars_radio = 0
+    else:
+        cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
+    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
+    '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
+    if cid_chars_radio > 0.05:
+        return False  # 乱码文档
+    else:
+        return True   # 正常文档
--- a/magic_pdf/libs/pdf_image_tools.py
+++ b/magic_pdf/libs/pdf_image_tools.py
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
+from magic_pdf.libs.commons import fitz
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.hash_utils import compute_sha256
+def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
+    """
+    从第page_num页的page中，根据bbox进行裁剪出一张jpg图片，返回图片路径
+    save_path：需要同时支持s3和本地, 图片存放在save_path下，文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
+    """
+    # 拼接文件名
+    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
+    # 老版本返回不带bucket的路径
+    img_path = join_path(return_path, filename) if return_path is not None else None
+    # 新版本生成平铺路径
+    img_hash256_path = f"{compute_sha256(img_path)}.jpg"
+    # 将坐标转换为fitz.Rect对象
+    rect = fitz.Rect(*bbox)
+    # 配置缩放倍数为3倍
+    zoom = fitz.Matrix(3, 3)
+    # 截取图片
+    pix = page.get_pixmap(clip=rect, matrix=zoom)
+    byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
+    imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
+    return img_hash256_path
--- a/magic_pdf/libs/safe_filename.py
+++ b/magic_pdf/libs/safe_filename.py
+import os
+def sanitize_filename(filename, replacement="_"):
+    if os.name == 'nt':
+        invalid_chars = '<>:"|?*'
+        for char in invalid_chars:
+            filename = filename.replace(char, replacement)
+    return filename
--- a/magic_pdf/libs/textbase.py
+++ b/magic_pdf/libs/textbase.py
+import math
+def __inc_dict_val(mp, key, val_inc:int):
+    if mp.get(key):
+        mp[key] = mp[key] + val_inc
+    else:
+        mp[key] = val_inc
+def get_text_block_base_info(block):
+    """
+    获取这个文本块里的字体的颜色、字号、字体
+    按照正文字数最多的返回
+    """
+    counter = {}
+    for line in block['lines']:
+        for span in line['spans']:
+            color = span['color']
+            size = round(span['size'], 2)
+            font = span['font']
+            txt_len = len(span['text'])
+            __inc_dict_val(counter, (color, size, font), txt_len)
+    c, s, ft = max(counter, key=counter.get)
+    return c, s, ft
\ No newline at end of file