Commit fe58649b authored by liukaiwen

Merge branch 'master' of github.com:papayalove/Magic-PDF

parents d876cbe8 e9843e15
@@ -17,7 +17,7 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
@@ -45,7 +45,7 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, mode, debug_mode=
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
@@ -78,7 +78,7 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, de
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
@@ -108,7 +108,7 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
@@ -137,7 +137,7 @@ def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) ->
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
@@ -165,7 +165,7 @@ def ocr_pdf_intermediate_dict_to_standard_format_with_para(jso: dict, debug_mode
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
@@ -221,7 +221,7 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
# Dedicated to re-running pdfs that were dropped; after the run the need_drop flag must be reset to false
def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
if not jso.get("need_drop", False):
if not jso.get("_need_drop", False):
return jso
else:
try:
@@ -233,7 +233,7 @@ def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
)
jso["pdf_intermediate_dict"] = JsonCompressor.compress_json(pdf_info_dict)
jso["parse_time"] = parse_time
jso["need_drop"] = False
jso["_need_drop"] = False
except Exception as e:
jso = exception_handler(jso, e)
return jso
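Every hunk above applies the same guard, only with the bookkeeping key renamed from need_drop to _need_drop. For reference, a minimal sketch of that convention; the stage name and body below are hypothetical, only the _need_drop / dropped keys come from this commit:

def hypothetical_stage(jso: dict, debug_mode: bool = False) -> dict:
    if not debug_mode and jso.get("_need_drop", False):
        # the record was flagged upstream: mark it as dropped and skip the real work
        jso["dropped"] = True
        return jso
    # ... the stage's real processing would happen here ...
    return jso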
@@ -244,7 +244,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
return jso
try:
pdf_bytes = get_pdf_bytes(jso)
@@ -18,7 +18,7 @@ def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop")
jso["dropped"] = True
@@ -46,7 +46,7 @@ def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
if debug_mode:
pass
else:  # if debug mode is off, check whether the need_drop flag is set
if jso.get("need_drop", False):
if jso.get("_need_drop", False):
book_name = join_path(get_data_source(jso), jso["file_id"])
logger.info(f"book_name is:{book_name} need drop")
jso["dropped"] = True
@@ -62,6 +62,6 @@ def pdf_post_filter(page_info) -> tuple:
"""
bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
if bool_is_pseudo_single_column:
return False, {"need_drop": True, "drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
return True, None
\ No newline at end of file
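For reference, the filter functions touched in this commit follow a (keep, info) return contract: info carries the underscore-prefixed drop keys only when keep is False. A self-contained stand-in of that shape (not the real __is_pseudo_single_column check; the drop reason is a plain string here instead of the DropReason constant):

def example_post_filter(is_pseudo_single_column: bool):
    # mirrors the shape of pdf_post_filter's return values
    if is_pseudo_single_column:
        return False, {"_need_drop": True, "_drop_reason": "pseudo_single_column", "extra_info": {}}
    return True, None

print(example_post_filter(True))   # (False, {'_need_drop': True, ...})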
@@ -3,7 +3,7 @@ from magic_pdf.libs.commons import fitz  # pyMuPDF library
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
"""
:param page_ID: int, the index of the current page within the pdf document.
:param page: the content of the current page as read by fitz
@@ -3,18 +3,16 @@ from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
def s3_return_path(type):
return join_path(book_name, type)
def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
def img_save_path(type):
return join_path(save_path, s3_return_path(type))
def return_path(type):
return join_path(pdf_bytes_md5, type)
for span in spans:
span_type = span['type']
if span_type == ContentType.Image:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), imageWriter=imageWriter)
elif span_type == ContentType.Table:
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), imageWriter=imageWriter)
return spans
@@ -68,7 +68,7 @@ def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple
"""
if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
return False, {"need_drop": True, "drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
return True, None
\ No newline at end of file
from loguru import logger
from magic_pdf.dict2md.mkcontent import mk_universal_format
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.libs.detect_language_from_model import get_language_from_model
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.spark.spark_api import parse_union_pdf, parse_ocr_pdf
class UNIPipe:
def __init__(self):
pass
def classify(self, pdf_bytes: bytes) -> str:
"""
Use the pdf metadata to decide whether this is a text pdf or an ocr pdf
"""
pdf_meta = pdf_meta_scan(pdf_bytes)
if pdf_meta.get("_need_drop", False):  # if the scan flags the pdf for dropping, raise an exception
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
else:
is_encrypted = pdf_meta["is_encrypted"]
is_needs_password = pdf_meta["is_needs_password"]
if is_encrypted or is_needs_password:  # encrypted, password-protected, or page-less pdfs are all skipped
raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")
else:
is_text_pdf, results = classify(
pdf_meta["total_page"],
pdf_meta["page_width_pts"],
pdf_meta["page_height_pts"],
pdf_meta["image_info_per_page"],
pdf_meta["text_len_per_page"],
pdf_meta["imgs_per_page"],
pdf_meta["text_layout_per_page"],
)
if is_text_pdf:
return "txt"
else:
return "ocr"
def parse(self, pdf_bytes: bytes, image_writer, jso_useful_key) -> dict:
"""
Parse the pdf according to its type
"""
text_language = get_language_from_model(jso_useful_key['model_list'])
allow_language = ["zh", "en"]  # allowed languages; currently only Simplified Chinese and English
logger.info(f"pdf text_language is {text_language}")
if text_language not in allow_language:  # drop the pdf if its language is not in the allowed set
raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.NOT_ALLOW_LANGUAGE}")
else:
if jso_useful_key['_pdf_type'] == "txt":
pdf_mid_data = parse_union_pdf(pdf_bytes, jso_useful_key['model_list'], image_writer)
elif jso_useful_key['_pdf_type'] == "ocr":
pdf_mid_data = parse_ocr_pdf(pdf_bytes, jso_useful_key['model_list'], image_writer)
else:
raise Exception(f"pdf type is not txt or ocr")
return JsonCompressor.compress(pdf_mid_data)
def mk_uni_format(self, pdf_mid_data: str, img_buket_path: str) -> list:
"""
Generate the unified-format content_list according to the parse type
"""
pdf_mid_data = JsonCompressor.decompress_json(pdf_mid_data)
parse_type = pdf_mid_data["_parse_type"]
if parse_type == "txt":
content_list = mk_universal_format(pdf_mid_data, img_buket_path)
elif parse_type == "ocr":
content_list = make_standard_format_with_para(pdf_mid_data, img_buket_path)
return content_list
if __name__ == '__main__':
# quick test
pipe = UNIPipe()
pdf_bytes = open(r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\数学新星网\edu_00001544.pdf",
"rb").read()
pdf_type = pipe.classify(pdf_bytes)
logger.info(f"pdf_type is {pdf_type}")
@@ -26,9 +26,9 @@ def get_bookid(jso: dict):
def exception_handler(jso: dict, e):
logger.exception(e)
jso["need_drop"] = True
jso["drop_reason"] = DropReason.Exception
jso["exception"] = f"ERROR: {e}"
jso["_need_drop"] = True
jso["_drop_reason"] = DropReason.Exception
jso["_exception"] = f"ERROR: {e}"
return jso
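For reference, a hedged sketch of how exception_handler is consumed: a stage that raises records the failure on the json record instead of aborting the job. The caller below is hypothetical and assumes the module's own imports (loguru logger, DropReason) are available:

def hypothetical_stage(jso: dict) -> dict:
    try:
        raise ValueError("demo failure")   # stand-in for a real parsing error
    except Exception as e:
        jso = exception_handler(jso, e)    # sets _need_drop, _drop_reason and _exception
    return jso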
@@ -12,27 +12,86 @@
Everything else, such as constructing the s3 client and obtaining the ak/sk, is implemented in code-clean. Do not introduce reverse dependencies!!!
"""
from loguru import logger
from magic_pdf.io import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
Parse a text-type pdf
"""
pass
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["parse_type"] = "txt"
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
Parse an ocr-type pdf
"""
pass
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = "ocr"
return pdf_info_dict
def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
"""
Fully parse pdfs that mix ocr and text content
"""
pass
\ No newline at end of file
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page,
debug_mode=is_debug,
)
except Exception as e:
logger.error(f"{method.__name__} error: {e}")
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = "ocr"
else:
pdf_info_dict["_parse_type"] = "txt"
return pdf_info_dict
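The txt-then-ocr fallback above is the core of parse_union_pdf. A minimal, self-contained sketch of the same pattern with stand-in parsers (the two fake functions below are illustrations, not the real magic_pdf parsers):

from loguru import logger

def fake_txt_parser():
    return {"_need_drop": True}            # simulate a text parse that asks to drop

def fake_ocr_parser():
    return {"pages": []}

def union_parse():
    result = fake_txt_parser()
    if result is None or result.get("_need_drop", False):
        logger.warning("txt parse dropped or failed, falling back to ocr")
        result = fake_ocr_parser()
        result["_parse_type"] = "ocr"
    else:
        result["_parse_type"] = "txt"
    return result

print(union_parse())                       # {'pages': [], '_parse_type': 'ocr'}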
def spark_json_extractor(jso: dict) -> dict:
"""
Extract the needed fields from the json and return them as a dict
"""
return {
"_pdf_type": jso["_pdf_type"],
"model_list": jso["doc_layout_result"],
}
@@ -15,4 +15,6 @@ wordninja>=2.0.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
scikit-learn==1.4.1.post1
nltk==3.8.1
\ No newline at end of file
nltk==3.8.1
s3pathlib>=2.1.1
{
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1_score": 1.0,
"pdf间的平均编辑距离": 19.82051282051282,
"pdf间的平均bleu": 0.9002485609584511,
"阅读顺序编辑距离": 0.3176895306859206,
"分段准确率": 0.8989169675090253,
"行内公式准确率": {
"accuracy": 0.9782741738066095,
"precision": 0.9782741738066095,
"recall": 1.0,
"f1_score": 0.9890177880897139
},
"行内公式编辑距离": 0.0,
"行内公式bleu": 0.20340450120213166,
"行间公式准确率": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1_score": 1.0
},
"行间公式编辑距离": 0.0,
"行间公式bleu": 0.3662262622386575,
"丢弃文本准确率": {
"accuracy": 0.867870036101083,
"precision": 0.9064856711915535,
"recall": 0.9532117367168914,
"f1_score": 0.9292616930807885
},
"丢弃文本标签准确率": {
"color_background_header_txt_block": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 41.0
},
"rotate": {
"precision": 1.0,
"recall": 0.9682539682539683,
"f1-score": 0.9838709677419355,
"support": 63.0
},
"footnote": {
"precision": 1.0,
"recall": 0.883495145631068,
"f1-score": 0.9381443298969072,
"support": 103.0
},
"header": {
"precision": 1.0,
"recall": 1.0,
"f1-score": 1.0,
"support": 4.0
},
"on-image": {
"precision": 0.9947643979057592,
"recall": 1.0,
"f1-score": 0.9973753280839895,
"support": 380.0
},
"on-table": {
"precision": 1.0,
"recall": 0.9443609022556391,
"f1-score": 0.97138437741686,
"support": 665.0
},
"micro avg": {
"precision": 0.9982847341337907,
"recall": 0.9267515923566879,
"f1-score": 0.9611890999174236,
"support": 1256.0
}
},
"丢弃图片准确率": {
"accuracy": 0.8666666666666667,
"precision": 0.9285714285714286,
"recall": 0.9285714285714286,
"f1_score": 0.9285714285714286
},
"丢弃表格准确率": {
"accuracy": 0,
"precision": 0,
"recall": 0,
"f1_score": 0
}
}
\ No newline at end of file
@@ -432,75 +432,8 @@ def handle_multi_deletion(test_page, test_page_tag, test_page_bbox, standard_pag
def check_json_files_in_zip_exist(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
"""
Check whether the specified JSON files exist in the ZIP archive
"""
with zipfile.ZipFile(zip_file_path, 'r') as z:
# list every file in the ZIP archive
all_files_in_zip = z.namelist()
# make sure both the standard file and the test file are present in the ZIP
if standard_json_path_in_zip not in all_files_in_zip or test_json_path_in_zip not in all_files_in_zip:
raise FileNotFoundError("One or both of the required JSON files are missing from the ZIP archive.")
def read_json_files_from_streams(standard_file_stream, test_file_stream):
"""
Read the JSON content from the file streams
"""
pdf_json_standard = [json.loads(line) for line in standard_file_stream]
pdf_json_test = [json.loads(line) for line in test_file_stream]
json_standard_origin = pd.DataFrame(pdf_json_standard)
json_test_origin = pd.DataFrame(pdf_json_test)
return json_standard_origin, json_test_origin
def read_json_files_from_zip(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
"""
Read the two JSON files from the ZIP archive and return them as DataFrames
"""
with zipfile.ZipFile(zip_file_path, 'r') as z:
with z.open(standard_json_path_in_zip) as standard_file_stream, \
z.open(test_json_path_in_zip) as test_file_stream:
standard_file_text_stream = TextIOWrapper(standard_file_stream, encoding='utf-8')
test_file_text_stream = TextIOWrapper(test_file_stream, encoding='utf-8')
json_standard_origin, json_test_origin = read_json_files_from_streams(
standard_file_text_stream, test_file_text_stream
)
return json_standard_origin, json_test_origin
def merge_json_data(json_test_df, json_standard_df):
"""
Merge the test and standard datasets on id and return the merged data together with existence checks.
Parameters:
- json_test_df: DataFrame of the test data.
- json_standard_df: DataFrame of the standard data.
Returns:
- inner_merge: inner-joined DataFrame containing the matched rows.
- standard_exist: Series indicating which standard entries exist.
- test_exist: Series indicating which test entries exist.
"""
test_data = json_test_df[['id', 'mid_json']].drop_duplicates(subset='id', keep='first').reset_index(drop=True)
standard_data = json_standard_df[['id', 'mid_json', 'pass_label']].drop_duplicates(subset='id', keep='first').reset_index(drop=True)
outer_merge = pd.merge(test_data, standard_data, on='id', how='outer')
outer_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']
standard_exist = outer_merge.standard_mid_json.notnull()
test_exist = outer_merge.test_mid_json.notnull()
inner_merge = pd.merge(test_data, standard_data, on='id', how='inner')
inner_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']
return inner_merge, standard_exist, test_exist
def consolidate_data(test_data, standard_data, key_path):
"""
@@ -533,6 +466,20 @@ def consolidate_data(test_data, standard_data, key_path):
return overall_data_standard, overall_data_test
def overall_calculate_metrics(inner_merge, json_test, json_standard,standard_exist, test_exist):
"""
Compute the overall metrics, including accuracy, precision, recall, F1, average edit distance, average BLEU, paragraph-segmentation accuracy, equation accuracy, equation edit distance, equation BLEU, dropped-text accuracy, dropped-text label accuracy, dropped-image accuracy, dropped-table accuracy, and so on.
Args:
inner_merge (dict): merge information, including pass_label, id and related fields.
json_test (dict): json data of the test set.
json_standard (dict): json data of the standard set.
standard_exist (list): ids present in the standard set.
test_exist (list): ids present in the test set.
Returns:
dict: a dictionary of the overall metric values.
"""
process_data_standard = process_equations_and_blocks(json_standard, is_standard=True)
process_data_test = process_equations_and_blocks(json_test, is_standard=False)
@@ -739,9 +686,77 @@ def calculate_metrics(inner_merge, json_test, json_standard, json_standard_origi
return result_dict
def check_json_files_in_zip_exist(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
"""
Check whether the specified JSON files exist in the ZIP archive
"""
with zipfile.ZipFile(zip_file_path, 'r') as z:
# list every file in the ZIP archive
all_files_in_zip = z.namelist()
# make sure both the standard file and the test file are present in the ZIP
if standard_json_path_in_zip not in all_files_in_zip or test_json_path_in_zip not in all_files_in_zip:
raise FileNotFoundError("One or both of the required JSON files are missing from the ZIP archive.")
def read_json_files_from_streams(standard_file_stream, test_file_stream):
"""
Read the JSON content from the file streams
"""
pdf_json_standard = [json.loads(line) for line in standard_file_stream]
pdf_json_test = [json.loads(line) for line in test_file_stream]
json_standard_origin = pd.DataFrame(pdf_json_standard)
json_test_origin = pd.DataFrame(pdf_json_test)
return json_standard_origin, json_test_origin
def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
def read_json_files_from_zip(zip_file_path, standard_json_path_in_zip, test_json_path_in_zip):
"""
Read the two JSON files from the ZIP archive and return them as DataFrames
"""
with zipfile.ZipFile(zip_file_path, 'r') as z:
with z.open(standard_json_path_in_zip) as standard_file_stream, \
z.open(test_json_path_in_zip) as test_file_stream:
standard_file_text_stream = TextIOWrapper(standard_file_stream, encoding='utf-8')
test_file_text_stream = TextIOWrapper(test_file_stream, encoding='utf-8')
json_standard_origin, json_test_origin = read_json_files_from_streams(
standard_file_text_stream, test_file_text_stream
)
return json_standard_origin, json_test_origin
def merge_json_data(json_test_df, json_standard_df):
"""
Merge the test and standard datasets on id and return the merged data together with existence checks.
Parameters:
- json_test_df: DataFrame of the test data.
- json_standard_df: DataFrame of the standard data.
Returns:
- inner_merge: inner-joined DataFrame containing the matched rows.
- standard_exist: Series indicating which standard entries exist.
- test_exist: Series indicating which test entries exist.
"""
test_data = json_test_df[['id', 'mid_json']].drop_duplicates(subset='id', keep='first').reset_index(drop=True)
standard_data = json_standard_df[['id', 'mid_json', 'pass_label']].drop_duplicates(subset='id', keep='first').reset_index(drop=True)
outer_merge = pd.merge(test_data, standard_data, on='id', how='outer')
outer_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']
standard_exist = outer_merge.standard_mid_json.notnull()
test_exist = outer_merge.test_mid_json.notnull()
inner_merge = pd.merge(test_data, standard_data, on='id', how='inner')
inner_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']
return inner_merge, standard_exist, test_exist
def save_results(result_dict,overall_report_dict,badcase_path,overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url):
"""
Save the result dictionaries as JSON files at the given paths.
@@ -749,35 +764,46 @@ def save_results(result_dict,overall_report_dict,badcase_path,overall_path,):
- result_dict: dictionary containing the computed results.
- overall_path: path, including the file name, where the results are saved.
"""
with open(overall_path, 'w', encoding='utf-8') as f:
# serialize the result dictionary as JSON and write it to the file
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
final_overall_path = upload_to_s3(overall_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
overall_path_res = "OCR抽取方案整体评测指标结果请查看:" + final_overall_path
print(f'\033[31m{overall_path_res}\033[0m')
# open the target file for writing
with open(badcase_path, 'w', encoding='utf-8') as f:
# serialize the result dictionary as JSON and write it to the file
json.dump(result_dict, f, ensure_ascii=False, indent=4)
final_badcase_path = upload_to_s3(badcase_path, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
badcase_path_res = "OCR抽取方案评测badcase输出报告查看:" + final_badcase_path
print(f'\033[31m{badcase_path_res}\033[0m')
print(f"计算结果已经保存到文件:{badcase_path}")
with open(overall_path, 'w', encoding='utf-8') as f:
# serialize the result dictionary as JSON and write it to the file
json.dump(overall_report_dict, f, ensure_ascii=False, indent=4)
print(f"计算结果已经保存到文件:{overall_path}")
def upload_to_s3(file_path, bucket_name, s3_file_name,AWS_ACCESS_KEY,AWS_SECRET_KEY,END_POINT_URL):
def upload_to_s3(file_path, bucket_name, s3_directory, AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL):
"""
Upload a file to Amazon S3
"""
s3 = boto3.client('s3',aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY,endpoint_url=END_POINT_URL)
# create the S3 client
s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY, endpoint_url=END_POINT_URL)
try:
# extract the file name from the file path
file_name = os.path.basename(file_path)
# build the S3 object key by joining s3_directory and file_name
s3_object_key = f"{s3_directory}/{file_name}"  # joined directly with a slash
# upload the file to S3
s3.upload_file(file_path, bucket_name, s3_file_name)
print(f"文件 {s3_file_name} 成功上传到S3存储桶 {bucket_name} 中的路径 {file_path}")
s3.upload_file(file_path, bucket_name, s3_object_key)
s3_path = f"http://st.bigdata.shlab.tech/S3_Browser?output_path=s3://{bucket_name}/{s3_directory}/{file_name}"
return s3_path
#print(f"文件 {file_path} 成功上传到S3存储桶 {bucket_name} 中的目录 {s3_directory},文件名为 {file_name}")
except FileNotFoundError:
print(f"文件 {s3_file_name} 未找到,请检查文件路径是否正确。")
print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
except NoCredentialsError:
print("无法找到AWS凭证,请确认您的AWS访问密钥和密钥ID是否正确。")
except ClientError as e:
print(f"上传文件时发生错误:{e}")
def generate_filename(badcase_path,overall_path):
"""
Generate output file names stamped with the current time.
@@ -808,7 +834,8 @@ def compare_edit_distance(json_file, overall_report):
def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_data_path,s3_bucket_name=None, s3_file_name=None, AWS_ACCESS_KEY=None, AWS_SECRET_KEY=None, END_POINT_URL=None):
def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_data_path, s3_bucket_name=None, s3_file_directory=None,
aws_access_key=None, aws_secret_key=None, end_point_url=None):
"""
Main function; runs the entire evaluation pipeline.
@@ -819,7 +846,7 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
- badcase_path: base path and file-name prefix for the badcase file.
- overall_path: base path and file-name prefix for the overall file.
- s3_bucket_name: name of the S3 bucket (optional).
- s3_file_name: file on S3 (optional).
- s3_file_directory: directory on S3 where the files are saved (optional).
- AWS_ACCESS_KEY, AWS_SECRET_KEY, END_POINT_URL: AWS credentials and endpoint URL (optional).
"""
# check that the input files exist
@@ -840,10 +867,19 @@ def main(standard_file, test_file, zip_file, badcase_path, overall_path,base_dat
badcase_file,overall_file = generate_filename(badcase_path,overall_path)
# save the results to JSON files
save_results(result_dict, overall_report_dict,badcase_file,overall_file)
#save_results(result_dict, overall_report_dict,badcase_file,overall_file)
save_results(result_dict, overall_report_dict,badcase_file,overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
result=compare_edit_distance(base_data_path, overall_report_dict)
print(result)
"""
if all([s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url]):
try:
upload_to_s3(badcase_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
upload_to_s3(overall_file, s3_bucket_name, s3_file_directory, aws_access_key, aws_secret_key, end_point_url)
except Exception as e:
print(f"上传到S3时发生错误: {e}")
"""
#print(result)
assert result == 1
if __name__ == "__main__":
@@ -855,12 +891,12 @@ if __name__ == "__main__":
parser.add_argument('overall_path', type=str, help='overall文件的基础路径和文件名前缀。')
parser.add_argument('base_data_path', type=str, help='基准文件的基础路径和文件名前缀。')
parser.add_argument('--s3_bucket_name', type=str, help='S3桶名称。', default=None)
parser.add_argument('--s3_file_name', type=str, help='S3上的文件名。', default=None)
parser.add_argument('--s3_file_directory', type=str, help='S3上的文件名。', default=None)
parser.add_argument('--AWS_ACCESS_KEY', type=str, help='AWS访问密钥。', default=None)
parser.add_argument('--AWS_SECRET_KEY', type=str, help='AWS秘密密钥。', default=None)
parser.add_argument('--END_POINT_URL', type=str, help='AWS端点URL。', default=None)
args = parser.parse_args()
main(args.standard_file, args.test_file, args.zip_file, args.badcase_path,args.overall_path,args.base_data_path,args.s3_bucket_name, args.s3_file_name, args.AWS_ACCESS_KEY, args.AWS_SECRET_KEY, args.END_POINT_URL)
main(args.standard_file, args.test_file, args.zip_file, args.badcase_path,args.overall_path,args.base_data_path,args.s3_bucket_name, args.s3_file_directory, args.AWS_ACCESS_KEY, args.AWS_SECRET_KEY, args.END_POINT_URL)