pipeline调整

27c080a9 · 赵小蒙 · b94fd7f0 · 27c080a9
Commit 27c080a9 authored Mar 21, 2024 by 赵小蒙
Show whitespace changes
Inline Side-by-side

Showing with 31 additions and 1 deletion

magic_pdf/pipeline.py magic_pdf/pipeline.py +31 -1

No files found.
--- a/magic_pdf/pipeline.py
+++ b/magic_pdf/pipeline.py
@@ -496,6 +496,35 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    return jso


+def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
+    """Render the decompressed jso["pdf_intermediate_dict"] via ocr_mk_mm_markdown_with_para into jso["content"]; return jso."""
+    if debug_mode:
+        pass
+    else:  # debug mode is off, so check whether the record carries a need_drop field
+        if jso.get("need_drop", False):
+            book_name = join_path(get_data_source(jso), jso["file_id"])
+            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)  # NOTE(review): file= looks like a print() leftover; confirm the logger accepts it
+            jso["dropped"] = True
+            return jso
+    try:
+        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
+        # Decompress pdf_intermediate_dict before rendering
+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
+        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
+        jso["content"] = markdown_content
+        logger.info(
+            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
+            file=sys.stderr,
+        )
+        # Blank out the fields that are no longer needed
+        jso["doc_layout_result"] = ""
+        jso["pdf_intermediate_dict"] = ""
+        jso["pdf_meta"] = ""
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso
+
+
 def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
    jso: dict, debug_mode=False
 ) -> dict:
@@ -520,7 +549,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
        )
        # 把无用的信息清空
        jso["doc_layout_result"] = ""
-        jso["pdf_intermediate_dict"] = pdf_intermediate_dict
+        jso["pdf_intermediate_dict"] = ""
+        jso["mid_json_ocr"] = pdf_intermediate_dict
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)