Commit 07e4f115 authored by 赵小蒙's avatar 赵小蒙
Browse files

ocr_pdf_intermediate_dict_to_markdown_with_para输出nlp格式的markdown

parent bf8d8e21
...@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import ( ...@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_nlp_markdown, ocr_mk_nlp_markdown,
ocr_mk_mm_markdown, ocr_mk_mm_markdown,
ocr_mk_mm_standard_format, ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_nlp_markdown_with_para,
) )
from magic_pdf.libs.commons import ( from magic_pdf.libs.commons import (
read_file, read_file,
...@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) ...@@ -510,7 +510,8 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
pdf_intermediate_dict = jso["pdf_intermediate_dict"] pdf_intermediate_dict = jso["pdf_intermediate_dict"]
# 将 pdf_intermediate_dict 解压 # 将 pdf_intermediate_dict 解压
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict) # markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
jso["content"] = markdown_content jso["content"] = markdown_content
logger.info( logger.info(
f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment