Commit 7d010e19 authored by 赵小蒙
Browse files

ocr_mk_mm_markdown_with_para和ocr_mk_mm_markdown_with_para_and_pagination逻辑优化

parent dbe79ba1
......@@ -72,36 +72,27 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown = []
for _, page_info in pdf_info_dict.items():
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
for paras in paras_of_layout:
for para in paras:
para_text = ''
for line in para:
for span in line['spans']:
span_type = span.get('type')
if span_type == ContentType.Text:
content = split_long_words(span['content'])
pass
elif span_type == ContentType.InlineEquation:
content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ ContentType.Image, ContentType.Table ]:
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
para_text += content + ' '
markdown.append(para_text.strip() + ' ')
page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout)
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    """Build multimodal markdown for each page of ``pdf_info_dict``.

    Pages whose ``"para_blocks"`` entry is missing or empty are skipped and
    produce no output entry.

    Args:
        pdf_info_dict: Mapping whose keys are reported back as ``page_no``
            and whose values are per-page info dicts; only the
            ``"para_blocks"`` key of each value is read here.

    Returns:
        A list of ``{'page_no': ..., 'md_content': ...}`` dicts, in the
        iteration order of ``pdf_info_dict``. ``md_content`` is the page's
        paragraph strings joined with a blank line between them.
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in pdf_info_dict.items():
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        # Paragraph rendering is delegated to the helper shared with
        # ocr_mk_mm_markdown_with_para, so no local accumulator is needed.
        page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown)
        })
    return markdown_with_para_and_pagination
def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
page_markdown = []
for paras in paras_of_layout:
for para in paras:
para_text = ''
......@@ -119,11 +110,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
para_text += content + ' '
page_markdown.append(para_text.strip() + ' ')
markdown_with_para_and_pagination.append({
'page_no': page_no,
'md_content': '\n\n'.join(page_markdown)
})
return markdown_with_para_and_pagination
return page_markdown
def make_standard_format_with_para(pdf_info_dict: dict):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment