Unverified Commit b122b86e authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #2487 from myhloli/dev

fix(ocr_mkcontent): improve image handling and footnote integration in markdown output
parents e3f22e84 002333a8
...@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
for block in para_block['blocks']: # 1st.拼image_body # 检测是否存在图片脚注
if block['type'] == BlockType.ImageBody: has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
for line in block['lines']: # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
for span in line['spans']: if has_image_footnote:
if span['type'] == ContentType.Image: for block in para_block['blocks']: # 1st.拼image_caption
if span.get('image_path', ''): if block['type'] == BlockType.ImageCaption:
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 2nd.拼image_body
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageBody:
para_text += merge_para_with_text(block) + ' \n' for line in block['lines']:
for block in para_block['blocks']: # 3rd.拼image_footnote for span in line['spans']:
if block['type'] == BlockType.ImageFootnote: if span['type'] == ContentType.Image:
para_text += merge_para_with_text(block) + ' \n' if span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 3rd.拼image_footnote
if block['type'] == BlockType.ImageFootnote:
para_text += ' \n' + merge_para_with_text(block)
else:
for block in para_block['blocks']: # 1st.拼image_body
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
if span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption:
para_text += ' \n' + merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
if mode == 'nlp': if mode == 'nlp':
continue continue
...@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
for span in line['spans']: for span in line['spans']:
if span['type'] == ContentType.Table: if span['type'] == ContentType.Table:
# if processed by table model # if processed by table model
if span.get('latex', ''): if span.get('html', ''):
para_text += f"\n\n$\n {span['latex']}\n$\n\n" para_text += f"\n{span['html']}\n"
elif span.get('html', ''):
para_text += f"\n\n{span['html']}\n\n"
elif span.get('image_path', ''): elif span.get('image_path', ''):
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 3rd.拼table_footnote for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block) + ' \n' para_text += '\n' + merge_para_with_text(block) + ' '
if para_text.strip() == '': if para_text.strip() == '':
continue continue
else: else:
page_markdown.append(para_text.strip() + ' ') # page_markdown.append(para_text.strip() + ' ')
page_markdown.append(para_text.strip())
return page_markdown return page_markdown
...@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason ...@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
if span['type'] == ContentType.Table: if span['type'] == ContentType.Table:
if span.get('latex', ''): if span.get('latex', ''):
para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n" para_content['table_body'] = f"{span['latex']}"
elif span.get('html', ''): elif span.get('html', ''):
para_content['table_body'] = f"\n\n{span['html']}\n\n" para_content['table_body'] = f"{span['html']}"
if span.get('image_path', ''): if span.get('image_path', ''):
para_content['img_path'] = join_path(img_buket_path, span['image_path']) para_content['img_path'] = join_path(img_buket_path, span['image_path'])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment