Commit 3f93b895 authored by myhloli
Browse files

feat(pdf_parse): add internal block sorting for images and tables

- Implement block sorting within image and table blocks
- Ensure correct order of captions and footnotes within blocks
- Improve overall document structure and parsing accuracy
parent a935c33f
...@@ -768,6 +768,11 @@ def parse_page_core( ...@@ -768,6 +768,11 @@ def parse_page_core(
"""重排block""" """重排block"""
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index']) sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
"""block内重排(img和table的block内多个caption或footnote的排序)"""
for block in sorted_blocks:
if block['type'] in [BlockType.Image, BlockType.Table]:
block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])
"""获取QA需要外置的list""" """获取QA需要外置的list"""
images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks) images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment