fix(ocr): improve image and table content extraction

- Update image content extraction to iterate through all spans in a block
- Add support for extracting table content from spans within a block
- Handle multiple content types within table spans (latex, html, image)
- Refactor code to be more modular and easier to maintain

fix(ocr): improve image and table content extraction
- Update image content extraction to iterate through all spans in a block
- Add support for extracting table content from spans within a block
- Handle multiple content types within table spans (latex, html, image)
- Refactor code to be more modular and easier to maintain
b7e9d454 · myhloli · 4c412b28 · b7e9d454
Commit b7e9d454 authored Oct 30, 2024 by myhloli
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 4 deletions

magic_pdf/dict2md/ocr_mkcontent.py magic_pdf/dict2md/ocr_mkcontent.py +15 -4

No files found.
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -183,9 +183,10 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
        para_content = {'type': 'image', 'img_caption': [], 'img_footnote': []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
-                para_content['img_path'] = join_path(
-                    img_buket_path,
-                    block['lines'][0]['spans'][0]['image_path'])
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.Image:
+                            para_content['img_path'] = join_path(img_buket_path, span['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'].append(merge_para_with_text(block))
            if block['type'] == BlockType.ImageFootnote:
@@ -194,11 +195,21 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
        para_content = {'type': 'table', 'table_caption': [], 'table_footnote': []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.Table:
+
+                            if span.get('latex', ''):
+                                para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
+                            elif span.get('html', ''):
+                                para_content['table_body'] = f"\n\n{span['html']}\n\n"
+
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
                if block["lines"][0]["spans"][0].get('latex', ''):
                    para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
                elif block["lines"][0]["spans"][0].get('html', ''):
                    para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
-                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'].append(merge_para_with_text(block))
            if block['type'] == BlockType.TableFootnote: