Feat/support footnote in figure (#532)

* feat: support figure footnote
* feat: using the relative position to combine footnote, table, image
* feat: add the readme of projects
* fix: code spell in unittest

---------

Co-authored-by: icecraft <xurui1@pjlab.org.cn>

Feat/support footnote in figure (#532)
* feat: support figure footnote
* feat: using the relative position to combine footnote, table, image
* feat: add the readme of projects
* fix: code spell in unittest

---------

Co-authored-by: icecraft <xurui1@pjlab.org.cn>
03469909 · icecraft · GitHub · 4331b837 · 03469909 · 03469909
Unverified Commit 03469909 authored Sep 03, 2024 by icecraft Committed by GitHub Sep 03, 2024
10 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,6 +3,7 @@ repos:
    rev: 5.0.4
    hooks:
      - id: flake8
+        args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
@@ -11,6 +12,7 @@ repos:
    rev: v0.32.0
    hooks:
      - id: yapf
+        args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.1
    hooks:
@@ -41,4 +43,4 @@ repos:
    rev: v1.3.1
    hooks:
      - id: docformatter
-        args: ["--in-place", "--wrap-descriptions", "79"]
+        args: ["--in-place", "--wrap-descriptions", "119"]
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
+import re
+import wordninja
 from loguru import logger
-from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.language import detect_lang
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
-import wordninja
-import re
 def __is_hyphen_at_line_end(line):
@@ -37,8 +38,9 @@ def split_long_words(text):
 def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    markdown = []
    for page_info in pdf_info_list:
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
-        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'mm', img_buket_path)
        markdown.extend(page_markdown)
    return '\n\n'.join(markdown)
@@ -46,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
 def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    markdown = []
    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
-        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'nlp')
        markdown.extend(page_markdown)
    return '\n\n'.join(markdown)
-def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
+def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
+                                                img_buket_path):
    markdown_with_para_and_pagination = []
    page_no = 0
    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
-        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'mm', img_buket_path)
        markdown_with_para_and_pagination.append({
-            'page_no': page_no,
+            'page_no':
-            'md_content': '\n\n'.join(page_markdown)
+            page_no,
+            'md_content':
+            '\n\n'.join(page_markdown)
        })
        page_no += 1
    return markdown_with_para_and_pagination
-def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
+def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
    page_markdown = []
    for paras in paras_of_layout:
        for para in paras:
@@ -81,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
                    if span_type == ContentType.Text:
                        content = span['content']
                        language = detect_lang(content)
-                        if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
+                        if (language == 'en'):  # 只对英文长词进行分词处理，中文分词会丢失文本
-                            content = ocr_escape_special_markdown_char(split_long_words(content))
+                            content = ocr_escape_special_markdown_char(
+                                split_long_words(content))
                        else:
                            content = ocr_escape_special_markdown_char(content)
                    elif span_type == ContentType.InlineEquation:
@@ -106,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
    return page_markdown
-def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
+def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
+                                      mode,
+                                      img_buket_path=''):
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
@@ -114,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
-            para_text = f"# {merge_para_with_text(para_block)}"
+            para_text = f'# {merge_para_with_text(para_block)}'
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
@@ -130,11 +140,13 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                for block in para_block['blocks']:  # 2nd.拼image_caption
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
+                for block in para_block['blocks']:  # 3rd. append image_footnote
+                    if block['type'] == BlockType.ImageFootnote:
+                        para_text += merge_para_with_text(block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
-                table_caption = ''
                for block in para_block['blocks']:  # 1st.拼table_caption
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block)
@@ -163,6 +175,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 def merge_para_with_text(para_block):
    def detect_language(text):
        en_pattern = r'[a-zA-Z]+'
        en_matches = re.findall(en_pattern, text)
@@ -171,19 +184,19 @@ def merge_para_with_text(para_block):
            if en_length / len(text) >= 0.5:
                return 'en'
            else:
-                return "unknown"
+                return 'unknown'
        else:
-            return "empty"
+            return 'empty'
    para_text = ''
    for line in para_block['lines']:
-        line_text = ""
+        line_text = ''
-        line_lang = ""
+        line_lang = ''
        for span in line['spans']:
            span_type = span['type']
            if span_type == ContentType.Text:
                line_text += span['content'].strip()
-        if line_text != "":
+        if line_text != '':
            line_lang = detect_lang(line_text)
        for span in line['spans']:
            span_type = span['type']
@@ -193,7 +206,8 @@ def merge_para_with_text(para_block):
                # language = detect_lang(content)
                language = detect_language(content)
                if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                    content = ocr_escape_special_markdown_char(split_long_words(content))
+                    content = ocr_escape_special_markdown_char(
+                        split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
@@ -227,12 +241,13 @@ def para_to_standard_format(para, img_buket_path):
            for span in line['spans']:
                language = ''
                span_type = span.get('type')
-                content = ""
+                content = ''
                if span_type == ContentType.Text:
                    content = span['content']
                    language = detect_lang(content)
                    if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                        content = ocr_escape_special_markdown_char(split_long_words(content))
+                        content = ocr_escape_special_markdown_char(
+                            split_long_words(content))
                    else:
                        content = ocr_escape_special_markdown_char(content)
                elif span_type == ContentType.InlineEquation:
@@ -245,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
        para_content = {
            'type': 'text',
            'text': para_text,
-            'inline_equation_num': inline_equation_num
+            'inline_equation_num': inline_equation_num,
        }
    return para_content
@@ -256,37 +271,35 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
-            'page_idx': page_idx
+            'page_idx': page_idx,
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1,
-            'page_idx': page_idx
+            'page_idx': page_idx,
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
-            'text_format': "latex",
+            'text_format': 'latex',
-            'page_idx': page_idx
+            'page_idx': page_idx,
        }
    elif para_type == BlockType.Image:
-        para_content = {
+        para_content = {'type': 'image', 'page_idx': page_idx}
-            'type': 'image',
-            'page_idx': page_idx
-        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
-                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
+                para_content['img_path'] = join_path(
+                    img_buket_path,
+                    block['lines'][0]['spans'][0]['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
+            if block['type'] == BlockType.ImageFootnote:
+                para_content['img_footnote'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
-        para_content = {
+        para_content = {'type': 'table', 'page_idx': page_idx}
-            'type': 'table',
-            'page_idx': page_idx
-        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                if block["lines"][0]["spans"][0].get('latex', ''):
@@ -305,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
 def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    content_list = []
    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
        for para_block in paras_of_layout:
-            para_content = para_to_standard_format_v2(para_block, img_buket_path)
+            para_content = para_to_standard_format_v2(para_block,
+                                                      img_buket_path)
            content_list.append(para_content)
    return content_list
 def line_to_standard_format(line, img_buket_path):
-    line_text = ""
+    line_text = ''
    inline_equation_num = 0
    for span in line['spans']:
        if not span.get('content'):
@@ -325,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
                if span['type'] == ContentType.Image:
                    content = {
                        'type': 'image',
-                        'img_path': join_path(img_buket_path, span['image_path'])
+                        'img_path': join_path(img_buket_path,
+                                              span['image_path']),
                    }
                    return content
                elif span['type'] == ContentType.Table:
                    content = {
                        'type': 'table',
-                        'img_path': join_path(img_buket_path, span['image_path'])
+                        'img_path': join_path(img_buket_path,
+                                              span['image_path']),
                    }
                    return content
        else:
@@ -339,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
                interline_equation = span['content']
                content = {
                    'type': 'equation',
-                    'latex': f"$$\n{interline_equation}\n$$"
+                    'latex': f'$$\n{interline_equation}\n$$'
                }
                return content
            elif span['type'] == ContentType.InlineEquation:
                inline_equation = span['content']
-                line_text += f"${inline_equation}$"
+                line_text += f'${inline_equation}$'
                inline_equation_num += 1
            elif span['type'] == ContentType.Text:
-                text_content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
+                text_content = ocr_escape_special_markdown_char(
+                    span['content'])  # 转义特殊符号
                line_text += text_content
    content = {
        'type': 'text',
        'text': line_text,
-        'inline_equation_num': inline_equation_num
+        'inline_equation_num': inline_equation_num,
    }
    return content
 def ocr_mk_mm_standard_format(pdf_info_dict: list):
-    """
+    """content_list type         string
-    content_list
+    image/text/table/equation(行间的单独拿出来，行内的和text合并) latex        string
-    type         string      image/text/table/equation(行间的单独拿出来，行内的和text合并)
+    latex文本字段。 text         string      纯文本格式的文本数据。 md           string
-    latex        string      latex文本字段。
+    markdown格式的文本数据。 img_path     string      s3://full/path/to/img.jpg."""
-    text         string      纯文本格式的文本数据。
-    md           string      markdown格式的文本数据。
-    img_path     string      s3://full/path/to/img.jpg
-    """
    content_list = []
    for page_info in pdf_info_dict:
-        blocks = page_info.get("preproc_blocks")
+        blocks = page_info.get('preproc_blocks')
        if not blocks:
            continue
        for block in blocks:
@@ -378,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
    return content_list
-def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
+def union_make(pdf_info_dict: list,
+               make_mode: str,
+               drop_mode: str,
+               img_buket_path: str = ''):
    output_content = []
    for page_info in pdf_info_dict:
-        if page_info.get("need_drop", False):
+        if page_info.get('need_drop', False):
-            drop_reason = page_info.get("drop_reason")
+            drop_reason = page_info.get('drop_reason')
            if drop_mode == DropMode.NONE:
                pass
            elif drop_mode == DropMode.WHOLE_PDF:
-                raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
+                raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
+                                 f'drop_reason is {drop_reason}'))
            elif drop_mode == DropMode.SINGLE_PAGE:
-                logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
+                logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
+                                f'drop_reason is {drop_reason}'))
                continue
            else:
-                raise Exception(f"drop_mode can not be null")
+                raise Exception('drop_mode can not be null')
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
-        page_idx = page_info.get("page_idx")
+        page_idx = page_info.get('page_idx')
        if not paras_of_layout:
            continue
        if make_mode == MakeMode.MM_MD:
-            page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
+            page_markdown = ocr_mk_markdown_with_para_core_v2(
+                paras_of_layout, 'mm', img_buket_path)
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.NLP_MD:
-            page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
+            page_markdown = ocr_mk_markdown_with_para_core_v2(
+                paras_of_layout, 'nlp')
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.STANDARD_FORMAT:
            for para_block in paras_of_layout:
-                para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
+                para_content = para_to_standard_format_v2(
+                    para_block, img_buket_path, page_idx)
                output_content.append(para_content)
    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
        return '\n\n'.join(output_content)

--- a/magic_pdf/layout/layout_sort.py
+++ b/magic_pdf/layout/layout_sort.py
--- a/magic_pdf/libs/boxbase.py
+++ b/magic_pdf/libs/boxbase.py
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -71,6 +71,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
+    imgs_footnote_list = []
    titles_list = []
    texts_list = []
    interequations_list = []
@@ -78,7 +79,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        page_layout_list = []
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
-        imgs, imgs_body, imgs_caption = [], [], []
+        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
        titles = []
        texts = []
        interequations = []
@@ -108,6 +109,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
                        imgs_body.append(bbox)
                    elif nested_block['type'] == BlockType.ImageCaption:
                        imgs_caption.append(bbox)
+                    elif nested_block['type'] == BlockType.ImageFootnote:
+                        imgs_footnote.append(bbox)
            elif block['type'] == BlockType.Title:
                titles.append(bbox)
            elif block['type'] == BlockType.Text:
@@ -121,6 +124,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
+        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
@@ -142,6 +146,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
                                 True)
+        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
+                              True),
        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
@@ -241,7 +247,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
 def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
-    imgs_body_list, imgs_caption_list = [], []
+    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []
@@ -250,7 +256,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    for i in range(len(model_list)):
        page_dropped_list = []
        tables_body, tables_caption, tables_footnote = [], [], []
-        imgs_body, imgs_caption = [], []
+        imgs_body, imgs_caption, imgs_footnote = [], [], []
        titles = []
        texts = []
        interequations = []
@@ -277,6 +283,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
                interequations.append(bbox)
            elif layout_det['category_id'] == CategoryId.Abandon:
                page_dropped_list.append(bbox)
+            elif layout_det['category_id'] == CategoryId.ImageFootnote:
+                imgs_footnote.append(bbox)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
@@ -287,6 +295,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
        texts_list.append(texts)
        interequations_list.append(interequations)
        dropped_bbox_list.append(page_dropped_list)
+        imgs_footnote_list.append(imgs_footnote)
    for i, page in enumerate(pdf_docs):
        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
@@ -299,6 +308,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
                              True)
+        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
+                              True)
        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)

--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
 class ContentType:
-    Image = "image"
+    Image = 'image'
-    Table = "table"
+    Table = 'table'
-    Text = "text"
+    Text = 'text'
-    InlineEquation = "inline_equation"
+    InlineEquation = 'inline_equation'
-    InterlineEquation = "interline_equation"
+    InterlineEquation = 'interline_equation'
 class BlockType:
-    Image = "image"
+    Image = 'image'
-    ImageBody = "image_body"
+    ImageBody = 'image_body'
-    ImageCaption = "image_caption"
+    ImageCaption = 'image_caption'
-    Table = "table"
+    ImageFootnote = 'image_footnote'
-    TableBody = "table_body"
+    Table = 'table'
-    TableCaption = "table_caption"
+    TableBody = 'table_body'
-    TableFootnote = "table_footnote"
+    TableCaption = 'table_caption'
-    Text = "text"
+    TableFootnote = 'table_footnote'
-    Title = "title"
+    Text = 'text'
-    InterlineEquation = "interline_equation"
+    Title = 'title'
-    Footnote = "footnote"
+    InterlineEquation = 'interline_equation'
-    Discarded = "discarded"
+    Footnote = 'footnote'
+    Discarded = 'discarded'
 class CategoryId:
@@ -33,3 +35,4 @@ class CategoryId:
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
+    ImageFootnote = 101
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
-from loguru import logger
+from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
+                                    _is_in_or_part_overlap_with_area_ratio,
-from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
+                                    calculate_overlap_area_in_bbox1_area_ratio)
-    calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
 from magic_pdf.libs.drop_tag import DropTag
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
-from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
-from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
 # 将每一个line中的span从左到右排序
 def line_sort_spans_by_left_to_right(lines):
    line_objects = []
    for line in lines:
-        # 按照x0坐标排序
+        #  按照x0坐标排序
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
@@ -21,8 +18,8 @@ def line_sort_spans_by_left_to_right(lines):
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
-            "bbox": line_bbox,
+            'bbox': line_bbox,
-            "spans": line,
+            'spans': line,
        })
    return line_objects
@@ -39,16 +36,21 @@ def merge_spans_to_line(spans):
        for span in spans[1:]:
            # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
            # image和table类型，同上
-            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+            if span['type'] in [
-                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
+                    ContentType.InterlineEquation, ContentType.Image,
-                    current_line):
+                    ContentType.Table
+            ] or any(s['type'] in [
+                    ContentType.InterlineEquation, ContentType.Image,
+                    ContentType.Table
+            ] for s in current_line):
                # 则开始新行
                lines.append(current_line)
                current_line = [span]
                continue
            # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
-            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+            if __is_overlaps_y_exceeds_threshold(span['bbox'],
+                                                 current_line[-1]['bbox']):
                current_line.append(span)
            else:
                # 否则，开始新行
@@ -71,7 +73,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
        # 遍历spans,将每个span放入对应的layout中
        layout_sapns = []
        for span in spans:
-            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
+            if calculate_overlap_area_in_bbox1_area_ratio(
+                    span['bbox'], layout_bbox) > 0.6:
                layout_sapns.append(span)
        # 如果layout_sapns不为空，则放入new_spans中
        if len(layout_sapns) > 0:
@@ -99,12 +102,10 @@ def merge_lines_to_block(lines):
    # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
    blocks = []
    for line in lines:
-        blocks.append(
+        blocks.append({
-            {
+            'bbox': line['bbox'],
-                "bbox": line["bbox"],
+            'lines': [line],
-                "lines": [line],
+        })
-            }
-        )
    return blocks
@@ -121,7 +122,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
            if block[7] == BlockType.Footnote:
                continue
            block_bbox = block[:4]
-            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
+            if calculate_overlap_area_in_bbox1_area_ratio(
+                    block_bbox, layout_bbox) > 0.8:
                layout_blocks.append(block)
        # 如果layout_blocks不为空，则放入new_blocks中
@@ -134,7 +136,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    # 如果new_blocks不为空，则对new_blocks中每个block进行排序
    if len(new_blocks) > 0:
        for bboxes_in_layout_block in new_blocks:
-            bboxes_in_layout_block.sort(key=lambda x: x[1])  # 一个layout内部的box，按照y0自上而下排序
+            bboxes_in_layout_block.sort(
+                key=lambda x: x[1])  # 一个layout内部的box，按照y0自上而下排序
            sort_blocks.extend(bboxes_in_layout_block)
    # sort_blocks中已经包含了当前页面所有最终留下的block，且已经排好了顺序
@@ -142,9 +145,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
 def fill_spans_in_blocks(blocks, spans, radio):
-    '''
+    """将allspans中的span按位置关系，放入blocks中."""
-    将allspans中的span按位置关系，放入blocks中
-    '''
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
@@ -156,17 +157,15 @@ def fill_spans_in_blocks(blocks, spans, radio):
        block_spans = []
        for span in spans:
            span_bbox = span['bbox']
-            if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
+            if calculate_overlap_area_in_bbox1_area_ratio(
+                    span_bbox, block_bbox) > radio:
                block_spans.append(span)
        '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
        # displayed_list = []
        # text_inline_lines = []
        # modify_y_axis(block_spans, displayed_list, text_inline_lines)
        '''模型识别错误的行间公式, type类型转换成行内公式'''
        # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
        '''bbox去除粘连'''  # 去粘连会影响span的bbox，导致后续fill的时候出错
        # block_spans = remove_overlap_between_bbox_for_span(block_spans)
@@ -182,12 +181,9 @@ def fill_spans_in_blocks(blocks, spans, radio):
 def fix_block_spans(block_with_spans, img_blocks, table_blocks):
-    '''
+    """1、img_block和table_block因为包含caption和footnote的关系，存在block的嵌套关系
-    1、img_block和table_block因为包含caption和footnote的关系，存在block的嵌套关系
+    需要将caption和footnote的text_span放入相应img_block和table_block内的
-        需要将caption和footnote的text_span放入相应img_block和table_block内的
+    caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
-        caption_block和footnote_block中
-    2、同时需要删除block中的spans字段
-    '''
    fix_blocks = []
    for block in block_with_spans:
        block_type = block['type']
@@ -218,16 +214,13 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    block_spans = []
    # 如果有img_caption，则将img_block中的text_spans放入img_caption_block中
    for span in spans:
-        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6:
+        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
+                                                      block_bbox) > 0.6:
            block_spans.append(span)
    block_lines = merge_spans_to_line(block_spans)
    # 对line中的span进行排序
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
-    block = {
+    block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
-        'bbox': block_bbox,
-        'type': block_type,
-        'lines': sort_block_lines
-    }
    return block, block_spans
@@ -237,11 +230,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
        'bbox': block_bbox,
        'spans': [span],
    }
-    body_block = {
+    body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
-        'bbox': block_bbox,
-        'type': block_type,
-        'lines': [body_line]
-    }
    return body_block
@@ -249,13 +238,16 @@ def fix_image_block(block, img_blocks):
    block['blocks'] = []
    # 遍历img_blocks,找到与当前block匹配的img_block
    for img_block in img_blocks:
-        if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95):
+        if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
+                                                  img_block['bbox'], 0.95):
            # 创建img_body_block
            for span in block['spans']:
-                if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']:
+                if span['type'] == ContentType.Image and img_block[
+                        'img_body_bbox'] == span['bbox']:
                    # 创建img_body_block
-                    img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
+                    img_body_block = make_body_block(
+                        span, img_block['img_body_bbox'], BlockType.ImageBody)
                    block['blocks'].append(img_body_block)
                    # 从spans中移除img_body_block中已经放入的span
@@ -265,10 +257,15 @@ def fix_image_block(block, img_blocks):
            # 根据list长度，判断img_block中是否有img_caption
            if img_block['img_caption_bbox'] is not None:
                img_caption_block, img_caption_spans = merge_spans_to_block(
-                    block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
+                    block['spans'], img_block['img_caption_bbox'],
-                )
+                    BlockType.ImageCaption)
                block['blocks'].append(img_caption_block)
+            if img_block['img_footnote_bbox'] is not None:
+                img_footnote_block, img_footnote_spans = merge_spans_to_block(
+                    block['spans'], img_block['img_footnote_bbox'],
+                    BlockType.ImageFootnote)
+                block['blocks'].append(img_footnote_block)
            break
    del block['spans']
    return block
@@ -278,13 +275,17 @@ def fix_table_block(block, table_blocks):
    block['blocks'] = []
    # 遍历table_blocks,找到与当前block匹配的table_block
    for table_block in table_blocks:
-        if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95):
+        if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
+                                                  table_block['bbox'], 0.95):
            # 创建table_body_block
            for span in block['spans']:
-                if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']:
+                if span['type'] == ContentType.Table and table_block[
+                        'table_body_bbox'] == span['bbox']:
                    # 创建table_body_block
-                    table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
+                    table_body_block = make_body_block(
+                        span, table_block['table_body_bbox'],
+                        BlockType.TableBody)
                    block['blocks'].append(table_body_block)
                    # 从spans中移除table_body_block中已经放入的span
@@ -294,8 +295,8 @@ def fix_table_block(block, table_blocks):
            # 根据list长度，判断table_block中是否有caption
            if table_block['table_caption_bbox'] is not None:
                table_caption_block, table_caption_spans = merge_spans_to_block(
-                    block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
+                    block['spans'], table_block['table_caption_bbox'],
-                )
+                    BlockType.TableCaption)
                block['blocks'].append(table_caption_block)
                # 如果table_caption_block_spans不为空
@@ -307,8 +308,8 @@ def fix_table_block(block, table_blocks):
            # 根据list长度，判断table_block中是否有table_note
            if table_block['table_footnote_bbox'] is not None:
                table_footnote_block, table_footnote_spans = merge_spans_to_block(
-                    block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
+                    block['spans'], table_block['table_footnote_bbox'],
-                )
+                    BlockType.TableFootnote)
                block['blocks'].append(table_footnote_block)
            break

--- a/projects/README.md
+++ b/projects/README.md
+# 欢迎来到 MinerU 项目列表
+## 项目列表
+- [llama_index_rag](./llama_index_rag/README.md): 基于 llama_index 构建轻量级 RAG 系统
--- a/tests/test_unit.py
+++ b/tests/test_unit.py