feat: add extract_train_data

09269c84 · 许瑞 · 056aed86 · 09269c84 · 09269c84 · 09269c84
Commit 09269c84 authored Mar 20, 2024 by 许瑞
7 changed files
--- a/magic_pdf/pdf_parse_for_train.py
+++ b/magic_pdf/pdf_parse_for_train.py
--- a/magic_pdf/pipeline.py
+++ b/magic_pdf/pipeline.py
--- a/magic_pdf/train_utils/__init__.py
+++ b/magic_pdf/train_utils/__init__.py
--- a/magic_pdf/train_utils/convert_to_train_format.py
+++ b/magic_pdf/train_utils/convert_to_train_format.py
+
+
+def convert_to_train_format(jso: dict) -> list[dict]:
+    """
+    Convert per-page parse results into the layout-training annotation format.
+
+    Args:
+        jso: mapping whose values are per-page dicts. Each page dict must
+            provide "page_idx", "page_size" (unpacked as width, height) and
+            the bbox collections listed in the tables below.
+
+    Returns:
+        One dict per page, in the iteration order of ``jso``:
+        ``{"page_info": {"page_no", "height", "width"}, "bboxes": [...]}``.
+        Every bbox entry has "category_id" and "bbox"; image and table
+        entries additionally carry "caption_bbox" when the source entry has a
+        "caption" key.
+
+    Raises:
+        KeyError: if a page dict lacks one of the expected keys.
+    """
+    # (page key, category id) for entries shaped {"bbox": ..., "caption": ...},
+    # where "caption" is optional.
+    captioned_sources = (
+        ("image_bboxes_with_caption", 1),
+        ("table_bboxes_with_caption", 7),
+    )
+    # (page key, category id) for collections that hold the bbox values directly.
+    raw_bbox_sources = (
+        ("bak_page_no_bboxes", 4),
+        ("bak_header_bboxes", 3),
+        ("bak_footer_bboxes", 6),
+    )
+    # (page key, category id) for entries that are dicts with a "bbox" key.
+    # Footnotes are not emitted: no examples have been seen so far.
+    block_sources = (
+        ("para_blocks", 2),
+        ("inline_equations", 13),
+        ("interline_equations", 10),
+    )
+
+    pages = []
+    for v in jso.values():
+        page_idx = v["page_idx"]
+        width, height = v["page_size"]
+
+        info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}
+
+        bboxes: list[dict] = []
+        for key, category_id in captioned_sources:
+            for item in v[key]:
+                entry = {"category_id": category_id, "bbox": item["bbox"]}
+                if "caption" in item:
+                    entry["caption_bbox"] = item["caption"]
+                bboxes.append(entry)
+
+        for key, category_id in raw_bbox_sources:
+            bboxes.extend({"category_id": category_id, "bbox": bbox} for bbox in v[key])
+
+        for key, category_id in block_sources:
+            bboxes.extend(
+                {"category_id": category_id, "bbox": item["bbox"]} for item in v[key]
+            )
+
+        info["bboxes"] = bboxes
+        pages.append(info)
+
+    return pages
--- a/magic_pdf/train_utils/extract_caption.py
+++ b/magic_pdf/train_utils/extract_caption.py
+from magic_pdf.libs.boxbase import _is_in
+
+
+def extract_caption_bbox(outer: list, inner: list) -> list:
+    """
+    Pair each inner box with an enclosing outer box and derive a caption box.
+
+    For every box in ``inner`` the first still-unused box in ``outer`` that
+    contains it (according to ``_is_in``) and is not approximately identical
+    to it is selected. The caption is the largest of the four strips of the
+    outer box that lie beside the inner box. Each outer box is consumed by at
+    most one inner box.
+
+    Args:
+        outer: boxes whose first four values are x0, y0, x1, y1.
+        inner: boxes whose first four values are x0, y0, x1, y1; any extra
+            trailing values are ignored.
+
+    Returns:
+        One dict per inner box, in input order:
+        {
+            "bbox": [x0, y0, x1, y1],
+            "caption": [x0, y0, x1, y1],  # only when an outer box matched
+        }
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+    logger.debug("extract_caption_bbox outer=%s inner=%s", outer, inner)
+
+    def is_float_equal(a, b):
+        # Non-strict float comparison with a fixed absolute tolerance.
+        return abs(a - b) < 0.01
+
+    def area(rect):
+        return abs(rect[0] - rect[2]) * abs(rect[1] - rect[3])
+
+    # Index the outer boxes so that a matched one can be removed cheaply.
+    remaining = dict(enumerate(outer))
+    found_count = 0
+    ret = []
+    for v in inner:
+        inner_box = v[:4]
+        ix0, iy0, ix1, iy1 = inner_box
+        d = {"bbox": inner_box}
+
+        found_idx = None
+        for k, candidate in remaining.items():
+            outer_box = candidate[:4]
+            ox0, oy0, ox1, oy1 = outer_box
+            same_box = (
+                is_float_equal(ix0, ox0)
+                and is_float_equal(iy0, oy0)
+                and is_float_equal(ix1, ox1)
+                and is_float_equal(iy1, oy1)
+            )
+            if _is_in(inner_box, outer_box) and not same_box:
+                found_idx = k
+                break
+
+        if found_idx is not None:
+            found_count += 1
+            # An outer box may determine the caption of one inner box only.
+            ox0, oy0, ox1, oy1 = remaining.pop(found_idx)[:4]
+            strips = [
+                [ox0, oy0, ix0, oy1],  # strip left of the inner box
+                [ox0, oy0, ox1, iy0],  # strip on the smaller-y side
+                [ox0, iy1, ox1, oy1],  # strip on the larger-y side
+                [ix1, oy0, ox1, oy1],  # strip right of the inner box
+            ]
+            # The strip with the largest area is taken as the caption; with a
+            # stable ascending sort, ties resolve to the later strip.
+            d["caption"] = sorted(strips, key=area)[-1]
+
+        ret.append(d)
+
+    logger.debug("extract_caption_bbox found_count=%d", found_count)
+    return ret
--- a/magic_pdf/train_utils/remove_footer_header.py
+++ b/magic_pdf/train_utils/remove_footer_header.py
+import re
+
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap
+from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
+
+
+"""
+    copy from pre_proc/remove_footer_header.py
+"""
+
+
+def remove_headder_footer_one_page(
+    text_raw_blocks,
+    image_bboxes,
+    table_bboxes,
+    header_bboxs,
+    footer_bboxs,
+    page_no_bboxs,
+    page_w,
+    page_h,
+):
+    """
+    Pass-through stand-in for the header / footer / page-number removal step.
+
+    This train-data variant of pre_proc/remove_footer_header.py removes
+    nothing: the inputs are handed back untouched so that headers, footers
+    and page numbers remain available downstream. The removal logic that
+    used to follow an unconditional early return was unreachable and has been
+    dropped.
+
+    Args:
+        text_raw_blocks: text blocks of the page.
+        image_bboxes: image boxes of the page.
+        table_bboxes: table boxes of the page.
+        header_bboxs: unused; kept for signature compatibility.
+        footer_bboxs: unused; kept for signature compatibility.
+        page_no_bboxs: unused; kept for signature compatibility.
+        page_w: unused; kept for signature compatibility.
+        page_h: unused; kept for signature compatibility.
+
+    Returns:
+        A 6-tuple ``(image_bboxes, table_bboxes, text_raw_blocks, [], [], [])``:
+        the remaining images, tables and text blocks (the input objects
+        themselves) followed by the removed text blocks, removed images and
+        removed tables, which are always empty.
+    """
+    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
--- a/magic_pdf/train_utils/vis_utils.py
+++ b/magic_pdf/train_utils/vis_utils.py
+from magic_pdf.libs.commons import fitz
+import os
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
+
+
+def draw_model_output(
+    raw_pdf_doc: fitz.Document, paras_dict_arr: list[dict], save_path: str
+):
+    """
+    Draw the model's layout detections on every page and save the document.
+
+    Args:
+        raw_pdf_doc: document to draw on; it is modified in place and then
+            saved to ``save_path``.
+        paras_dict_arr: per-page model output, indexed by page position, e.g.
+            {"layout_dets": [], "subfield_dets": [],
+             "page_info": {"page_no": 22, "height": 1650, "width": 1275}}
+        save_path: destination file; missing parent directories are created.
+
+    Raises:
+        IndexError: if ``paras_dict_arr`` has fewer entries than the document
+            has pages.
+    """
+    # Category ids used by the layout model:
+    #   0: 'title'           1: 'figure'          2: 'plain text'
+    #   3: 'header'          4: 'page number'     5: 'footnote'
+    #   6: 'footer'          7: 'table'           8: 'table caption'
+    #   9: 'figure caption' 10: 'equation'       11: 'full column'
+    #  12: 'sub column'     13: 'embedding' (inline formula)
+    #  14: 'isolated' (single-line formula)
+    non_body_categories = (3, 4, 5, 6, 0)
+
+    color_map = {
+        "body": fitz.pdfcolor["green"],
+        "non_body": fitz.pdfcolor["red"],
+    }
+
+    for i, page in enumerate(raw_pdf_doc):
+        v = paras_dict_arr[i]
+        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(v, page)
+
+        for block in v["layout_dets"]:
+            poly = block["poly"]
+            # Presumably "poly" is a flat 4-corner polygon, with indices 0/1
+            # and 2/5 holding two opposite corners -- TODO confirm. The values
+            # are divided by the scale ratios to get page coordinates.
+            L = poly[0] / horizontal_scale_ratio
+            U = poly[1] / vertical_scale_ratio
+            R = poly[2] / horizontal_scale_ratio
+            D = poly[5] / vertical_scale_ratio
+            # Normalise so that (L, U) is the min corner and (R, D) the max.
+            L, R = min(L, R), max(L, R)
+            U, D = min(U, D), max(U, D)
+
+            if block["category_id"] in non_body_categories:
+                color = color_map["non_body"]
+            else:
+                color = color_map["body"]
+
+            page.draw_rect(
+                fitz.Rect([L, U, R, D]), fill=None, width=0.5, overlay=True, color=color
+            )
+
+    parent_dir = os.path.dirname(save_path)
+    # dirname is "" for a bare file name; os.makedirs("") would raise.
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+    raw_pdf_doc.save(save_path)
+
+
+def debug_show_bbox(
+    raw_pdf_doc: fitz.Document,
+    page_idx: int,
+    bboxes: list,
+    droped_bboxes: list,
+    expect_drop_bboxes: list,
+    save_path: str,
+    expected_page_id: int,
+):
+    """
+    Write a throw-away single-page PDF showing three groups of boxes (debug aid).
+
+    Nothing happens unless ``page_idx == expected_page_id``. Otherwise any
+    existing file at ``save_path`` is replaced by a blank page of the same
+    size as page ``page_idx`` of ``raw_pdf_doc`` with the boxes drawn on it.
+
+    Args:
+        raw_pdf_doc: source document; only the page size is read.
+        page_idx: index of the page currently being processed.
+        bboxes: drawn with a red outline and a translucent blue fill.
+        droped_bboxes: drawn with a translucent yellow fill and no outline.
+        expect_drop_bboxes: drawn with a red outline and no fill.
+        save_path: destination file; missing parent directories are created.
+        expected_page_id: the only page index for which output is produced.
+
+    Only the first four values of each box (x0, y0, x1, y1) are used.
+    """
+    if page_idx != expected_page_id:
+        return
+
+    if os.path.exists(save_path):
+        # Remove the file left over from a previous run.
+        os.remove(save_path)
+
+    # Start from a new, empty PDF document.
+    doc = fitz.open("")
+    try:
+        page_rect = raw_pdf_doc[page_idx].rect
+        new_page = doc.new_page(width=page_rect.width, height=page_rect.height)
+
+        # (boxes, keyword arguments for Shape.finish) in drawing order.
+        layers = (
+            (
+                bboxes,
+                {
+                    "color": fitz.pdfcolor["red"],
+                    "fill": fitz.pdfcolor["blue"],
+                    "fill_opacity": 0.2,
+                },
+            ),
+            (
+                droped_bboxes,
+                {"color": None, "fill": fitz.pdfcolor["yellow"], "fill_opacity": 0.2},
+            ),
+            (
+                expect_drop_bboxes,
+                {"color": fitz.pdfcolor["red"], "fill": None},
+            ),
+        )
+        for layer_bboxes, style in layers:
+            for bbox in layer_bboxes:
+                shape = new_page.new_shape()
+                shape.draw_rect(fitz.Rect(*bbox[0:4]))
+                shape.finish(**style)
+                shape.commit()
+
+        parent_dir = os.path.dirname(save_path)
+        # dirname is "" for a bare file name; os.makedirs("") would raise.
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+        doc.save(save_path)
+    finally:
+        # Release the document even when drawing or saving fails.
+        doc.close()
+
+
+def debug_show_page(
+    page,
+    bboxes1: list,
+    bboxes2: list,
+    bboxes3: list,
+    save_path: str = "./tmp/debug.pdf",
+):
+    """
+    Write a throw-away single-page PDF showing three groups of boxes (debug aid).
+
+    Any existing file at ``save_path`` is replaced by a blank page of the same
+    size as ``page`` with the boxes drawn on it.
+
+    Args:
+        page: page whose ``rect`` gives the size of the output page.
+        bboxes1: drawn with a red outline and a translucent blue fill.
+        bboxes2: drawn with a translucent yellow fill and no outline.
+        bboxes3: drawn with a red outline and no fill.
+        save_path: destination file; defaults to the previously hard-coded
+            "./tmp/debug.pdf". Missing parent directories are created.
+
+    Only the first four values of each box (x0, y0, x1, y1) are used.
+    """
+    if os.path.exists(save_path):
+        # Remove the file left over from a previous run.
+        os.remove(save_path)
+
+    # Start from a new, empty PDF document.
+    doc = fitz.open("")
+    try:
+        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)
+
+        # (boxes, keyword arguments for Shape.finish) in drawing order.
+        layers = (
+            (
+                bboxes1,
+                {
+                    "color": fitz.pdfcolor["red"],
+                    "fill": fitz.pdfcolor["blue"],
+                    "fill_opacity": 0.2,
+                },
+            ),
+            (
+                bboxes2,
+                {"color": None, "fill": fitz.pdfcolor["yellow"], "fill_opacity": 0.2},
+            ),
+            (
+                bboxes3,
+                {"color": fitz.pdfcolor["red"], "fill": None},
+            ),
+        )
+        for layer_bboxes, style in layers:
+            for bbox in layer_bboxes:
+                shape = new_page.new_shape()
+                shape.draw_rect(fitz.Rect(*bbox[0:4]))
+                shape.finish(**style)
+                shape.commit()
+
+        parent_dir = os.path.dirname(save_path)
+        # dirname is "" for a bare file name; os.makedirs("") would raise.
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+        doc.save(save_path)
+    finally:
+        # Release the document even when drawing or saving fails.
+        doc.close()
+
+
+def draw_layout_bbox_on_page(
+    raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str
+):
+    """
+    Draw the layout boxes of every page, numbered in order, into ``pdf_path``.
+
+    Args:
+        raw_pdf_doc: not used by this function; kept for signature
+            compatibility.
+        paras_dict: mapping whose values are per-page dicts providing
+            "page_idx" and "layout_bboxes"; each layout provides
+            "layout_bbox" (x0, y0, x1, y1) and "layout_label".
+        header: rectangle shaded on every page when truthy.
+        footer: rectangle shaded on every page when truthy.
+        pdf_path: PDF to draw into. An existing file is opened and saved
+            incrementally; otherwise a new document is created and saved.
+
+    NOTE(review): a newly created document contains no pages, so indexing it
+    with ``page_idx`` presumably fails when ``pdf_path`` does not exist yet --
+    confirm the intended behaviour with the callers.
+    """
+    border_offset = 1
+    font_size = 10
+
+    is_new_pdf = not os.path.exists(pdf_path)
+    # Open the existing PDF, or start from a new, empty document.
+    doc = fitz.open("") if is_new_pdf else fitz.open(pdf_path)
+    try:
+        for v in paras_dict.values():
+            page_idx = v["page_idx"]
+            layouts = v["layout_bboxes"]
+            page = doc[page_idx]
+            shape = page.new_shape()
+
+            for order, layout in enumerate(layouts):
+                x0, y0, x1, y1 = layout["layout_bbox"][:4]
+                # Layouts labelled "U" are highlighted with a pink fill.
+                fill_color = (
+                    fitz.pdfcolor["pink"] if layout["layout_label"] == "U" else None
+                )
+                # Narrow the box by 1 horizontally and widen it by
+                # border_offset vertically before drawing.
+                rect_box = [
+                    x0 + 1,
+                    y0 - border_offset,
+                    x1 - 1,
+                    y1 + border_offset,
+                ]
+                shape.draw_rect(fitz.Rect(*rect_box))
+                shape.finish(
+                    color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.4
+                )
+                # Write the layout's position in the list inside its box.
+                shape.insert_text(
+                    (rect_box[0] + 1, rect_box[1] + font_size),
+                    f"{order}",
+                    fontsize=font_size,
+                    color=(0, 0, 0),
+                )
+
+            # Shade the header and footer areas.
+            for region in (header, footer):
+                if region:
+                    shape.draw_rect(fitz.Rect(region))
+                    shape.finish(
+                        color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2
+                    )
+
+            shape.commit()
+
+        if is_new_pdf:
+            doc.save(pdf_path)
+        else:
+            doc.saveIncr()
+    finally:
+        # Release the document even when drawing or saving fails.
+        doc.close()
+
+
+def draw_layout_on_page(
+    raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str
+):
+    """
+    Draw the layout boxes with a red border on page ``page_idx`` of ``pdf_path``.
+
+    Deprecated: a ``DeprecationWarning`` is issued on every call. The earlier
+    ``@DeprecationWarning`` decorator replaced the function with an exception
+    instance, which made it impossible to call at all.
+
+    Args:
+        raw_pdf_doc: not used by this function; kept for signature
+            compatibility.
+        page_idx: index of the page to draw on.
+        page_layout: layouts providing "layout_bbox" (x0, y0, x1, y1),
+            "layout_label" and "sub_layout" (a list of nested layouts).
+        pdf_path: PDF to draw into. An existing file is opened and saved
+            incrementally; otherwise a new document is created and saved.
+
+    NOTE(review): a newly created document contains no pages, so indexing it
+    with ``page_idx`` presumably fails when ``pdf_path`` does not exist yet --
+    confirm the intended behaviour with the callers.
+    """
+    import warnings
+
+    warnings.warn(
+        "draw_layout_on_page is deprecated",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
+    def draw(shape, layout, fill_color=fitz.pdfcolor["pink"]):
+        # Only leaf layouts are drawn; containers just recurse into children.
+        border_offset = 1
+        rect_box = layout["layout_bbox"]
+        layout_label = layout["layout_label"]
+        children = layout["sub_layout"]
+        if len(children) == 0:
+            # Leaves labelled "U" are filled, all others are outlined only.
+            leaf_fill = fill_color if layout_label == "U" else None
+            rect_box = [
+                rect_box[0] + 1,
+                rect_box[1] - border_offset,
+                rect_box[2] - 1,
+                rect_box[3] + border_offset,
+            ]
+            shape.draw_rect(fitz.Rect(*rect_box))
+            shape.finish(color=fitz.pdfcolor["red"], fill=leaf_fill, fill_opacity=0.2)
+
+        # Nested layouts use the default fill colour of this helper.
+        for child in children:
+            draw(shape, child)
+        shape.commit()
+
+    is_new_pdf = not os.path.exists(pdf_path)
+    # Open the existing PDF, or start from a new, empty document.
+    doc = fitz.open("") if is_new_pdf else fitz.open(pdf_path)
+    try:
+        page = doc[page_idx]
+        shape = page.new_shape()
+        for layout in page_layout:
+            draw(shape, layout, fitz.pdfcolor["yellow"])
+
+        parent_dir = os.path.dirname(pdf_path)
+        # dirname is "" for a bare file name; os.makedirs("") would raise.
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+        if is_new_pdf:
+            doc.save(pdf_path)
+        else:
+            doc.saveIncr()
+    finally:
+        # Release the document even when drawing or saving fails.
+        doc.close()