Merge pull request #1120 from opendatalab/release-0.10.2

Release 0.10.2

Merge pull request #1120 from opendatalab/release-0.10.2
Release 0.10.2
8afff9ae · Xiaomeng Zhao · GitHub · 4df1eb74 · 7fdbb6e5 · 8afff9ae
Unverified Commit 8afff9ae authored Nov 27, 2024 by Xiaomeng Zhao Committed by GitHub Nov 27, 2024
20 changed files
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
 import enum
-import json
 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                    bbox_relative_pos, box_area, calculate_iou,
                                    calculate_overlap_area_in_bbox1_area_ratio,
                                    get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
    def get_model_list(self, page_no):
        return self.__model_list[page_no]
-if __name__ == '__main__':
-    drw = FileBasedDataReader(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path)
-        model_json_txt = drw.read(model_file_path).decode()
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-    if 1:
-        from magic_pdf.data.dataset import PymuDocDataset
-        model_list = json.loads(
-            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
-        )
-        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
-        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
-        for i in range(7):
-            print(magic_model.get_imgs(i))
--- a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
-import math
 import numpy as np
 from loguru import logger
@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list):
        if len(box_ocr_res) == 2:
            p1, p2, p3, p4 = box_ocr_res[0]
            text, score = box_ocr_res[1]
+            # logger.info(f"text: {text}, score: {score}")
+            if score < 0.6:  # 过滤低置信度的结果
+                continue
        else:
            p1, p2, p3, p4 = box_ocr_res
            text, score = "", 1
@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
    return ocr_result_list
-def calculate_angle_degrees(poly):
-    # 定义对角线的顶点
-    diagonal1 = (poly[0], poly[2])
-    diagonal2 = (poly[1], poly[3])
-    # 计算对角线的斜率
-    def slope(p1, p2):
-        return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
-    slope1 = slope(diagonal1[0], diagonal1[1])
-    slope2 = slope(diagonal2[0], diagonal2[1])
-    # 计算对角线与x轴的夹角（以弧度为单位）
-    angle1_radians = math.atan(slope1)
-    angle2_radians = math.atan(slope2)
-    # 将弧度转换为角度
-    angle1_degrees = math.degrees(angle1_radians)
-    angle2_degrees = math.degrees(angle2_radians)
-    # 取两条对角线与x轴夹角的平均值
-    average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
-    # logger.info(f"average_angle_degrees: {average_angle_degrees}")
-    return average_angle_degrees
 def calculate_is_angle(poly):
    p1, p2, p3, p4 = poly
    height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2

--- a/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
        if det and rec:
            ocr_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                img = preprocess_image(img)
                dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
                if not dt_boxes and not rec_res:
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
            return ocr_res
        elif det and not rec:
            ocr_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                img = preprocess_image(img)
                dt_boxes, elapse = self.text_detector(img)
                if dt_boxes is None:
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
        else:
            ocr_res = []
            cls_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                if not isinstance(img, list):
                    img = preprocess_image(img)
                    img = [img]

--- a/magic_pdf/para/block_continuation_processor.py
+++ b/magic_pdf/para/block_continuation_processor.py
--- a/magic_pdf/para/block_termination_processor.py
+++ b/magic_pdf/para/block_termination_processor.py
--- a/magic_pdf/para/commons.py
+++ b/magic_pdf/para/commons.py
-import sys
-from magic_pdf.libs.commons import fitz
-from termcolor import cprint
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-def open_pdf(pdf_path):
-    try:
-        pdf_document = fitz.open(pdf_path)  # type: ignore
-        return pdf_document
-    except Exception as e:
-        print(f"无法打开PDF文件：{pdf_path}。原因是：{e}")
-        raise e
-def print_green_on_red(text):
-    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
-def print_green(text):
-    print()
-    cprint(text, "green", attrs=["bold"], end="\n\n")
-def print_red(text):
-    print()
-    cprint(text, "red", attrs=["bold"], end="\n\n")
-def print_yellow(text):
-    print()
-    cprint(text, "yellow", attrs=["bold"], end="\n\n")
-def safe_get(dict_obj, key, default):
-    val = dict_obj.get(key)
-    if val is None:
-        return default
-    else:
-        return val
-def is_bbox_overlap(bbox1, bbox2):
-    """
-    This function checks if bbox1 and bbox2 overlap or not
-    Parameters
-    ----------
-    bbox1 : list
-        bbox1
-    bbox2 : list
-        bbox2
-    Returns
-    -------
-    bool
-        True if bbox1 and bbox2 overlap, else False
-    """
-    x0_1, y0_1, x1_1, y1_1 = bbox1
-    x0_2, y0_2, x1_2, y1_2 = bbox2
-    if x0_1 > x1_2 or x0_2 > x1_1:
-        return False
-    if y0_1 > y1_2 or y0_2 > y1_1:
-        return False
-    return True
-def is_in_bbox(bbox1, bbox2):
-    """
-    This function checks if bbox1 is in bbox2
-    Parameters
-    ----------
-    bbox1 : list
-        bbox1
-    bbox2 : list
-        bbox2
-    Returns
-    -------
-    bool
-        True if bbox1 is in bbox2, else False
-    """
-    x0_1, y0_1, x1_1, y1_1 = bbox1
-    x0_2, y0_2, x1_2, y1_2 = bbox2
-    if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
-        return True
-    else:
-        return False
-def calculate_para_bbox(lines):
-    """
-    This function calculates the minimum bbox of the paragraph
-    Parameters
-    ----------
-    lines : list
-        lines
-    Returns
-    -------
-    para_bbox : list
-        bbox of the paragraph
-    """
-    x0 = min(line["bbox"][0] for line in lines)
-    y0 = min(line["bbox"][1] for line in lines)
-    x1 = max(line["bbox"][2] for line in lines)
-    y1 = max(line["bbox"][3] for line in lines)
-    return [x0, y0, x1, y1]
-def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    This function checks if the line is right aligned from its neighbors
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line
-    next_line_bbox : list
-        bbox of the next line
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 for prev, 1 for next, 2 for both
-    Returns
-    -------
-    bool
-        True if the line is right aligned from its neighbors, False otherwise.
-    """
-    horizontal_ratio = 0.5
-    horizontal_thres = horizontal_ratio * avg_char_width
-    _, _, x1, _ = curr_line_bbox
-    _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
-    _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-    if direction == 0:
-        return abs(x1 - prev_x1) < horizontal_thres
-    elif direction == 1:
-        return abs(x1 - next_x1) < horizontal_thres
-    elif direction == 2:
-        return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
-    else:
-        return False
-def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    This function checks if the line is left aligned from its neighbors
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line
-    next_line_bbox : list
-        bbox of the next line
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 for prev, 1 for next, 2 for both
-    Returns
-    -------
-    bool
-        True if the line is left aligned from its neighbors, False otherwise.
-    """
-    horizontal_ratio = 0.5
-    horizontal_thres = horizontal_ratio * avg_char_width
-    x0, _, _, _ = curr_line_bbox
-    prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
-    next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-    if direction == 0:
-        return abs(x0 - prev_x0) < horizontal_thres
-    elif direction == 1:
-        return abs(x0 - next_x0) < horizontal_thres
-    elif direction == 2:
-        return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
-    else:
-        return False
-def end_with_punctuation(line_text):
-    """
-    This function checks if the line ends with punctuation marks
-    """
-    english_end_puncs = [".", "?", "!"]
-    chinese_end_puncs = ["。", "？", "！"]
-    end_puncs = english_end_puncs + chinese_end_puncs
-    last_non_space_char = None
-    for ch in line_text[::-1]:
-        if not ch.isspace():
-            last_non_space_char = ch
-            break
-    if last_non_space_char is None:
-        return False
-    return last_non_space_char in end_puncs
-def is_nested_list(lst):
-    if isinstance(lst, list):
-        return any(isinstance(sub, list) for sub in lst)
-    return False
--- a/magic_pdf/para/denoise.py
+++ b/magic_pdf/para/denoise.py
--- a/magic_pdf/para/draw.py
+++ b/magic_pdf/para/draw.py
-from magic_pdf.libs.commons import fitz
-from magic_pdf.para.commons import *
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-class DrawAnnos:
-    """
-    This class draws annotations on the pdf file
-    ----------------------------------------
-                Color Code
-    ----------------------------------------
-        Red: (1, 0, 0)
-        Green: (0, 1, 0)
-        Blue: (0, 0, 1)
-        Yellow: (1, 1, 0) - mix of red and green
-        Cyan: (0, 1, 1) - mix of green and blue
-        Magenta: (1, 0, 1) - mix of red and blue
-        White: (1, 1, 1) - red, green and blue full intensity
-        Black: (0, 0, 0) - no color component whatsoever
-        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
-        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
-    """
-    def __init__(self) -> None:
-        pass
-    def __is_nested_list(self, lst):
-        """
-        This function returns True if the given list is a nested list of any degree.
-        """
-        if isinstance(lst, list):
-            return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
-        return False
-    def __valid_rect(self, bbox):
-        # Ensure that the rectangle is not empty or invalid
-        if isinstance(bbox[0], list):
-            return False  # It's a nested list, hence it can't be valid rect
-        else:
-            return bbox[0] < bbox[2] and bbox[1] < bbox[3]
-    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
-        """
-        This function draws the nested boxes
-        Parameters
-        ----------
-        page : fitz.Page
-            page
-        nested_bbox : list
-            nested bbox
-        color : tuple
-            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
-        """
-        if self.__is_nested_list(nested_bbox):  # If it's a nested list
-            for bbox in nested_bbox:
-                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
-        elif self.__valid_rect(nested_bbox):  # If valid rectangle
-            para_rect = fitz.Rect(nested_bbox)
-            para_anno = page.add_rect_annot(para_rect)
-            para_anno.set_colors(stroke=color)  # draw with cyan color for combined paragraph
-            para_anno.set_border(width=1)
-            para_anno.update()
-    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
-        pdf_doc = open_pdf(input_pdf_path)
-        if pdf_dic is None:
-            pdf_dic = {}
-        if output_pdf_path is None:
-            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
-        for page_id, page in enumerate(pdf_doc):  # type: ignore
-            page_key = f"page_{page_id}"
-            for ele_key, ele_data in pdf_dic[page_key].items():
-                if ele_key == "para_blocks":
-                    para_blocks = ele_data
-                    for para_block in para_blocks:
-                        if "paras" in para_block.keys():
-                            paras = para_block["paras"]
-                            for para_key, para_content in paras.items():
-                                para_bbox = para_content["para_bbox"]
-                                # print(f"para_bbox: {para_bbox}")
-                                # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
-                                if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
-                                    color = (0, 1, 1)
-                                    self.__draw_nested_boxes(
-                                        page, para_bbox, color
-                                    )  # draw with cyan color for combined paragraph
-                                else:
-                                    if self.__valid_rect(para_bbox):
-                                        para_rect = fitz.Rect(para_bbox)
-                                        para_anno = page.add_rect_annot(para_rect)
-                                        para_anno.set_colors(stroke=(0, 1, 0))  # draw with green color for normal paragraph
-                                        para_anno.set_border(width=0.5)
-                                        para_anno.update()
-                                is_para_title = para_content["is_para_title"]
-                                if is_para_title:
-                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
-                                        color = (0, 0, 1)
-                                        self.__draw_nested_boxes(
-                                            page, para_content["para_bbox"], color
-                                        )  # draw with cyan color for combined title
-                                    else:
-                                        if self.__valid_rect(para_content["para_bbox"]):
-                                            para_rect = fitz.Rect(para_content["para_bbox"])
-                                            if self.__valid_rect(para_content["para_bbox"]):
-                                                para_anno = page.add_rect_annot(para_rect)
-                                                para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
-                                                para_anno.set_border(width=0.5)
-                                                para_anno.update()
-        pdf_doc.save(output_pdf_path)
-        pdf_doc.close()
--- a/magic_pdf/para/exceptions.py
+++ b/magic_pdf/para/exceptions.py
--- a/magic_pdf/para/layout_match_processor.py
+++ b/magic_pdf/para/layout_match_processor.py
-import math
-from magic_pdf.para.commons import *
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-class LayoutFilterProcessor:
-    def __init__(self) -> None:
-        pass
-    def batch_process_blocks(self, pdf_dict):
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
-                    layout_bbox_objs = blocks["layout_bboxes"]
-                    if layout_bbox_objs is None:
-                        continue
-                    layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
-                    # Use math.ceil function to enlarge each value of x0, y0, x1, y1 of each layout_bbox
-                    layout_bboxes = [
-                        [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
-                    ]
-                    para_blocks = blocks["para_blocks"]
-                    if para_blocks is None:
-                        continue
-                    for lb_bbox in layout_bboxes:
-                        for i, para_block in enumerate(para_blocks):
-                            para_bbox = para_block["bbox"]
-                            para_blocks[i]["in_layout"] = 0
-                            if is_in_bbox(para_bbox, lb_bbox):
-                                para_blocks[i]["in_layout"] = 1
-                    blocks["para_blocks"] = para_blocks
-        return pdf_dict
--- a/magic_pdf/para/para_split.py
+++ b/magic_pdf/para/para_split.py
--- a/magic_pdf/para/para_split_v2.py
+++ b/magic_pdf/para/para_split_v2.py
--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
--- a/magic_pdf/para/raw_processor.py
+++ b/magic_pdf/para/raw_processor.py
--- a/magic_pdf/para/stats.py
+++ b/magic_pdf/para/stats.py
--- a/magic_pdf/para/title_processor.py
+++ b/magic_pdf/para/title_processor.py
--- a/magic_pdf/pdf_parse_union_core.py
+++ b/magic_pdf/pdf_parse_union_core.py
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
--- a/magic_pdf/post_proc/__init__.py
+++ b/magic_pdf/post_proc/__init__.py
--- a/magic_pdf/post_proc/detect_para.py
+++ b/magic_pdf/post_proc/detect_para.py