Unverified Commit 8afff9ae authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1120 from opendatalab/release-0.10.2

Release 0.10.2
parents 4df1eb74 7fdbb6e5
import enum import enum
import json
from magic_pdf.config.model_block_type import ModelBlockTypeEnum from magic_pdf.config.model_block_type import ModelBlockTypeEnum
from magic_pdf.config.ocr_content_type import CategoryId, ContentType from magic_pdf.config.ocr_content_type import CategoryId, ContentType
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
from magic_pdf.data.dataset import Dataset from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance, from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
bbox_relative_pos, box_area, calculate_iou, bbox_relative_pos, box_area, calculate_iou,
calculate_overlap_area_in_bbox1_area_ratio, calculate_overlap_area_in_bbox1_area_ratio,
get_overlap_area) get_overlap_area)
from magic_pdf.libs.commons import fitz, join_path
from magic_pdf.libs.coordinate_transform import get_scale_ratio from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.local_math import float_gt from magic_pdf.libs.local_math import float_gt
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
...@@ -1048,29 +1044,3 @@ class MagicModel: ...@@ -1048,29 +1044,3 @@ class MagicModel:
def get_model_list(self, page_no): def get_model_list(self, page_no):
return self.__model_list[page_no] return self.__model_list[page_no]
if __name__ == '__main__':
    # Ad-hoc manual check: builds a MagicModel from files on a developer
    # machine and prints the images reported for the first seven pages.
    drw = FileBasedDataReader(r'D:/project/20231108code-clean')

    # Disabled scenario (the condition is the constant 0): loads a PDF and its
    # model JSON relative to the reader's base directory.
    if 0:
        pdf_file_path = r'linshixuqiu\19983-00.pdf'
        model_file_path = r'linshixuqiu\19983-00_new.json'
        pdf_bytes = drw.read(pdf_file_path)
        model_json_txt = drw.read(model_file_path).decode()
        model_list = json.loads(model_json_txt)
        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
        img_bucket_path = 'imgs'
        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
        pdf_docs = fitz.open('pdf', pdf_bytes)
        magic_model = MagicModel(model_list, pdf_docs)

    # Active scenario (the condition is the constant 1).
    if 1:
        from magic_pdf.data.dataset import PymuDocDataset

        model_json = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
        model_list = json.loads(model_json)
        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
        for page_no in range(7):
            print(magic_model.get_imgs(page_no))
import math
import numpy as np import numpy as np
from loguru import logger from loguru import logger
...@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list): ...@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list):
if len(box_ocr_res) == 2: if len(box_ocr_res) == 2:
p1, p2, p3, p4 = box_ocr_res[0] p1, p2, p3, p4 = box_ocr_res[0]
text, score = box_ocr_res[1] text, score = box_ocr_res[1]
# logger.info(f"text: {text}, score: {score}")
if score < 0.6: # 过滤低置信度的结果
continue
else: else:
p1, p2, p3, p4 = box_ocr_res p1, p2, p3, p4 = box_ocr_res
text, score = "", 1 text, score = "", 1
...@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list): ...@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
return ocr_result_list return ocr_result_list
def calculate_angle_degrees(poly):
    """Return the absolute mean angle, in degrees, of a quadrilateral's diagonals.

    The two diagonals run from ``poly[0]`` to ``poly[2]`` and from ``poly[1]``
    to ``poly[3]``. Each diagonal's angle to the x axis is ``atan`` of its
    slope, converted to degrees; the result is the absolute value of the
    average of the two angles.
    """

    def inclination(start, end):
        # A vertical diagonal has no finite slope; atan(inf) gives 90 degrees.
        if end[0] != start[0]:
            gradient = (end[1] - start[1]) / (end[0] - start[0])
        else:
            gradient = float('inf')
        return math.degrees(math.atan(gradient))

    first_angle = inclination(poly[0], poly[2])
    second_angle = inclination(poly[1], poly[3])
    return abs((first_angle + second_angle) / 2)
def calculate_is_angle(poly): def calculate_is_angle(poly):
p1, p2, p3, p4 = poly p1, p2, p3, p4 = poly
height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2 height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
......
...@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR): ...@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
if det and rec: if det and rec:
ocr_res = [] ocr_res = []
for idx, img in enumerate(imgs): for img in imgs:
img = preprocess_image(img) img = preprocess_image(img)
dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res) dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
if not dt_boxes and not rec_res: if not dt_boxes and not rec_res:
...@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR): ...@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
return ocr_res return ocr_res
elif det and not rec: elif det and not rec:
ocr_res = [] ocr_res = []
for idx, img in enumerate(imgs): for img in imgs:
img = preprocess_image(img) img = preprocess_image(img)
dt_boxes, elapse = self.text_detector(img) dt_boxes, elapse = self.text_detector(img)
if dt_boxes is None: if dt_boxes is None:
...@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR): ...@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
else: else:
ocr_res = [] ocr_res = []
cls_res = [] cls_res = []
for idx, img in enumerate(imgs): for img in imgs:
if not isinstance(img, list): if not isinstance(img, list):
img = preprocess_image(img) img = preprocess_image(img)
img = [img] img = [img]
......
This diff is collapsed.
This diff is collapsed.
import sys
from magic_pdf.libs.commons import fitz
from termcolor import cprint
# Switch console output to UTF-8 so the non-ASCII messages printed by this
# module can be encoded. ``reconfigure`` only exists on Python 3.7+ text
# streams, and ``sys.stdout`` may be ``None`` or replaced by an object without
# it, so the method is looked up defensively instead of relying on the major
# version alone; otherwise importing this module could raise AttributeError.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")
def open_pdf(pdf_path):
    """
    This function opens a PDF with ``fitz`` and returns the document

    If opening fails for any reason, the path and the error are printed and
    the same exception is raised again.

    Parameters
    ----------
    pdf_path : str
        path passed to ``fitz.open``

    Returns
    -------
    pdf_document
        whatever ``fitz.open`` returns for the path
    """
    try:
        opened = fitz.open(pdf_path)  # type: ignore
    except Exception as err:
        print(f"无法打开PDF文件:{pdf_path}。原因是:{err}")
        raise err
    else:
        return opened
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, ending with two newlines."""
    style = {"color": "green", "on_color": "on_red", "attrs": ["bold"]}
    cprint(text, end="\n\n", **style)
def print_green(text):
    """Print an empty line, then ``text`` in bold green, ending with two newlines."""
    print()
    style = {"color": "green", "attrs": ["bold"]}
    cprint(text, end="\n\n", **style)
def print_red(text):
    """Print an empty line, then ``text`` in bold red, ending with two newlines."""
    print()
    style = {"color": "red", "attrs": ["bold"]}
    cprint(text, end="\n\n", **style)
def print_yellow(text):
    """Print an empty line, then ``text`` in bold yellow, ending with two newlines."""
    print()
    style = {"color": "yellow", "attrs": ["bold"]}
    cprint(text, end="\n\n", **style)
def safe_get(dict_obj, key, default):
    """
    This function reads ``key`` from ``dict_obj`` with a fallback

    ``default`` is returned both when the key is absent and when it maps to
    ``None``; any other stored value (including 0 or "") is returned as is.
    """
    found = dict_obj.get(key)
    return default if found is None else found
def is_bbox_overlap(bbox1, bbox2):
    """
    This function tells whether two boxes share at least one point

    Boxes that merely touch along an edge or at a corner count as overlapping,
    because only strict "greater than" separation is treated as disjoint.

    Parameters
    ----------
    bbox1 : list
        first box as (x0, y0, x1, y1)
    bbox2 : list
        second box as (x0, y0, x1, y1)

    Returns
    -------
    bool
        False if the boxes are separated on the x or y axis, else True
    """
    left_a, top_a, right_a, bottom_a = bbox1
    left_b, top_b, right_b, bottom_b = bbox2

    apart_on_x = left_a > right_b or left_b > right_a
    apart_on_y = top_a > bottom_b or top_b > bottom_a
    return not (apart_on_x or apart_on_y)
def is_in_bbox(bbox1, bbox2):
    """
    This function tells whether bbox1 lies entirely within bbox2

    Shared edges are allowed: a box is considered to be inside itself.

    Parameters
    ----------
    bbox1 : list
        candidate inner box as (x0, y0, x1, y1)
    bbox2 : list
        candidate outer box as (x0, y0, x1, y1)

    Returns
    -------
    bool
        True if every edge of bbox1 is on or inside the matching edge of bbox2
    """
    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2

    starts_inside = inner_x0 >= outer_x0 and inner_y0 >= outer_y0
    ends_inside = inner_x1 <= outer_x1 and inner_y1 <= outer_y1
    return bool(starts_inside and ends_inside)
def calculate_para_bbox(lines):
    """
    This function computes the smallest box enclosing all line boxes

    Parameters
    ----------
    lines : list
        line dicts, each holding its box under the "bbox" key as
        (x0, y0, x1, y1)

    Returns
    -------
    para_bbox : list
        [min x0, min y0, max x1, max y1] over all lines
    """

    def extreme(pick, coord):
        # Apply min or max to one coordinate across every line box.
        return pick(line["bbox"][coord] for line in lines)

    return [extreme(min, 0), extreme(min, 1), extreme(max, 2), extreme(max, 3)]
def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    This function compares the right edge of a line with its neighbors

    Two right edges are considered aligned when they differ by less than half
    of ``avg_char_width``. A missing (falsy) neighbor box is treated as
    (0, 0, 0, 0).

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list
        bbox of the previous line
    next_line_bbox : list
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is right aligned with the selected neighbors,
        False otherwise (including any other ``direction`` value).
    """
    tolerance = 0.5 * avg_char_width

    _, _, curr_right, _ = curr_line_bbox
    _, _, prev_right, _ = prev_line_bbox or (0, 0, 0, 0)
    _, _, next_right, _ = next_line_bbox or (0, 0, 0, 0)

    def lines_up_with(other_right):
        return abs(curr_right - other_right) < tolerance

    if direction == 0:
        return lines_up_with(prev_right)
    if direction == 1:
        return lines_up_with(next_right)
    if direction == 2:
        return lines_up_with(prev_right) and lines_up_with(next_right)
    return False
def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    This function compares the left edge of a line with its neighbors

    Two left edges are considered aligned when they differ by less than half
    of ``avg_char_width``. A missing (falsy) neighbor box is treated as
    (0, 0, 0, 0).

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list
        bbox of the previous line
    next_line_bbox : list
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is left aligned with the selected neighbors,
        False otherwise (including any other ``direction`` value).
    """
    tolerance = 0.5 * avg_char_width

    curr_left, _, _, _ = curr_line_bbox
    prev_left, _, _, _ = prev_line_bbox or (0, 0, 0, 0)
    next_left, _, _, _ = next_line_bbox or (0, 0, 0, 0)

    def lines_up_with(other_left):
        return abs(curr_left - other_left) < tolerance

    if direction == 0:
        return lines_up_with(prev_left)
    if direction == 1:
        return lines_up_with(next_left)
    if direction == 2:
        return lines_up_with(prev_left) and lines_up_with(next_left)
    return False
def end_with_punctuation(line_text):
    """
    This function checks if the line ends with a sentence-ending punctuation mark

    Trailing whitespace is ignored. Both ASCII marks and their CJK
    counterparts are recognised.

    Parameters
    ----------
    line_text : str
        text of the line

    Returns
    -------
    bool
        True if the last non-whitespace character is one of the end marks,
        False otherwise (including empty or whitespace-only text).
    """
    # ASCII ".", "?", "!" plus the ideographic full stop (U+3002) and the
    # full-width question mark (U+FF1F) and exclamation mark (U+FF01). The
    # full-width marks are written as escapes so they cannot be silently
    # collapsed to their ASCII look-alikes, which would leave Chinese
    # questions and exclamations undetected.
    end_puncs = frozenset(".?!\u3002\uff1f\uff01")

    stripped = line_text.rstrip()
    if not stripped:
        return False
    return stripped[-1] in end_puncs
def is_nested_list(lst):
    """
    This function checks if ``lst`` is a list that directly contains a list

    Anything that is not a list, and any list without list elements, gives
    False.
    """
    if not isinstance(lst, list):
        return False
    for item in lst:
        if isinstance(item, list):
            return True
    return False
This diff is collapsed.
from magic_pdf.libs.commons import fitz
from magic_pdf.para.commons import *
# Switch console output to UTF-8. ``reconfigure`` only exists on Python 3.7+
# text streams, and ``sys.stdout`` may be ``None`` or replaced by an object
# without it, so the method is looked up defensively instead of relying on the
# major version alone; otherwise importing this module could raise
# AttributeError.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")
class DrawAnnos:
    """
    This class draws paragraph annotations on the pdf file

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst):
        """
        This function returns True if the given list contains at least one list.
        """
        # An element that is itself a nested list is necessarily a list, so
        # checking the direct children is sufficient; no recursion is needed.
        return isinstance(lst, list) and any(isinstance(i, list) for i in lst)

    def __valid_rect(self, bbox):
        """
        This function returns True if bbox is a flat (x0, y0, x1, y1) box with
        positive width and height. Nested lists, boxes with fewer than four
        coordinates and non-indexable values are reported as invalid instead
        of raising.
        """
        try:
            x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
        except (TypeError, IndexError, KeyError):
            return False
        if isinstance(x0, list):
            return False  # It's a nested list, hence it can't be valid rect
        return x0 < x1 and y0 < y1

    def __add_rect_annot(self, page, bbox, color, width):
        """
        This function adds one rectangle annotation with the given stroke
        color and border width to the page.
        """
        para_anno = page.add_rect_annot(fitz.Rect(bbox))
        para_anno.set_colors(stroke=color)
        para_anno.set_border(width=width)
        para_anno.update()

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        This function draws the nested boxes

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
        """
        if self.__is_nested_list(nested_bbox):  # If it's a nested list
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
        elif self.__valid_rect(nested_bbox):  # If valid rectangle
            self.__add_rect_annot(page, nested_bbox, color, 1)

    def __draw_para_bbox(self, page, para_bbox, nested_color, flat_color):
        """
        This function draws one paragraph bbox: a nested list with more than
        one entry is drawn box by box in ``nested_color`` (border width 1),
        while a valid flat box is drawn in ``flat_color`` (border width 0.5).
        Anything else is skipped.
        """
        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
            self.__draw_nested_boxes(page, para_bbox, nested_color)
        elif self.__valid_rect(para_bbox):
            self.__add_rect_annot(page, para_bbox, flat_color, 0.5)

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        This function annotates every paragraph of the pdf and saves a copy

        Parameters
        ----------
        input_pdf_path : str
            path of the pdf to annotate
        pdf_dic : dict
            per-page data keyed by "page_<index>"; each page may hold a
            "para_blocks" list whose items may hold a "paras" dict. None is
            treated as an empty dict, so the pdf is saved without annotations.
        output_pdf_path : str
            path of the annotated pdf; when None, ".pdf" in the input path is
            replaced with "_anno.pdf"
        """
        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):  # type: ignore
                # Pages without an entry (or with a None entry) are left
                # untouched instead of raising KeyError.
                page_data = pdf_dic.get(f"page_{page_id}") or {}

                for para_block in page_data.get("para_blocks") or []:
                    if "paras" not in para_block:
                        continue

                    for para_content in para_block["paras"].values():
                        para_bbox = para_content["para_bbox"]

                        # cyan for combined paragraphs, green for normal ones
                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))

                        # titles are drawn again on top, in blue
                        if para_content["is_para_title"]:
                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))

            pdf_doc.save(output_pdf_path)
        finally:
            # Release the document even when annotating or saving fails.
            pdf_doc.close()
This diff is collapsed.
import math
from magic_pdf.para.commons import *
# Switch console output to UTF-8. ``reconfigure`` only exists on Python 3.7+
# text streams, and ``sys.stdout`` may be ``None`` or replaced by an object
# without it, so the method is looked up defensively instead of relying on the
# major version alone; otherwise importing this module could raise
# AttributeError.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")
class LayoutFilterProcessor:
    """
    This class marks every paragraph block with whether it lies inside one of
    the layout bboxes of its page.
    """

    def __init__(self) -> None:
        pass

    def batch_process_blocks(self, pdf_dict):
        """
        This function sets the "in_layout" flag on the paragraph blocks of every page

        Only entries whose key starts with "page_" and that contain both
        "layout_bboxes" and "para_blocks" are processed; pages where either
        value is None are skipped. Layout coordinates are rounded up with
        math.ceil before the containment test.

        Parameters
        ----------
        pdf_dict : dict
            per-page data; it is modified in place

        Returns
        -------
        pdf_dict : dict
            the same dict, where each processed paragraph block has
            "in_layout" set to 1 if its "bbox" is inside at least one layout
            bbox of the page, else 0
        """
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue
            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
                continue

            layout_bbox_objs = blocks["layout_bboxes"]
            para_blocks = blocks["para_blocks"]
            if layout_bbox_objs is None or para_blocks is None:
                continue

            # Use math.ceil function to round up each value of x0, y0, x1, y1 of each layout_bbox
            layout_bboxes = [
                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
            ]

            for para_block in para_blocks:
                para_bbox = para_block["bbox"]
                # Decide the flag once per block over ALL layout bboxes.
                # Resetting it inside a per-layout loop would let the last
                # layout bbox overwrite a match found in an earlier one.
                in_any_layout = any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes)
                para_block["in_layout"] = 1 if in_any_layout else 0

        return pdf_dict
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment