Unverified Commit 6575adea authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #808 from opendatalab/dev

Dev->0.9 release
parents 82dd7ac5 37dd55c4
...@@ -70,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -70,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 3rd.拼image_footnote
if block['type'] == BlockType.ImageFootnote: if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block) + ' \n'
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
for block in para_block['blocks']: # 1st.拼table_caption for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption: if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼table_body for block in para_block['blocks']: # 2nd.拼table_body
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
for line in block['lines']: for line in block['lines']:
...@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 3rd.拼table_footnote for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block) + ' \n'
if para_text.strip() == '': if para_text.strip() == '':
continue continue
...@@ -180,18 +180,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason ...@@ -180,18 +180,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
'text_format': 'latex', 'text_format': 'latex',
} }
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
para_content = {'type': 'image'} para_content = {'type': 'image', 'img_caption': [], 'img_footnote': []}
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody: if block['type'] == BlockType.ImageBody:
para_content['img_path'] = join_path( para_content['img_path'] = join_path(
img_buket_path, img_buket_path,
block['lines'][0]['spans'][0]['image_path']) block['lines'][0]['spans'][0]['image_path'])
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_content['img_caption'] = merge_para_with_text(block) para_content['img_caption'].append(merge_para_with_text(block))
if block['type'] == BlockType.ImageFootnote: if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block) para_content['img_footnote'].append(merge_para_with_text(block))
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
para_content = {'type': 'table'} para_content = {'type': 'table', 'table_caption': [], 'table_footnote': []}
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
if block["lines"][0]["spans"][0].get('latex', ''): if block["lines"][0]["spans"][0].get('latex', ''):
...@@ -200,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason ...@@ -200,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n" para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
if block['type'] == BlockType.TableCaption: if block['type'] == BlockType.TableCaption:
para_content['table_caption'] = merge_para_with_text(block) para_content['table_caption'].append(merge_para_with_text(block))
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_content['table_footnote'] = merge_para_with_text(block) para_content['table_footnote'].append(merge_para_with_text(block))
para_content['page_idx'] = page_idx para_content['page_idx'] = page_idx
......
...@@ -23,14 +23,20 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt" ...@@ -23,14 +23,20 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
TABLE_MASTER_DIR = "table_structure_tablemaster_infer/" TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
# pp detect model dir # pp detect model dir
DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer" DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
# pp rec model dir # pp rec model dir
REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer" REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
# pp rec char dict path # pp rec char dict path
REC_CHAR_DICT = "ppocr_keys_v1.txt" REC_CHAR_DICT = "ppocr_keys_v1.txt"
# pp rec copy rec directory
PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
# pp rec copy det directory
PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
class MODEL_NAME: class MODEL_NAME:
# pp table structure algorithm # pp table structure algorithm
......
...@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = [] layout_bbox_list = []
table_type_order = {
'table_caption': 1,
'table_body': 2,
'table_footnote': 3
}
for page in pdf_info: for page in pdf_info:
page_block_list = [] page_block_list = []
for block in page['para_blocks']: for block in page['para_blocks']:
if block['type'] in [
BlockType.Text,
BlockType.Title,
BlockType.InterlineEquation,
BlockType.List,
BlockType.Index,
]:
bbox = block['bbox'] bbox = block['bbox']
page_block_list.append(bbox) page_block_list.append(bbox)
elif block['type'] in [BlockType.Image]:
for sub_block in block['blocks']:
bbox = sub_block['bbox']
page_block_list.append(bbox)
elif block['type'] in [BlockType.Table]:
sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
for sub_block in sorted_blocks:
bbox = sub_block['bbox']
page_block_list.append(bbox)
layout_bbox_list.append(page_block_list) layout_bbox_list.append(page_block_list)
pdf_docs = fitz.open('pdf', pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
...@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
for i, page in enumerate(pdf_docs): for i, page in enumerate(pdf_docs):
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True) draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color ! # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True) draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True) draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True) draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True) # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True) draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True), draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
...@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
for page in pdf_info: for page in pdf_info:
page_line_list = [] page_line_list = []
for block in page['preproc_blocks']: for block in page['preproc_blocks']:
if block['type'] in ['text', 'title', 'interline_equation']: if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
for line in block['lines']: for line in block['lines']:
bbox = line['bbox'] bbox = line['bbox']
index = line['index'] index = line['index']
page_line_list.append({'index': index, 'bbox': bbox}) page_line_list.append({'index': index, 'bbox': bbox})
if block['type'] in ['table', 'image']: if block['type'] in [BlockType.Image, BlockType.Table]:
bbox = block['bbox'] for sub_block in block['blocks']:
index = block['index'] if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
for line in sub_block['virtual_lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
for line in sub_block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox}) page_line_list.append({'index': index, 'bbox': bbox})
# for line in block['lines']:
# bbox = line['bbox']
# index = line['index']
# page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index']) sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes) layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
......
import enum
import json import json
from magic_pdf.data.dataset import Dataset from magic_pdf.data.dataset import Dataset
...@@ -10,6 +11,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio ...@@ -10,6 +11,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.local_math import float_gt from magic_pdf.libs.local_math import float_gt
from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
from magic_pdf.libs.ocr_content_type import CategoryId, ContentType from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
...@@ -17,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6 ...@@ -17,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1 MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
class PosRelationEnum(enum.Enum):
LEFT = 'left'
RIGHT = 'right'
UP = 'up'
BOTTOM = 'bottom'
ALL = 'all'
class MagicModel: class MagicModel:
"""每个函数没有得到元素的时候返回空list.""" """每个函数没有得到元素的时候返回空list."""
...@@ -124,8 +134,7 @@ class MagicModel: ...@@ -124,8 +134,7 @@ class MagicModel:
l1 = bbox1[2] - bbox1[0] l1 = bbox1[2] - bbox1[0]
l2 = bbox2[2] - bbox2[0] l2 = bbox2[2] - bbox2[0]
min_l, max_l = min(l1, l2), max(l1, l2) if l2 > l1 and (l2 - l1) / l1 > 0.3:
if (max_l - min_l) * 1.0 / max_l > 0.4:
return float('inf') return float('inf')
return bbox_distance(bbox1, bbox2) return bbox_distance(bbox1, bbox2)
...@@ -591,9 +600,24 @@ class MagicModel: ...@@ -591,9 +600,24 @@ class MagicModel:
return ret, total_subject_object_dis return ret, total_subject_object_dis
def __tie_up_category_by_distance_v2( def __tie_up_category_by_distance_v2(
self, page_no, subject_category_id, object_category_id self,
page_no: int,
subject_category_id: int,
object_category_id: int,
priority_pos: PosRelationEnum,
): ):
"""_summary_
Args:
page_no (int): _description_
subject_category_id (int): _description_
object_category_id (int): _description_
priority_pos (PosRelationEnum): _description_
Returns:
_type_: _description_
"""
AXIS_MULPLICITY = 0.5
subjects = self.__reduct_overlap( subjects = self.__reduct_overlap(
list( list(
map( map(
...@@ -617,67 +641,244 @@ class MagicModel: ...@@ -617,67 +641,244 @@ class MagicModel:
) )
) )
) )
M = len(objects)
subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
dis = [[float('inf')] * len(subjects) for _ in range(len(objects))]
sub_obj_map_h = {i: [] for i in range(len(subjects))}
dis_by_directions = {
'top': [[-1, float('inf')]] * M,
'bottom': [[-1, float('inf')]] * M,
'left': [[-1, float('inf')]] * M,
'right': [[-1, float('inf')]] * M,
}
for i, obj in enumerate(objects): for i, obj in enumerate(objects):
l_x_axis, l_y_axis = (
obj['bbox'][2] - obj['bbox'][0],
obj['bbox'][3] - obj['bbox'][1],
)
axis_unit = min(l_x_axis, l_y_axis)
for j, sub in enumerate(subjects): for j, sub in enumerate(subjects):
dis[i][j] = self._bbox_distance(sub['bbox'], obj['bbox'])
sub_obj_map_h = {i: [] for i in range(len(subjects))} bbox1, bbox2, _ = _remove_overlap_between_bbox(
for i in range(len(objects)): objects[i]['bbox'], subjects[j]['bbox']
min_l_idx = 0 )
for j in range(1, len(subjects)): left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
if dis[i][j] == float('inf'): flags = [left, right, bottom, top]
if sum([1 if v else 0 for v in flags]) > 1:
continue continue
if dis[i][j] < dis[i][min_l_idx]:
min_l_idx = j
if dis[i][min_l_idx] < float('inf'): if left:
sub_obj_map_h[min_l_idx].append(i) if dis_by_directions['left'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['left'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if right:
if dis_by_directions['right'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['right'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if bottom:
if dis_by_directions['bottom'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['bottom'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if top:
if dis_by_directions['top'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['top'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if (
dis_by_directions['top'][i][1] != float('inf')
and dis_by_directions['bottom'][i][1] != float('inf')
and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP)
):
RATIO = 3
if (
abs(
dis_by_directions['top'][i][1]
- dis_by_directions['bottom'][i][1]
)
< RATIO * axis_unit
):
if priority_pos == PosRelationEnum.BOTTOM:
sub_obj_map_h[dis_by_directions['bottom'][i][0]].append(i)
else: else:
print(i, 'no nearest') sub_obj_map_h[dis_by_directions['top'][i][0]].append(i)
continue
if dis_by_directions['left'][i][1] != float('inf') or dis_by_directions[
'right'
][i][1] != float('inf'):
if dis_by_directions['left'][i][1] != float(
'inf'
) and dis_by_directions['right'][i][1] != float('inf'):
if AXIS_MULPLICITY * axis_unit >= abs(
dis_by_directions['left'][i][1]
- dis_by_directions['right'][i][1]
):
left_sub_bbox = subjects[dis_by_directions['left'][i][0]][
'bbox'
]
right_sub_bbox = subjects[dis_by_directions['right'][i][0]][
'bbox'
]
left_sub_bbox_y_axis = left_sub_bbox[3] - left_sub_bbox[1]
right_sub_bbox_y_axis = right_sub_bbox[3] - right_sub_bbox[1]
if (
abs(left_sub_bbox_y_axis - l_y_axis)
+ dis_by_directions['left'][i][0]
> abs(right_sub_bbox_y_axis - l_y_axis)
+ dis_by_directions['right'][i][0]
):
left_or_right = dis_by_directions['right'][i]
else:
left_or_right = dis_by_directions['left'][i]
else:
left_or_right = dis_by_directions['left'][i]
if left_or_right[1] > dis_by_directions['right'][i][1]:
left_or_right = dis_by_directions['right'][i]
else:
left_or_right = dis_by_directions['left'][i]
if left_or_right[1] == float('inf'):
left_or_right = dis_by_directions['right'][i]
else:
left_or_right = [-1, float('inf')]
if dis_by_directions['top'][i][1] != float('inf') or dis_by_directions[
'bottom'
][i][1] != float('inf'):
if dis_by_directions['top'][i][1] != float('inf') and dis_by_directions[
'bottom'
][i][1] != float('inf'):
if AXIS_MULPLICITY * axis_unit >= abs(
dis_by_directions['top'][i][1]
- dis_by_directions['bottom'][i][1]
):
top_bottom = subjects[dis_by_directions['bottom'][i][0]]['bbox']
bottom_top = subjects[dis_by_directions['top'][i][0]]['bbox']
top_bottom_x_axis = top_bottom[2] - top_bottom[0]
bottom_top_x_axis = bottom_top[2] - bottom_top[0]
if (
abs(top_bottom_x_axis - l_x_axis)
+ dis_by_directions['bottom'][i][1]
> abs(bottom_top_x_axis - l_x_axis)
+ dis_by_directions['top'][i][1]
):
top_or_bottom = dis_by_directions['top'][i]
else:
top_or_bottom = dis_by_directions['bottom'][i]
else:
top_or_bottom = dis_by_directions['top'][i]
if top_or_bottom[1] > dis_by_directions['bottom'][i][1]:
top_or_bottom = dis_by_directions['bottom'][i]
else:
top_or_bottom = dis_by_directions['top'][i]
if top_or_bottom[1] == float('inf'):
top_or_bottom = dis_by_directions['bottom'][i]
else:
top_or_bottom = [-1, float('inf')]
if left_or_right[1] != float('inf') or top_or_bottom[1] != float('inf'):
if left_or_right[1] != float('inf') and top_or_bottom[1] != float(
'inf'
):
if AXIS_MULPLICITY * axis_unit >= abs(
left_or_right[1] - top_or_bottom[1]
):
y_axis_bbox = subjects[left_or_right[0]]['bbox']
x_axis_bbox = subjects[top_or_bottom[0]]['bbox']
if (
abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis) / l_x_axis
> abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis)
/ l_y_axis
):
sub_obj_map_h[left_or_right[0]].append(i)
else:
sub_obj_map_h[top_or_bottom[0]].append(i)
else:
if left_or_right[1] > top_or_bottom[1]:
sub_obj_map_h[top_or_bottom[0]].append(i)
else:
sub_obj_map_h[left_or_right[0]].append(i)
else:
if left_or_right[1] != float('inf'):
sub_obj_map_h[left_or_right[0]].append(i)
else:
sub_obj_map_h[top_or_bottom[0]].append(i)
ret = [] ret = []
for i in sub_obj_map_h.keys(): for i in sub_obj_map_h.keys():
ret.append( ret.append(
{ {
'sub_bbox': subjects[i]['bbox'], 'sub_bbox': {
'obj_bboxes': [objects[j]['bbox'] for j in sub_obj_map_h[i]], 'bbox': subjects[i]['bbox'],
'score': subjects[i]['score'],
},
'obj_bboxes': [
{'score': objects[j]['score'], 'bbox': objects[j]['bbox']}
for j in sub_obj_map_h[i]
],
'sub_idx': i, 'sub_idx': i,
} }
) )
return ret return ret
def get_imgs_v2(self, page_no: int): def get_imgs_v2(self, page_no: int):
with_captions = self.__tie_up_category_by_distance_v2(page_no, 3, 4) with_captions = self.__tie_up_category_by_distance_v2(
page_no, 3, 4, PosRelationEnum.BOTTOM
)
with_footnotes = self.__tie_up_category_by_distance_v2( with_footnotes = self.__tie_up_category_by_distance_v2(
page_no, 3, CategoryId.ImageFootnote page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
) )
ret = [] ret = []
for v in with_captions: for v in with_captions:
record = { record = {
'image_bbox': v['sub_bbox'], 'image_body': v['sub_bbox'],
'image_caption_bbox_list': v['obj_bboxes'], 'image_caption_list': v['obj_bboxes'],
} }
filter_idx = v['sub_idx'] filter_idx = v['sub_idx']
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes)) d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
record['image_footnote_bbox_list'] = d['obj_bboxes'] record['image_footnote_list'] = d['obj_bboxes']
ret.append(record) ret.append(record)
return ret return ret
def get_tables_v2(self, page_no: int) -> list: def get_tables_v2(self, page_no: int) -> list:
with_captions = self.__tie_up_category_by_distance_v2(page_no, 5, 6) with_captions = self.__tie_up_category_by_distance_v2(
with_footnotes = self.__tie_up_category_by_distance_v2(page_no, 5, 7) page_no, 5, 6, PosRelationEnum.UP
)
with_footnotes = self.__tie_up_category_by_distance_v2(
page_no, 5, 7, PosRelationEnum.ALL
)
ret = [] ret = []
for v in with_captions: for v in with_captions:
record = { record = {
'table_bbox': v['sub_bbox'], 'table_body': v['sub_bbox'],
'table_caption_bbox_list': v['obj_bboxes'], 'table_caption_list': v['obj_bboxes'],
} }
filter_idx = v['sub_idx'] filter_idx = v['sub_idx']
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes)) d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
record['table_footnote_bbox_list'] = d['obj_bboxes'] record['table_footnote_list'] = d['obj_bboxes']
ret.append(record) ret.append(record)
return ret return ret
......
from loguru import logger from loguru import logger
import os import os
import time import time
from pathlib import Path
import shutil
from magic_pdf.libs.Constants import * from magic_pdf.libs.Constants import *
from magic_pdf.libs.clean_memory import clean_memory from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.model.model_list import AtomicModel from magic_pdf.model.model_list import AtomicModel
...@@ -37,19 +38,24 @@ except ImportError as e: ...@@ -37,19 +38,24 @@ except ImportError as e:
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel # from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
from magic_pdf.model.ppTableModel import ppTableModel from magic_pdf.model.ppTableModel import ppTableModel
def table_model_init(table_model_type, model_path, max_time, _device_='cpu'): def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
if table_model_type == MODEL_NAME.STRUCT_EQTABLE: if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
table_model = StructTableModel(model_path, max_time=max_time, device=_device_) # table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
else: logger.error("StructEqTable is under upgrade, the current version does not support it.")
exit(1)
elif table_model_type == MODEL_NAME.TABLE_MASTER:
config = { config = {
"model_dir": model_path, "model_dir": model_path,
"device": _device_ "device": _device_
} }
table_model = ppTableModel(config) table_model = ppTableModel(config)
else:
logger.error("table model type not allow")
exit(1)
return table_model return table_model
...@@ -83,7 +89,7 @@ def doclayout_yolo_model_init(weight): ...@@ -83,7 +89,7 @@ def doclayout_yolo_model_init(weight):
return model return model
def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=2.4): def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=1.8):
if lang is not None: if lang is not None:
model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio) model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
else: else:
...@@ -297,6 +303,17 @@ class CustomPEKModel: ...@@ -297,6 +303,17 @@ class CustomPEKModel:
device=self.device device=self.device
) )
home_directory = Path.home()
det_source = os.path.join(models_dir, table_model_dir, DETECT_MODEL_DIR)
rec_source = os.path.join(models_dir, table_model_dir, REC_MODEL_DIR)
det_dest_dir = os.path.join(home_directory, PP_DET_DIRECTORY)
rec_dest_dir = os.path.join(home_directory, PP_REC_DIRECTORY)
if not os.path.exists(det_dest_dir):
shutil.copytree(det_source, det_dest_dir)
if not os.path.exists(rec_dest_dir):
shutil.copytree(rec_source, rec_dest_dir)
logger.info('DocAnalysis init done!') logger.info('DocAnalysis init done!')
def __call__(self, image): def __call__(self, image):
...@@ -314,7 +331,7 @@ class CustomPEKModel: ...@@ -314,7 +331,7 @@ class CustomPEKModel:
elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO: elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
# doclayout_yolo # doclayout_yolo
layout_res = [] layout_res = []
doclayout_yolo_res = self.layout_model.predict(image, imgsz=1024, conf=0.15, iou=0.45, verbose=True, device=self.device)[0] doclayout_yolo_res = self.layout_model.predict(image, imgsz=1024, conf=0.25, iou=0.45, verbose=True, device=self.device)[0]
for xyxy, conf, cla in zip(doclayout_yolo_res.boxes.xyxy.cpu(), doclayout_yolo_res.boxes.conf.cpu(), doclayout_yolo_res.boxes.cls.cpu()): for xyxy, conf, cla in zip(doclayout_yolo_res.boxes.xyxy.cpu(), doclayout_yolo_res.boxes.conf.cpu(), doclayout_yolo_res.boxes.cls.cpu()):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = { new_item = {
...@@ -472,3 +489,5 @@ class CustomPEKModel: ...@@ -472,3 +489,5 @@ class CustomPEKModel:
logger.info(f"-----page total time: {round(time.time() - page_start, 2)}-----") logger.info(f"-----page total time: {round(time.time() - page_start, 2)}-----")
return layout_res return layout_res
from struct_eqtable.model import StructTable from loguru import logger
try:
from struct_eqtable.model import StructTable
except ImportError:
logger.error("StructEqTable is under upgrade, the current version does not support it.")
from pypandoc import convert_text from pypandoc import convert_text
class StructTableModel: class StructTableModel:
def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'): def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
# init # init
......
import copy
import os import os
import statistics import statistics
import time import time
...@@ -15,7 +16,7 @@ from magic_pdf.libs.convert_utils import dict_to_list ...@@ -15,7 +16,7 @@ from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v3 import para_split from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
...@@ -29,7 +30,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \ ...@@ -29,7 +30,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2 ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks, from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans, fix_block_spans,
fix_discarded_block) fix_discarded_block, fix_block_spans_v2)
from magic_pdf.pre_proc.ocr_span_list_modify import ( from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans, get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans) remove_overlaps_min_spans)
...@@ -173,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]: ...@@ -173,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
def cal_block_index(fix_blocks, sorted_bboxes): def cal_block_index(fix_blocks, sorted_bboxes):
for block in fix_blocks: for block in fix_blocks:
# if block['type'] in ['text', 'title', 'interline_equation']:
# line_index_list = []
# if len(block['lines']) == 0:
# block['index'] = sorted_bboxes.index(block['bbox'])
# else:
# for line in block['lines']:
# line['index'] = sorted_bboxes.index(line['bbox'])
# line_index_list.append(line['index'])
# median_value = statistics.median(line_index_list)
# block['index'] = median_value
#
# elif block['type'] in ['table', 'image']:
# block['index'] = sorted_bboxes.index(block['bbox'])
line_index_list = [] line_index_list = []
if len(block['lines']) == 0: if len(block['lines']) == 0:
...@@ -197,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes): ...@@ -197,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
median_value = statistics.median(line_index_list) median_value = statistics.median(line_index_list)
block['index'] = median_value block['index'] = median_value
# 删除图表block中的虚拟line信息 # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
if block['type'] in ['table', 'image']: if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
del block['lines'] block['virtual_lines'] = copy.deepcopy(block['lines'])
block['lines'] = copy.deepcopy(block['real_lines'])
del block['real_lines']
return fix_blocks return fix_blocks
...@@ -218,13 +208,12 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h): ...@@ -218,13 +208,12 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
): # 可能是双列结构,可以切细点 ): # 可能是双列结构,可以切细点
lines = int(block_height / line_height) + 1 lines = int(block_height / line_height) + 1
else: else:
# 如果block的宽度超过0.4页面宽度,则将block分成3行 # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
if block_weight > page_w * 0.4: if block_weight > page_w * 0.4:
line_height = (y1 - y0) / 3 line_height = (y1 - y0) / 3
lines = 3 lines = 3
elif block_weight > page_w * 0.25: # 否则将block分成两行 elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
line_height = (y1 - y0) / 2 lines = int(block_height / line_height) + 1
lines = 2
else: # 判断长宽比 else: # 判断长宽比
if block_height / block_weight > 1.2: # 细长的不分 if block_height / block_weight > 1.2: # 细长的不分
return [[x0, y0, x1, y1]] return [[x0, y0, x1, y1]]
...@@ -250,7 +239,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h): ...@@ -250,7 +239,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = [] page_line_list = []
for block in fix_blocks: for block in fix_blocks:
if block['type'] in ['text', 'title', 'interline_equation']: if block['type'] in [
BlockType.Text, BlockType.Title, BlockType.InterlineEquation,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
]:
if len(block['lines']) == 0: if len(block['lines']) == 0:
bbox = block['bbox'] bbox = block['bbox']
lines = insert_lines_into_block(bbox, line_height, page_w, page_h) lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
...@@ -261,8 +254,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): ...@@ -261,8 +254,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
for line in block['lines']: for line in block['lines']:
bbox = line['bbox'] bbox = line['bbox']
page_line_list.append(bbox) page_line_list.append(bbox)
elif block['type'] in ['table', 'image']: elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
bbox = block['bbox'] bbox = block['bbox']
block["real_lines"] = copy.deepcopy(block['lines'])
lines = insert_lines_into_block(bbox, line_height, page_w, page_h) lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
block['lines'] = [] block['lines'] = []
for line in lines: for line in lines:
...@@ -316,7 +310,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): ...@@ -316,7 +310,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
def get_line_height(blocks): def get_line_height(blocks):
page_line_height_list = [] page_line_height_list = []
for block in blocks: for block in blocks:
if block['type'] in ['text', 'title', 'interline_equation']: if block['type'] in [
BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
]:
for line in block['lines']: for line in block['lines']:
bbox = line['bbox'] bbox = line['bbox']
page_line_height_list.append(int(bbox[3] - bbox[1])) page_line_height_list.append(int(bbox[3] - bbox[1]))
...@@ -326,6 +324,63 @@ def get_line_height(blocks): ...@@ -326,6 +324,63 @@ def get_line_height(blocks):
return 10 return 10
def process_groups(groups, body_key, caption_key, footnote_key):
body_blocks = []
caption_blocks = []
footnote_blocks = []
for i, group in enumerate(groups):
group[body_key]['group_id'] = i
body_blocks.append(group[body_key])
for caption_block in group[caption_key]:
caption_block['group_id'] = i
caption_blocks.append(caption_block)
for footnote_block in group[footnote_key]:
footnote_block['group_id'] = i
footnote_blocks.append(footnote_block)
return body_blocks, caption_blocks, footnote_blocks
def process_block_list(blocks, body_type, block_type):
indices = [block['index'] for block in blocks]
median_index = statistics.median(indices)
body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])
return {
'type': block_type,
'bbox': body_bbox,
'blocks': blocks,
'index': median_index,
}
def revert_group_blocks(blocks):
image_groups = {}
table_groups = {}
new_blocks = []
for block in blocks:
if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]:
group_id = block['group_id']
if group_id not in image_groups:
image_groups[group_id] = []
image_groups[group_id].append(block)
elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]:
group_id = block['group_id']
if group_id not in table_groups:
table_groups[group_id] = []
table_groups[group_id].append(block)
else:
new_blocks.append(block)
for group_id, blocks in image_groups.items():
new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image))
for group_id, blocks in table_groups.items():
new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table))
return new_blocks
def parse_page_core( def parse_page_core(
page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
): ):
...@@ -333,8 +388,20 @@ def parse_page_core( ...@@ -333,8 +388,20 @@ def parse_page_core(
drop_reason = [] drop_reason = []
"""从magic_model对象中获取后面会用到的区块信息""" """从magic_model对象中获取后面会用到的区块信息"""
img_blocks = magic_model.get_imgs(page_id) # img_blocks = magic_model.get_imgs(page_id)
table_blocks = magic_model.get_tables(page_id) # table_blocks = magic_model.get_tables(page_id)
img_groups = magic_model.get_imgs_v2(page_id)
table_groups = magic_model.get_tables_v2(page_id)
img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
)
table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
)
discarded_blocks = magic_model.get_discarded(page_id) discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = magic_model.get_text_blocks(page_id) text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id) title_blocks = magic_model.get_title_blocks(page_id)
...@@ -370,8 +437,8 @@ def parse_page_core( ...@@ -370,8 +437,8 @@ def parse_page_core(
interline_equation_blocks = [] interline_equation_blocks = []
if len(interline_equation_blocks) > 0: if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, discarded_blocks,
text_blocks, text_blocks,
title_blocks, title_blocks,
...@@ -381,8 +448,8 @@ def parse_page_core( ...@@ -381,8 +448,8 @@ def parse_page_core(
) )
else: else:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, discarded_blocks,
text_blocks, text_blocks,
title_blocks, title_blocks,
...@@ -419,7 +486,7 @@ def parse_page_core( ...@@ -419,7 +486,7 @@ def parse_page_core(
block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5) block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
"""对block进行fix操作""" """对block进行fix操作"""
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks) fix_blocks = fix_block_spans_v2(block_with_spans)
"""获取所有line并计算正文line的高度""" """获取所有line并计算正文line的高度"""
line_height = get_line_height(fix_blocks) line_height = get_line_height(fix_blocks)
...@@ -430,6 +497,9 @@ def parse_page_core( ...@@ -430,6 +497,9 @@ def parse_page_core(
"""根据line的中位数算block的序列关系""" """根据line的中位数算block的序列关系"""
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes) fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
"""将image和table的block还原回group形式参与后续流程"""
fix_blocks = revert_group_blocks(fix_blocks)
"""重排block""" """重排block"""
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index']) sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
......
...@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
return all_bboxes, all_discarded_blocks, drop_reasons return all_bboxes, all_discarded_blocks, drop_reasons
def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_blocks, text_blocks, def add_bboxes(blocks, block_type, bboxes):
title_blocks, interline_equation_blocks, page_w, page_h): for block in blocks:
x0, y0, x1, y1 = block['bbox']
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]])
else:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])
def ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
):
all_bboxes = [] all_bboxes = []
all_discarded_blocks = []
for image in img_blocks:
x0, y0, x1, y1 = image['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]])
for table in table_blocks: add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
x0, y0, x1, y1 = table['bbox'] add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]]) add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
for text in text_blocks: add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
x0, y0, x1, y1 = text['bbox'] add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]]) add_bboxes(text_blocks, BlockType.Text, all_bboxes)
add_bboxes(title_blocks, BlockType.Title, all_bboxes)
for title in title_blocks: add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
x0, y0, x1, y1 = title['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]])
for interline_equation in interline_equation_blocks:
x0, y0, x1, y1 = interline_equation['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]])
'''block嵌套问题解决''' '''block嵌套问题解决'''
'''文本框与标题框重叠,优先信任文本框''' '''文本框与标题框重叠,优先信任文本框'''
...@@ -96,12 +101,14 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b ...@@ -96,12 +101,14 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框''' '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
# 通过后续大框套小框逻辑删除 # 通过后续大框套小框逻辑删除
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' '''discarded_blocks'''
all_discarded_blocks = []
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
'''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的'''
footnote_blocks = [] footnote_blocks = []
for discarded in discarded_blocks: for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox'] x0, y0, x1, y1 = discarded['bbox']
all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
# 将footnote加入到all_bboxes中,用来计算layout
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
footnote_blocks.append([x0, y0, x1, y1]) footnote_blocks.append([x0, y0, x1, y1])
......
...@@ -49,7 +49,7 @@ def merge_spans_to_line(spans): ...@@ -49,7 +49,7 @@ def merge_spans_to_line(spans):
continue continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.6): if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5):
current_line.append(span) current_line.append(span)
else: else:
# 否则,开始新行 # 否则,开始新行
...@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
'type': block_type, 'type': block_type,
'bbox': block_bbox, 'bbox': block_bbox,
} }
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
block_dict["group_id"] = block[-1]
block_spans = [] block_spans = []
for span in spans: for span in spans:
span_bbox = span['bbox'] span_bbox = span['bbox']
...@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks): ...@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return fix_blocks return fix_blocks
def fix_block_spans_v2(block_with_spans):
"""1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
需要将caption和footnote的text_span放入相应img_block和table_block内的
caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
if block_type in [BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
]:
block = fix_text_block(block)
elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
block = fix_interline_block(block)
else:
continue
fix_blocks.append(block)
return fix_blocks
def fix_discarded_block(discarded_block_with_spans): def fix_discarded_block(discarded_block_with_spans):
fix_discarded_blocks = [] fix_discarded_blocks = []
for block in discarded_block_with_spans: for block in discarded_block_with_spans:
......
...@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""", ...@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""",
help=""" help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional. Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url: You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
""", """,
default=None, default=None,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment