Commit 2df265c8 authored by zhougaofeng's avatar zhougaofeng
Browse files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
parent 826086d2
from collections import defaultdict
from magic_pdf.libs.boxbase import calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
    """Return True when ``bbox`` matches at least one box in ``bbox_list``.

    Two boxes match when every pair of corresponding coordinates differs by
    strictly less than ``tolerance``. Coordinates are paired with ``zip``, so
    the comparison stops at the shorter of the two boxes.
    """
    for candidate in bbox_list:
        if all(abs(own - other) < tolerance for own, other in zip(bbox, candidate)):
            return True
    return False
def is_single_line_block(block):
    """Heuristically decide whether ``block`` holds a single line of text.

    The width is taken from the block's ``X0``/``X1`` entries and the height
    from its ``bbox``. A block counts as single-line when it is at most three
    average character heights tall and more than three average character
    widths wide.
    """
    width = block["X1"] - block["X0"]
    height = block["bbox"][3] - block["bbox"][1]
    # Too tall for one line: stop before the width is even considered.
    if not (height <= block["avg_char_height"] * 3):
        return False
    return width > block["avg_char_width"] * 3
def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
    """
    Pick the bounding boxes that repeat most often inside a page margin.

    Parameters
    ----------
    bboxes : list
        Boxes indexed as ``[x0, y0, x1, y1]``.
    page_height : float
        Height of the page.
    position : str, optional
        "top" selects boxes whose ``y0`` lies above ``page_height * threshold``;
        any other value selects boxes whose ``y1`` lies below
        ``page_height * (1 - threshold)``. By default "top".
    threshold : float, optional
        Fraction of the page height that forms the margin, by default 0.25.
    num_bboxes : int, optional
        Maximum number of boxes to return, by default 3.
    min_frequency : int, optional
        Minimum number of identical occurrences required, by default 2.

    Returns
    -------
    common_bboxes : list
        Up to ``num_bboxes`` boxes (as tuples), most frequent first; ties keep
        the order in which the boxes were first seen.
    """
    if position == "top":
        def in_margin(box):
            return box[1] < page_height * threshold
    else:
        def in_margin(box):
            return box[3] > page_height * (1 - threshold)

    # Count exact duplicates; a dict remembers first-seen order for tie-breaks.
    occurrences = {}
    for box in bboxes:
        if in_margin(box):
            key = tuple(box)
            occurrences[key] = occurrences.get(key, 0) + 1

    frequent = [(key, seen) for key, seen in occurrences.items() if seen >= min_frequency]
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    return [key for key, _ in frequent[:num_bboxes]]
def detect_footer_header2(result_dict, similarity_threshold=0.5):
    """
    Flag the header and footer blocks of a document in place.

    Every ``block_*`` entry under a ``page_*`` key receives two integer flags,
    ``is_header`` and ``is_footer`` (0 or 1), set when the block's ``bbox``
    matches one of the most frequently repeated boxes near the top or bottom
    of the page.

    Detection is skipped, and ``result_dict`` returned untouched, when the
    document has no blocks or when more than half of its blocks are
    single-line blocks.

    Parameters
    ----------
    result_dict : dict
        Mapping of ``page_*`` keys to dicts of ``block_*`` entries.
    similarity_threshold : float, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    result_dict : dict
        The same dictionary object that was passed in.
    """
    blocks = list(_iter_document_blocks(result_dict))

    # If there are no blocks, skip the header and footer detection
    if not blocks:
        print("No blocks found. Skipping header/footer detection.")
        return result_dict

    # If most of the blocks are single-line, skip the header and footer detection
    single_line_blocks = sum(1 for block in blocks if is_single_line_block(block))
    if single_line_blocks / len(blocks) > 0.5:
        return result_dict

    all_bboxes = [block["bbox"] for block in blocks]
    # The lowest bottom edge seen anywhere stands in for the page height.
    page_height = max(bbox[3] for bbox in all_bboxes)

    common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top")
    common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom")

    for block in blocks:
        bbox = block["bbox"]
        block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
        block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))
    return result_dict


def _iter_document_blocks(result_dict):
    """Yield every ``block_*`` value stored under the ``page_*`` keys of ``result_dict``."""
    for page_id, page_blocks in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_key, block in page_blocks.items():
            if block_key.startswith("block_"):
                yield block
def __get_page_size(page_sizes:list):
    """
    Return the mean ``(width, height)`` over all pages.

    Pages of one document may differ in size, so an average is used.
    """
    page_total = len(page_sizes)
    mean_width = sum(width for width, _ in page_sizes) / page_total
    mean_height = sum(height for _, height in page_sizes) / page_total
    return mean_width, mean_height
def __calculate_iou(bbox1, bbox2):
    """Return ``calculate_iou(bbox1, bbox2)``; a module-private alias for the shared helper."""
    return calculate_iou(bbox1, bbox2)
def __is_same_pos(box1, box2, iou_threshold):
    """Return True when the IoU of ``box1`` and ``box2`` reaches ``iou_threshold``."""
    return __calculate_iou(box1, box2) >= iou_threshold
def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
    """
    Estimate the header and footer bands of a document from recurring boxes.

    Boxes lying entirely inside the top or bottom ``page_range_threshold``
    fraction of the (average) page height are grouped by position: a box and
    every later box whose IoU with it reaches ``iou_threshold`` form a group.
    A group counts as recurring only when it holds at least
    ``max(3, page_cnt // 4)`` boxes.

    :param bboxes: boxes indexed as [x0, y0, x1, y1], collected from all pages
    :param page_size: list of (width, height) pairs, one per page
    :param page_cnt: number of pages in the document
    :param page_range_threshold: fraction of the page height searched at each end
    :param iou_threshold: minimum IoU for two boxes to count as the same position
    :return: (header_bbox, footer_bbox, page_w, page_h). The header band is
        [0, 0, page_w, hdr_y], with hdr_y = 0 when no recurring header was found.
        The footer band is [0, btn_y, page_w, page_h], with btn_y = page_h when
        no recurring footer was found.
    """
    min_occurance_cnt = max(3, page_cnt//4)
    page_w, page_h = __get_page_size(page_size)

    # Both limits are vertical positions, so both must be derived from the
    # page height (the top limit was previously derived from the width).
    top_y = page_h*page_range_threshold
    bottom_y = page_h*(1-page_range_threshold)
    top_bbox = [b for b in bboxes if b[3]<top_y]
    bottom_bbox = [b for b in bboxes if b[1]>bottom_y]

    hdr_boxes = _collect_recurring_boxes(top_bbox, min_occurance_cnt, iou_threshold)
    btn_boxes = _collect_recurring_boxes(bottom_bbox, min_occurance_cnt, iou_threshold)

    # The header band ends at the lowest recurring header box; the footer band
    # starts at the highest recurring footer box.
    hdr_y = max((b[3] for b in hdr_boxes), default=0)
    btn_y = min((b[1] for b in btn_boxes), default=page_h)

    header_det_bbox = [0, 0, page_w, hdr_y]
    footer_det_bbox = [0, btn_y, page_w, page_h]
    return header_det_bbox, footer_det_bbox, page_w, page_h


def _collect_recurring_boxes(candidates, min_count, iou_threshold):
    """Return the members of every same-position group holding at least ``min_count`` boxes."""
    recurring = []
    for idx, anchor in enumerate(candidates):
        # A group is the anchor plus every later box at the same position.
        group = [anchor]
        group.extend(
            other for other in candidates[idx+1:]
            if __is_same_pos(anchor, other, iou_threshold)
        )
        if len(group) >= min_count:
            recurring.extend(group)
    return recurring
def drop_footer_header(pdf_info_dict:dict):
    """
    Detect header/footer bands statistically over the whole document and move
    everything inside them out of each page's content lists.

    For every page, text blocks (``preproc_blocks``) that fall in a band are
    tagged "header"/"footer", appended to ``droped_text_block`` and removed
    from the page. Images in ``images`` and ``image_backup`` are treated the
    same way and end up in ``droped_image_block``.

    :param pdf_info_dict: mapping of page key to page info dict; modified in place
    :return: (header, footer) bands as [x0, y0, x1, y1] lists
    """
    all_text_bboxes = [blk['bbox'] for val in pdf_info_dict.values() for blk in val['preproc_blocks']]
    normal_image_bboxes = [img['bbox'] for val in pdf_info_dict.values() for img in val['images']]
    backup_image_bboxes = [img['bbox'] for val in pdf_info_dict.values() for img in val['image_backup']]
    image_bboxes = normal_image_bboxes + backup_image_bboxes
    page_size = [val['page_size'] for val in pdf_info_dict.values()]
    page_cnt = len(pdf_info_dict.keys())  # total number of pages

    header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)

    # Stretch both bands across the full page width, with a 1-unit margin.
    if header:
        header = [0, 0, page_w, header[3]+1]
    if footer:
        footer = [0, footer[1]-1, page_w, page_h]

    def move_out(page_info, source_key, dropped_key):
        """Tag items of ``page_info[source_key]`` lying in a band and move them to ``dropped_key``."""
        in_header = []
        in_footer = []
        for item in page_info[source_key]:
            item_bbox = item['bbox']
            if header and item_bbox[3]<=header[3]:
                item['tag'] = "header"
                in_header.append(item)
            elif footer and item_bbox[1]>=footer[1]:
                item['tag'] = "footer"
                in_footer.append(item)
        page_info[dropped_key].extend(in_header)
        page_info[dropped_key].extend(in_footer)
        # Removal happens after the scan so the list is not mutated mid-iteration.
        for item in in_header + in_footer:
            page_info[source_key].remove(item)

    # With the bands known, strip their content from every page: text blocks
    # first, then regular images, then backup images.
    for page_info in pdf_info_dict.values():
        move_out(page_info, 'preproc_blocks', 'droped_text_block')
        move_out(page_info, 'images', 'droped_image_block')
        move_out(page_info, 'image_backup', 'droped_image_block')

    return header, footer
from collections import Counter
from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
    """
    Collect the footnote boxes that the layout model reported for one page.

    :param page_ID: index of the current page within the PDF; does not affect the result
    :param page: the page object as read by fitz
    :param json_from_DocXchain_obj: layout-model output for this page; its
        'layout_dets' entries carry 'poly', 'category_id' and 'score'
    :param md_bookname_save_path: does not affect the result
    :param debug_mode: does not affect the result
    :return: list of (L, U, R, D) tuples sorted by top edge, then left edge
    """
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(json_from_DocXchain_obj, page)

    footnote_bboxes = []
    for det in json_from_DocXchain_obj['layout_dets']:
        poly = det['poly']
        # Map model coordinates onto the page by dividing by the scale ratios.
        x_first = poly[0] / horizontal_scale_ratio
        y_first = poly[1] / vertical_scale_ratio
        x_second = poly[2] / horizontal_scale_ratio
        y_second = poly[5] / vertical_scale_ratio
        # category_id 5 is 'footnote' in the layout model's label map;
        # 0.43 is the current score cut-off for footnotes.
        if det['category_id'] == 5 and det['score'] >= 0.43:
            footnote_bboxes.append((
                min(x_first, x_second),
                min(y_first, y_second),
                max(x_first, x_second),
                max(y_first, y_second),
            ))

    footnote_bboxes.sort(key=lambda box: (box[1], box[0]))
    return footnote_bboxes
def need_remove(block):
    """
    Return True when ``block`` should be taken out of the footnote candidates.

    A block qualifies when either
    * it consists of exactly one line with exactly one span, and that span's
      text is all upper-case or its font name contains 'SB', 'bold' or 'Bold';
    * any of its spans contains the word "keyword" (case-insensitive).
    """
    if 'lines' not in block or len(block['lines']) == 0:
        return False
    lines = block['lines']

    if len(lines) == 1:
        only_line = lines[0]
        if 'spans' in only_line and len(only_line['spans']) == 1:
            only_span = only_line['spans'][0]
            if only_span['text'].isupper():
                return True
            for marker in ('SB', 'bold', 'Bold'):
                if marker in only_span['font']:
                    return True

    for line in lines:
        if 'spans' not in line or len(line['spans']) == 0:
            continue
        # Look for "keyword" in each span, ignoring case.
        if any("keyword" in span['text'].lower() for span in line['spans']):
            return True
    return False
def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
    """
    Pick out footnote-like text blocks by rule and return their bounding boxes.

    A block is reported when all of the following hold:
    * it is not excluded by ``need_remove``;
    * its top edge lies below 60% of the page height;
    * its average line font size is smaller than the most common line size
      among the given blocks;
    * it has fewer than 5 lines, or its dominant font (weighted by character
      count) differs from ``main_text_font``.

    Args:
        remain_text_blocks (list): text blocks still to be classified; each has
            'bbox' and 'lines', and each line has 'spans'.
        page_height (float): height of the page.
        page_id (int): id of the page; pages with an id above 2 are skipped.
        main_text_font (str): dominant font of the document's main text.

    Returns:
        list: bounding boxes of the blocks judged to be footnotes. Empty when
        the page is skipped, there are no blocks, or no span carries a size.
    """
    # To keep precision high, only pages with id 0-2 are screened for now.
    if page_id > 2 or not remain_text_blocks:
        return []

    line_sizes = []   # average span size of every line seen on the page
    block_sizes = []  # (block, average line size, dominant font) per block

    for block in remain_text_blocks:
        block_line_sizes = []
        block_fonts = Counter()
        for line in block['lines']:
            span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
            if not span_sizes:
                continue
            line_size = sum(span_sizes) / len(span_sizes)
            line_sizes.append(line_size)
            block_line_sizes.append(line_size)
            # Weight fonts by character count rather than by span count.
            for span in line['spans']:
                if 'font' in span and len(span['text']) > 0:
                    block_fonts[span['font']] += len(span['text'])

        if block_line_sizes:
            block_size = sum(block_line_sizes) / len(block_line_sizes)
            # A block may have sizes but no usable font info; None then marks
            # the font as unknown instead of raising IndexError.
            block_font = block_fonts.most_common(1)[0][0] if block_fonts else None
            block_sizes.append((block, block_size, block_font))

    # Without any sized span there is no main text size to compare against.
    if not line_sizes:
        return []
    main_text_size = Counter(line_sizes).most_common(1)[0][0]

    footnote_bboxes = [
        block['bbox']
        for block, block_size, block_font in block_sizes
        if not need_remove(block)
        and block['bbox'][1] > page_height * 0.6
        # the stricter rule set: smaller than main text AND (short OR other font)
        and block_size < main_text_size
        and (len(block['lines']) < 5 or block_font != main_text_font)
    ]
    return footnote_bboxes
from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect the header boxes that the layout model reported for one page.

    :param page_ID: index of the current page within the PDF; does not affect the result
    :param page: the page object as read by fitz
    :param json_from_DocXchain_obj: layout-model output for this page; its
        'layout_dets' entries carry 'poly', 'category_id' and 'score'
    :return: list of (L, U, R, D) tuples sorted by top edge, then left edge
    """
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(json_from_DocXchain_obj, page)

    header_bboxes = []
    for det in json_from_DocXchain_obj['layout_dets']:
        poly = det['poly']
        # Map model coordinates onto the page by dividing by the scale ratios.
        x_first = poly[0] / horizontal_scale_ratio
        y_first = poly[1] / vertical_scale_ratio
        x_second = poly[2] / horizontal_scale_ratio
        y_second = poly[5] / vertical_scale_ratio
        # category_id 3 is 'header' in the layout model's label map.
        if det['category_id'] == 3 and det['score'] >= 0.3:
            header_bboxes.append((
                min(x_first, x_second),
                min(y_first, y_second),
                max(x_first, x_second),
                max(y_first, y_second),
            ))

    header_bboxes.sort(key=lambda box: (box[1], box[0]))
    return header_bboxes
import collections # 统计库
import re
from magic_pdf.libs.commons import fitz # pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------#
# Normalize text: the output keeps only the characters a-z, A-Z and 0-9.
def remove_special_chars(s: str) -> str:
    """Return ``s`` with every character outside a-z, A-Z and 0-9 removed."""
    return re.sub(r"[^a-zA-Z0-9]", "", s)
def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect1 and rect2 have exactly the same four edges."""
    if L1 != L2 or U1 != U2:
        return False
    return R1 == R2 and D1 == D2
def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect2 lies inside rect1 (shared edges count as inside)."""
    horizontally_inside = L1 <= L2 <= R2 <= R1
    vertically_inside = U1 <= U2 <= D2 <= D1
    return horizontally_inside and vertically_inside
def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect1 and rect2 overlap; merely sharing an edge also counts."""
    left, right = max(L1, L2), min(R1, R2)
    top, bottom = max(U1, U2), min(D1, D2)
    return left <= right and top <= bottom
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """
    Return the overlap area as a fraction of each rectangle's own area.

    The result is ``(overlap / area1, overlap / area2)``. ``(0, 0)`` is
    returned when the rectangles do not intersect or either has zero area.
    """
    left, right = max(L1, L2), min(R1, R2)
    top, bottom = max(U1, U2), min(D1, D2)
    if right < left or bottom < top:
        return 0, 0

    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    if area_1 == 0 or area_2 == 0:
        return 0, 0

    shared_area = (right - left) * (bottom - top)
    return shared_area / area_1, shared_area / area_2
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """
    Return the overlapping interval as a fraction of each line's own length.

    The result is ``(overlap / length1, overlap / length2)``. ``(0, 0)`` is
    returned when the intervals are disjoint or either line has zero length.
    """
    start, end = max(L1, L2), min(R1, R2)
    if start > end:
        return 0, 0
    if L1 == R1 or L2 == R2:
        return 0, 0
    shared = end - start
    return shared / (R1 - L1), shared / (R2 - L2)
# Decide whether a rect is really just a line.
def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
    """
    Return True when the rectangle is thin enough to be treated as a line.

    That is the case when its width or height is at most 3, or when one side
    is at least 30 times as long as the other. Every other rectangle yields
    an explicit False (the function used to fall through and return None).
    """
    width = R - L
    height = D - U
    # Checked first: this also guarantees both divisors below are non-zero.
    if width <= 3 or height <= 3:
        return True
    if width / height >= 30 or height / width >= 30:
        return True
    return False
def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=None):
    """
    Collect the bounding boxes of every figure found on one PDF page.

    Candidate boxes come from three sources: raster images reported by
    ``page.get_images()``, clusters of vector drawings reported by
    ``page.get_drawings()``, and layout-model detections with ``category_id == 1``
    (figure). The candidates are then de-duplicated and overlapping ones are merged.

    :param page_ID: index of the current page in the document. Kept for interface
        compatibility; it does not influence the returned boxes.
    :param page: the current page as loaded by fitz (PyMuPDF).
    :param json_from_DocXchain_obj: layout-model output for this page. The keys read
        here are ``page_info.width``, ``page_info.height`` and ``layout_dets`` (each
        entry providing ``poly``, ``category_id`` and ``score``).
    :param junk_img_bojids: identifiers (first field of each ``page.get_images()``
        entry) of images that must be ignored. ``None`` means "ignore nothing".
    :return: list of ``[L, U, R, D]`` boxes.
    """
    junk_img_bojids = [] if junk_img_bojids is None else junk_img_bojids

    # Page extent in pixels at 72 DPI; the page origin is taken to be (0, 0).
    pix = page.get_pixmap(dpi=72)
    pageL, pageU = 0, 0
    pageR, pageD = int(pix.w), int(pix.h)

    # ----------------- (L, U, R, D) of every text span ------------------ #
    textLine_blocks = []
    blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
    for block in blocks:
        for text_line in block['lines']:
            for span in text_line['spans']:
                L, U, R, D = span['bbox']
                L, R = min(L, R), max(L, R)
                U, D = min(U, D), max(U, D)
                textLine_blocks.append((L, U, R, D))
    textLine_blocks.sort(key=lambda LURD: (LURD[1], LURD[0]))

    # ------------------------- raster images --------------------------- #
    img_bboxs = []
    for raw_img in page.get_images():
        img_id = raw_img[0]
        if img_id in junk_img_bojids:
            continue
        try:
            rec = page.get_image_rects(img_id, transform=True)[0][0]
            L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
        except Exception:
            # Best effort: an image whose placement cannot be resolved is skipped.
            continue
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # Must lie inside the page and have a non-zero size.
        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
            continue
        # Images spanning the full page width or height are discarded.
        if pageL == L and R == pageR:
            continue
        if pageU == U and D == pageD:
            continue
        img_bboxs.append((L, U, R, D))

    # Overlapping images hint at wrong sizes/positions, so some of them are dropped.
    imgs_ok = [True] * len(img_bboxs)
    for i, (L1, U1, R1, D1) in enumerate(img_bboxs):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j in range(i + 1, len(img_bboxs)):
            L2, U2, R2, D2 = img_bboxs[j]
            s2 = abs(R2 - L2) * abs(D2 - U2)
            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
            if not (ratio_1 > 0 and ratio_2 > 0):
                continue
            if ratio_1 == 1 and ratio_2 > 0.8:
                imgs_ok[i] = False
            elif ratio_1 > 0.8 and ratio_2 == 1:
                imgs_ok[j] = False
            elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                imgs_ok[i] = False
                imgs_ok[j] = False
            elif s1 / s2 > 5 and ratio_2 > 0.5:
                imgs_ok[j] = False
            elif s2 / s1 > 5 and ratio_1 > 0.5:
                imgs_ok[i] = False
    # Filter against the flags one-to-one. (The previous code re-used the length of
    # an already filtered list and could silently drop valid boxes.)
    img_bboxs = [bbox for bbox, ok in zip(img_bboxs, imgs_ok) if ok]

    # ------------------------ vector drawings -------------------------- #
    # De-duplicate drawings that have exactly the same (normalised) rectangle.
    seen_rects = set()
    svg_rects = []
    for svg in page.get_drawings():
        L, U, R, D = svg['rect'].irect
        rect_key = (min(L, R), min(U, D), max(L, R), max(U, D))
        if rect_key not in seen_rects:
            seen_rects.add(rect_key)
            svg_rects.append((L, U, R, D))

    # svg_exceedPage[i] > 0 marks drawings that touch/exceed the page border (+1 per
    # side) or that sit inside the 20px margin but cover >= 70% of the page (+4).
    svg_exceedPage = []
    for L, U, R, D in svg_rects:
        _, page_ratio = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
        exceed = 0
        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
            if page_ratio >= 0.7:
                exceed += 4
        else:
            if L <= pageL:
                exceed += 1
            if pageR <= R:
                exceed += 1
            if U <= pageU:
                exceed += 1
            if pageD <= D:
                exceed += 1
        svg_exceedPage.append(exceed)

    # With two or more border-exceeding drawings the hand-written rules below are
    # not reliable, so vector drawings are ignored altogether.
    if sum(1 for x in svg_exceedPage if x >= 1) >= 2:
        svg_rects = []
        svg_exceedPage = []

    n_svgs = len(svg_rects)
    svg_childs = [[] for _ in range(n_svgs)]     # svg_childs[i]: drawings contained in i
    svg_parents = [[] for _ in range(n_svgs)]    # svg_parents[i]: drawings containing i
    svg_overlaps = [[] for _ in range(n_svgs)]   # svg_overlaps[i]: drawings crossing i
    svg_visited = [False] * n_svgs

    # ---------------------------- build graph -------------------------- #
    for i, (L1, U1, R1, D1) in enumerate(svg_rects):
        for j, (L2, U2, R2, D2) in enumerate(svg_rects):
            if i == j:
                continue
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_childs[i].append(j)
                svg_parents[j].append(i)
            elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_overlaps[i].append(j)

    # ------------- final drawings: outline of each connected cluster ---- #
    eps_ERROR = 5   # padding around a detected drawing (PyMuPDF rects may be imprecise)
    svg_final_bboxs = []
    # Largest area first.
    svg_order = sorted(
        range(n_svgs),
        key=lambda i: -(svg_rects[i][2] - svg_rects[i][0]) * (svg_rects[i][3] - svg_rects[i][1]),
    )
    for i in svg_order:
        if svg_visited[i]:
            continue
        L, U, R, D = svg_rects[i]
        if check_rect_isLine(L, U, R, D):
            # Lines never seed a cluster and stay unvisited.
            continue
        svg_visited[i] = True

        cur_block_element_cnt = 0   # elements in the cluster, excluding the outermost box
        if len(svg_parents[i]) == 0:
            cur_block_element_cnt += len(svg_childs[i])
            if svg_exceedPage[i] <= 2:
                # Tolerance: the box may already lie inside an accepted cluster.
                already_covered = any(
                    pL <= L <= R <= pR and pU <= U <= D <= pD
                    for pL, pU, pR, pD in svg_final_bboxs
                )
                if already_covered:
                    continue
            if svg_exceedPage[i] == 0:
                # Grow the box over everything connected through overlaps (BFS).
                q = collections.deque(svg_overlaps[i])
                while q:
                    j = q.popleft()
                    svg_visited[j] = True
                    L2, U2, R2, D2 = svg_rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1
                    cur_block_element_cnt += len(svg_childs[j])
                    for k in svg_overlaps[j]:
                        if not svg_visited[k] and svg_exceedPage[k] == 0:
                            svg_visited[k] = True
                            q.append(k)
            elif svg_exceedPage[i] <= 2:
                # The outer box exceeds the page: use the hull of its children instead.
                L, U, R, D = pageR, pageD, pageL, pageU
                for j in svg_childs[i]:
                    if svg_visited[j]:
                        continue
                    if svg_exceedPage[j] >= 1:
                        continue
                    svg_visited[j] = True
                    L2, U2, R2, D2 = svg_rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1

        # A cluster that is just a line is not kept.
        if check_rect_isLine(L, U, R, D):
            continue
        # A cluster with fewer than 3 elements is not kept.
        if cur_block_element_cnt < 3:
            continue
        # A cluster framing many text spans is probably not a figure.
        contain_textLineBlock_cnt = 0
        for L2, U2, R2, D2 in textLine_blocks:
            if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2):
                contain_textLineBlock_cnt += 1
        if contain_textLineBlock_cnt >= 10:
            continue
        svg_final_bboxs.append((L, U, R, D))

    # Accepted clusters may contain or adjoin each other: merge them further.
    svg_final_visited = [False] * len(svg_final_bboxs)   # True: absorbed by another cluster
    for i in range(len(svg_final_bboxs)):
        L1, U1, R1, D1 = svg_final_bboxs[i]
        for j in range(i + 1, len(svg_final_bboxs)):
            L2, U2, R2, D2 = svg_final_bboxs[j]
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_final_visited[j] = True
                continue
            # Side by side: vertical extents mostly coincide.
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(L2 - R1) >= 20:
                    continue
                svg_final_bboxs[i] = (min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2))
                svg_final_visited[j] = True
                continue
            # Stacked: horizontal extents mostly coincide.
            # (Fixed: the first interval used to be (L1, R2) instead of (L1, R1).)
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(U2 - D1) >= 20:
                    continue
                svg_final_bboxs[i] = (min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2))
                svg_final_visited[j] = True

    svg_final_bboxs_2 = []
    for i, (L, U, R, D) in enumerate(svg_final_bboxs):
        if svg_final_visited[i]:
            continue
        # NOTE(review): each surviving cluster is stored twice, tight and padded.
        # Kept as in the original; confirm whether only one of them is intended.
        svg_final_bboxs_2.append((L, U, R, D))
        svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))

    # ------------- figure boxes reported by the layout model ------------ #
    xf_json = json_from_DocXchain_obj
    LR_scaleRatio = xf_json['page_info']['width'] / (pageR - pageL)
    UD_scaleRatio = xf_json['page_info']['height'] / (pageD - pageU)
    figure_bbox_from_DocXChain = []
    for xf in xf_json['layout_dets']:
        # Scale the detection from model coordinates to page coordinates.
        L = xf['poly'][0] / LR_scaleRatio
        U = xf['poly'][1] / UD_scaleRatio
        R = xf['poly'][2] / LR_scaleRatio
        D = xf['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if xf["category_id"] == 1 and xf['score'] >= 0.3:
            figure_bbox_from_DocXChain.append((L, U, R, D))
    figure_from_DocXChain_visited = [False] * len(figure_bbox_from_DocXChain)
    figure_bbox_from_DocXChain_overlappedRatio = [0.0] * len(figure_bbox_from_DocXChain)

    # ------ match model figures against the images found above ---------- #
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for L2, U2, R2, D2 in img_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                if s2 / s1 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

    # ------ match model figures against the drawing clusters ------------ #
    svg_final_bboxs_2_badIdxs = set()
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j, (L2, U2, R2, D2) in enumerate(svg_final_bboxs_2):
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.7:
                    figure_from_DocXChain_visited[i] = True
                else:
                    # Cluster much bigger than the model figure: prefer the model.
                    svg_final_bboxs_2_badIdxs.add(j)
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
    svg_final_bboxs_2 = [b for j, b in enumerate(svg_final_bboxs_2) if j not in svg_final_bboxs_2_badIdxs]

    # Model figures that nothing above accounts for are kept as figures of their own.
    figure_only_from_DocXChain_bboxs = [
        bbox
        for bbox, visited, overlapped in zip(
            figure_bbox_from_DocXChain,
            figure_from_DocXChain_visited,
            figure_bbox_from_DocXChain_overlappedRatio,
        )
        if not visited and overlapped < 0.7
    ]

    img_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    svg_final_bboxs_2.sort(key=lambda LURD: (LURD[1], LURD[0]))
    figure_only_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    # NOTE(review): this concatenates svg_final_bboxs, not the filtered
    # svg_final_bboxs_2 computed above. Kept as in the original; confirm the intent.
    curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs

    # --------------------------- final de-dup --------------------------- #
    curPage_all_fig_bboxs.sort(key=lambda LURD: ((LURD[2] - LURD[0]) * (LURD[3] - LURD[1]), LURD[0], LURD[1]))

    # Pass 1: drop boxes contained in, or mostly covered by, another box.
    final_duplicate = set()
    for i, (L1, U1, R1, D1) in enumerate(curPage_all_fig_bboxs):
        for j, (L2, U2, R2, D2) in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                final_duplicate.add((L1, U1, R1, D1))
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                    final_duplicate.add((L1, U1, R1, D1))
    curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]

    # Pass 2: replace overlapping pairs by their union.
    final_duplicate = set()
    final_synthetic_bboxs = []
    for i, box1 in enumerate(curPage_all_fig_bboxs):
        for j, box2 in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if _boxes_should_merge(box1, box2):
                final_duplicate.add(tuple(box1))
                final_duplicate.add(tuple(box2))
                final_synthetic_bboxs.append(
                    (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box1[2], box2[2]), max(box1[3], box2[3]))
                )
    curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
    final_synthetic_bboxs = list(set(final_synthetic_bboxs))

    # Pass 3: the unions may overlap again; fold them into each other.
    # (Fixed: box i is now read from image_bboxes[i] instead of stale variables left
    # over from pass 2, and boxes that were already absorbed no longer absorb others.)
    image_bboxes = [list(b) for b in final_synthetic_bboxs]
    dropped_img_idx = set()
    for i in range(len(image_bboxes)):
        if i in dropped_img_idx:
            continue
        for j in range(i + 1, len(image_bboxes)):
            if j in dropped_img_idx:
                continue
            box_i, box_j = image_bboxes[i], image_bboxes[j]
            if _boxes_should_merge(box_i, box_j):
                image_bboxes[i] = [
                    min(box_i[0], box_j[0]),
                    min(box_i[1], box_j[1]),
                    max(box_i[2], box_j[2]),
                    max(box_i[3], box_j[3]),
                ]
                dropped_img_idx.add(j)
    new_images = [b for i, b in enumerate(image_bboxes) if i not in dropped_img_idx]

    return [list(b) for b in curPage_all_fig_bboxs] + new_images


def _boxes_should_merge(box1, box2) -> bool:
    """Decide whether two figure boxes (L, U, R, D) overlap enough to be fused into one."""
    L1, U1, R1, D1 = box1
    L2, U2, R2, D2 = box2
    s1 = abs(R1 - L1) * abs(D1 - U1)
    s2 = abs(R2 - L2) * abs(D2 - U2)
    ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
    if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
        return True
    # ratio_1 > 0.2 implies a non-zero s1, so the division is safe.
    if ratio_1 > 0.2 and s2 / s1 > 5:
        return True
    # The centre of one box lies inside the other.
    if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
        return True
    if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
        return True
    return False
from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Return the bounding boxes of the page numbers detected on one page.

    Only layout detections with ``category_id == 4`` (page number) and
    ``score >= 0.3`` are kept. Their coordinates are divided by the ratios
    returned by ``get_scale_ratio`` and normalised so that L <= R and U <= D.

    :param page_ID: index of the current page in the document. Kept for interface
        compatibility; it does not influence the result.
    :param page: the current page as loaded by fitz (PyMuPDF).
    :param json_from_DocXchain_obj: layout-model output for this page; ``layout_dets``
        entries provide ``poly``, ``category_id`` and ``score``.
    :return: list of ``(L, U, R, D)`` tuples sorted by (U, L).
    """
    xf_json = json_from_DocXchain_obj
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)

    # Category ids used by the layout model:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedding (inline formula),
    # 14 isolated (display formula)
    pageNo_bboxs = []
    for xf in xf_json['layout_dets']:
        if not (xf['category_id'] == 4 and xf['score'] >= 0.3):
            continue
        poly = xf['poly']
        L = poly[0] / horizontal_scale_ratio
        U = poly[1] / vertical_scale_ratio
        R = poly[2] / horizontal_scale_ratio
        D = poly[5] / vertical_scale_ratio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        pageNo_bboxs.append((L, U, R, D))

    # Reading order: top to bottom, then left to right.
    pageNo_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    return pageNo_bboxs
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Return the bounding boxes of the tables detected on one page.

    Only layout detections with ``category_id == 7`` (table) and ``score >= 0.3``
    are kept. Coordinates are rescaled from the layout model's page size
    (``page_info.width`` / ``page_info.height``) to the size of the page rendered
    at 72 DPI, and normalised so that L <= R and U <= D.

    :param page_ID: index of the current page in the document. Kept for interface
        compatibility; it does not influence the result.
    :param page: the current page as loaded by fitz (PyMuPDF).
    :param json_from_DocXchain_obj: layout-model output for this page; ``layout_dets``
        entries provide ``poly``, ``category_id`` and ``score``.
    :return: list of ``(L, U, R, D)`` tuples sorted by (U, L).
    """
    # Page extent in pixels at 72 DPI; the page origin is taken to be (0, 0).
    pix = page.get_pixmap(dpi=72)
    pageL, pageU = 0, 0
    pageR, pageD = int(pix.w), int(pix.h)

    xf_json = json_from_DocXchain_obj
    LR_scaleRatio = xf_json['page_info']['width'] / (pageR - pageL)
    UD_scaleRatio = xf_json['page_info']['height'] / (pageD - pageU)

    # Category ids used by the layout model:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedding (inline formula),
    # 14 isolated (display formula)
    table_bboxs = []
    for xf in xf_json['layout_dets']:
        if not (xf['category_id'] == 7 and xf['score'] >= 0.3):
            continue
        poly = xf['poly']
        L = poly[0] / LR_scaleRatio
        U = poly[1] / UD_scaleRatio
        R = poly[2] / LR_scaleRatio
        D = poly[5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        table_bboxs.append((L, U, R, D))

    # Reading order: top to bottom, then left to right.
    table_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    return table_bboxs
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
from magic_pdf.libs.commons import fitz
import json
import os
from pathlib import Path
from loguru import logger
from magic_pdf.libs.ocr_content_type import ContentType
# Tag stored in the "type" and "font" fields of spans that stand for an inline equation.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
# Tag stored in the "type" and "font" fields of spans that stand for an interline equation.
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
def combine_chars_to_pymudict(block_dict, char_dict):
    """
    Attach per-character data to the spans of a block-level PyMuPDF structure.

    ``block_dict`` may have been cropped, so blocks and lines are matched to
    ``char_dict`` by their bounding box rather than by position. Spans inside a
    matched line are paired by index.

    :param block_dict: list of blocks (each with "bbox" and "lines"); modified in place.
    :param char_dict: list of blocks with the same layout whose spans carry "chars".
    :return: ``block_dict``, with ``span["chars"]`` filled in wherever a counterpart exists.
    :raises KeyError: if a block or line bbox has no counterpart in ``char_dict``.
    """
    blocks_with_chars = {tuple(item["bbox"]): item for item in char_dict}
    for block in block_dict:
        char_block = blocks_with_chars[tuple(block["bbox"])]
        lines_with_chars = {tuple(item["bbox"]): item for item in char_block["lines"]}
        for line in block["lines"]:
            # tuple() so that list-typed bboxes are hashable, as for the block lookup.
            char_line = lines_with_chars[tuple(line["bbox"])]
            for k, span in enumerate(line["spans"]):
                try:
                    span["chars"] = char_line["spans"][k]["chars"]
                except (IndexError, KeyError):
                    # No counterpart for this span: report it and leave the span
                    # untouched rather than attaching another span's characters.
                    logger.error(char_line)
    return block_dict
def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
    """
    Return the share of ``min_bbox``'s area that is covered by ``bbox1``.

    Both boxes are (x0, y0, x1, y1). The result is 0.0 when the boxes do not
    intersect and 0 when ``min_bbox`` has no area.
    """
    # Corners of the intersection rectangle.
    left = max(bbox1[0], min_bbox[0])
    top = max(bbox1[1], min_bbox[1])
    right = min(bbox1[2], min_bbox[2])
    bottom = min(bbox1[3], min_bbox[3])
    if right < left or bottom < top:
        return 0.0

    overlap_area = (right - left) * (bottom - top)
    reference_area = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0])
    if reference_area == 0:
        return 0
    return overlap_area / reference_area
def _is_xin(bbox1, bbox2):
    """Return True when more than 60% of the smaller box's area is covered by the larger one."""

    def _area(box):
        return abs(box[2] - box[0]) * abs(box[3] - box[1])

    # On equal areas bbox2 plays the role of the smaller box.
    if _area(bbox1) < _area(bbox2):
        smaller, larger = bbox1, bbox2
    else:
        smaller, larger = bbox2, bbox1
    return calculate_overlap_area_2_minbox_area_ratio(larger, smaller) > 0.6
def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
    """Remove, in place, every text block of which at least 70% lies inside an interline equation box.

    Returns the same ``text_blocks`` list.
    """
    for equation in interline_bboxes:
        covered_blocks = [
            blk
            for blk in text_blocks
            if calculate_overlap_area_2_minbox_area_ratio(equation["bbox"], blk["bbox"]) >= 0.7
        ]
        for blk in covered_blocks:
            text_blocks.remove(blk)
    return text_blocks
def _is_in_or_part_overlap(box1, box2) -> bool:
    """
    Tell whether two bboxes intersect, either partially or by containment.

    A missing box (None) never overlaps anything.
    """
    if box1 is None or box2 is None:
        return False

    left1, top1, right1, bottom1 = box1
    left2, top2, right2, bottom2 = box2
    if right1 < left2:      # box1 is entirely to the left of box2
        return False
    if left1 > right2:      # box1 is entirely to the right of box2
        return False
    if bottom1 < top2:      # box1 is entirely above box2
        return False
    if top1 > bottom2:      # box1 is entirely below box2
        return False
    return True
def remove_text_block_overlap_interline_equation_bbox(
    interline_eq_bboxes, pymu_block_list
):
    """Strip text that partially overlaps interline equations.

    Every char of which more than 50% lies inside some interline equation box is
    removed. Spans, lines and blocks that end up empty are removed as well, and
    the bbox of everything that survives is recomputed from its remaining
    children. ``pymu_block_list`` is modified in place and returned (a fresh
    empty list is returned when nothing is left).
    """

    def _covered_by_equation(char_bbox):
        return any(
            calculate_overlap_area_2_minbox_area_ratio(eq["bbox"], char_bbox) > 0.5
            for eq in interline_eq_bboxes
        )

    def _enclosing_bbox(children):
        return (
            min([child["bbox"][0] for child in children]),
            min([child["bbox"][1] for child in children]),
            max([child["bbox"][2] for child in children]),
            max([child["bbox"][3] for child in children]),
        )

    emptied_blocks = []
    for text_block in pymu_block_list:
        emptied_lines = []
        for line in text_block["lines"]:
            emptied_spans = []
            for span in line["spans"]:
                covered_chars = [ch for ch in span["chars"] if _covered_by_equation(ch["bbox"])]
                for ch in covered_chars:
                    span["chars"].remove(ch)
                if span["chars"]:
                    # Shrink the span to the chars that are left.
                    span["bbox"] = _enclosing_bbox(span["chars"])
                else:
                    emptied_spans.append(span)

            for span in emptied_spans:
                line["spans"].remove(span)
            if line["spans"]:
                line["bbox"] = _enclosing_bbox(line["spans"])
            else:
                emptied_lines.append(line)

        for line in emptied_lines:
            text_block["lines"].remove(line)
        if text_block["lines"]:
            text_block["bbox"] = _enclosing_bbox(text_block["lines"])
        else:
            emptied_blocks.append(text_block)

    for blk in emptied_blocks:
        pymu_block_list.remove(blk)
    if not pymu_block_list:
        return []
    return pymu_block_list
def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
    """Append one synthetic text block per interline equation to ``pymu_block_list``.

    Each synthetic block holds a single line with a single span that carries the
    equation's LaTeX and reuses the equation's bbox at every level.
    """
    for equation in interline_eq_bboxes:
        eq_bbox = equation["bbox"]
        eq_span = {
            "size": 9.962599754333496,
            "type": TYPE_INTERLINE_EQUATION,
            "flags": 4,
            "font": TYPE_INTERLINE_EQUATION,
            "color": 0,
            "ascender": 0.9409999847412109,
            "descender": -0.3050000071525574,
            "latex": equation["latex"],
            "origin": [eq_bbox[0], eq_bbox[1]],
            "bbox": eq_bbox,
        }
        eq_line = {
            "spans": [eq_span],
            "wmode": 0,
            "dir": [1.0, 0.0],
            "bbox": eq_bbox,
        }
        pymu_block_list.append(
            {
                # Numbered by its position in the list at insertion time.
                "number": len(pymu_block_list),
                "type": 0,
                "bbox": eq_bbox,
                "lines": [eq_line],
            }
        )
def x_overlap_ratio(box1, box2):
    """Return the horizontal overlap of the two boxes divided by the width of ``box2``.

    Boxes are (x0, y0, x1, y1). The result is 0 when ``box2`` has zero width.
    """
    left1, _, right1, _ = box1
    left2, _, right2, _ = box2
    # Width of the shared horizontal interval, clamped at zero for disjoint boxes.
    shared_width = min(right1, right2) - max(left1, left2)
    if shared_width < 0:
        shared_width = 0
    # The denominator is box2's width.
    box2_width = right2 - left2
    if box2_width == 0:
        return 0
    return shared_width / box2_width
def __is_x_dir_overlap(bbox1, bbox2):
    """Return True when the horizontal extents of the two boxes intersect (touching counts)."""
    if bbox1[2] < bbox2[0]:
        # bbox1 ends before bbox2 starts.
        return False
    if bbox1[0] > bbox2[2]:
        # bbox1 starts after bbox2 ends.
        return False
    return True
def __y_overlap_ratio(box1, box2):
    """Return the vertical overlap of the two boxes divided by the height of ``box1``.

    Boxes are (x0, y0, x1, y1). The result is 0 when ``box1`` has zero height.
    """
    _, top1, _, bottom1 = box1
    _, top2, _, bottom2 = box2
    # Height of the shared vertical interval, clamped at zero for disjoint boxes.
    shared_height = min(bottom1, bottom2) - max(top1, top2)
    if shared_height < 0:
        shared_height = 0
    box1_height = bottom1 - top1
    if box1_height == 0:
        return 0
    return shared_height / box1_height
def replace_line_v2(eqinfo, line):
    """
    Replace the part of ``line`` that is covered by an inline equation with an equation span.

    All chars of the line that overlap the equation box in the x direction are
    collected, and [x0, x1] is the horizontal range they occupy. Spans lying
    completely inside that range are removed, a synthetic equation span is
    inserted, and the first and last overlapping spans are trimmed to the chars
    left and right of the range.

    :param eqinfo: dict with the equation's "bbox" and "latex".
    :param line: PyMuPDF line dict whose spans carry "chars"; modified in place.
    :return: True when a replacement happened, False when no char of the line
        belongs to the equation (the caller may then try the next line).
    """
    eq_bbox = eqinfo["bbox"]
    first_overlap_span = -1
    first_overlap_span_idx = -1
    last_overlap_span = -1
    delete_chars = []
    for i, span in enumerate(line["spans"]):
        if "chars" not in span:
            continue
        if span.get("_type", None) is not None:
            continue  # ignored: this span is an already inserted synthetic equation
        for char in span["chars"]:
            if __is_x_dir_overlap(eq_bbox, char["bbox"]):
                if first_overlap_span_idx == -1:
                    first_overlap_span = span
                    first_overlap_span_idx = i
                last_overlap_span = span
                delete_chars.append(char)

    # The first and the last overlapping char may belong mostly to the ordinary
    # text rather than to the equation; if so they are not replaced.
    if delete_chars and x_overlap_ratio(eq_bbox, delete_chars[0]["bbox"]) < 0.51:
        delete_chars.pop(0)
    if delete_chars and x_overlap_ratio(eq_bbox, delete_chars[-1]["bbox"]) < 0.51:
        delete_chars.pop()

    if not delete_chars:
        return False

    # Real horizontal range of the chars that are being replaced.
    x0 = min([b["bbox"][0] for b in delete_chars])
    x1 = max([b["bbox"][2] for b in delete_chars])

    # Remove the spans lying completely inside [x0, x1].
    delete_span = []
    for span in line["spans"]:
        span_box = span["bbox"]
        if x0 <= span_box[0] and span_box[2] <= x1:
            delete_span.append(span)
    for span in delete_span:
        line["spans"].remove(span)

    equation_span = {
        "size": 9.962599754333496,
        "type": TYPE_INLINE_EQUATION,
        "flags": 4,
        "font": TYPE_INLINE_EQUATION,
        "color": 0,
        "ascender": 0.9409999847412109,
        "descender": -0.3050000071525574,
        "latex": eqinfo['latex'],
        "origin": [x0, eq_bbox[1]],
        # Horizontal extent from the replaced chars, vertical extent from the equation.
        "bbox": [x0, eq_bbox[1], x1, eq_bbox[3]],
        "chars": delete_chars,
        "_eq_bbox": eq_bbox,
    }
    line["spans"].insert(first_overlap_span_idx + 1, equation_span)  # put the equation in

    # Split the first and the last overlapping span around the equation.
    first_span_chars = [
        char
        for char in first_overlap_span["chars"]
        if (char["bbox"][2] + char["bbox"][0]) / 2 < x0
    ]
    tail_span_chars = [
        char
        for char in last_overlap_span["chars"]
        if (char["bbox"][0] + char["bbox"][2]) / 2 > x1
    ]

    if len(first_span_chars) > 0:
        first_overlap_span["chars"] = first_span_chars
        first_overlap_span["text"] = "".join([char["c"] for char in first_span_chars])
        first_overlap_span["bbox"] = (
            first_overlap_span["bbox"][0],
            first_overlap_span["bbox"][1],
            max([ch["bbox"][2] for ch in first_span_chars]),
            first_overlap_span["bbox"][3],
        )
    else:
        # Nothing left in front of the equation: drop the span (unless already removed).
        if first_overlap_span not in delete_span:
            line["spans"].remove(first_overlap_span)

    if len(tail_span_chars) > 0:
        tail_x0 = min([ch["bbox"][0] for ch in tail_span_chars])
        tail_text = "".join([char["c"] for char in tail_span_chars])
        if last_overlap_span == first_overlap_span:
            # The equation sits in the middle of a single span: the tail needs a new span.
            last_span_to_insert = last_overlap_span.copy()
            last_span_to_insert["chars"] = tail_span_chars
            last_span_to_insert["text"] = tail_text
            if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]:
                last_span_to_insert["bbox"] = (
                    tail_x0,
                    min([ch["bbox"][1] for ch in tail_span_chars]),
                    max([ch["bbox"][2] for ch in tail_span_chars]),
                    max([ch["bbox"][3] for ch in tail_span_chars]),
                )
            else:
                last_span_to_insert["bbox"] = (
                    tail_x0,
                    last_overlap_span["bbox"][1],
                    last_overlap_span["bbox"][2],
                    last_overlap_span["bbox"][3],
                )
            # Insert right after the equation span.
            equation_idx = line["spans"].index(equation_span)
            line["spans"].insert(equation_idx + 1, last_span_to_insert)
        else:
            # Trim the existing last span in place.
            last_overlap_span["chars"] = tail_span_chars
            last_overlap_span["text"] = tail_text
            last_overlap_span["bbox"] = (
                tail_x0,
                last_overlap_span["bbox"][1],
                last_overlap_span["bbox"][2],
                last_overlap_span["bbox"][3],
            )
    else:
        # Nothing left behind the equation: drop the span (unless already handled).
        if (
            last_overlap_span not in delete_span
            and last_overlap_span != first_overlap_span
        ):
            line["spans"].remove(last_overlap_span)

    return True
def replace_eq_blk(eqinfo, text_block):
    """Replace an inline equation inside one line of ``text_block``.

    Lines are tried in order. A line is a candidate when ``_is_xin`` accepts
    the equation bbox against the line bbox, or when their vertical overlap
    ratio exceeds 0.6 (the overlap ratio is used because a line can be tall
    while the equation is narrow). If the span-level replacement fails on a
    candidate line (line heights reported by the PDF API are sometimes off),
    the next line is tried.

    Returns:
        True as soon as ``replace_line_v2`` succeeds on a line, False when no
        line could take the equation.
    """
    for candidate in text_block["lines"]:
        candidate_bbox = candidate["bbox"]
        located = (
            _is_xin(eqinfo["bbox"], candidate_bbox)
            or __y_overlap_ratio(eqinfo["bbox"], candidate_bbox) > 0.6
        )
        if located and replace_line_v2(eqinfo, candidate):
            return True
    return False
def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
    """Replace every inline equation inside the text block that holds it.

    For each equation, text blocks accepted by ``_is_xin`` are tried in order
    until one replacement succeeds; every failed attempt is logged as a
    warning.

    Returns:
        The ``raw_text_blocks`` object that was passed in.
    """
    for eqinfo in inline_equation_bboxes:
        eqbox = eqinfo["bbox"]
        for blk in raw_text_blocks:
            if not _is_xin(eqbox, blk["bbox"]):
                continue
            if replace_eq_blk(eqinfo, blk):
                break
            logger.warning(f"行内公式没有替换成功:{eqinfo} ")
    return raw_text_blocks
def remove_chars_in_text_blocks(text_blocks):
    """Drop the ``chars`` entry from every span of every line, in place.

    Spans without a ``chars`` key are left untouched.

    Returns:
        The same ``text_blocks`` object that was passed in.
    """
    every_span = (
        span
        for blk in text_blocks
        for line in blk["lines"]
        for span in line["spans"]
    )
    for span in every_span:
        span.pop("chars", "no such key")
    return text_blocks
def replace_equations_in_textblock(
    raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes
):
    """
    Replace interline and inline equations in the text blocks with LaTeX.

    Steps, in order:
      1. de-duplicate: text that lies inside an interline equation bbox;
      2. de-duplicate: text that overlaps an interline equation bbox;
      3. insert the interline equations as text blocks;
      4. replace the inline equations.

    Returns the text blocks produced by the last step.
    """
    blocks = remove_text_block_in_interline_equation_bbox(
        interline_equation_bboxes, raw_text_blocks
    )
    blocks = remove_text_block_overlap_interline_equation_bbox(
        interline_equation_bboxes, blocks
    )
    insert_interline_equations_textblock(interline_equation_bboxes, blocks)
    return replace_inline_equations(inline_equation_bboxes, blocks)
def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
    """Debug helper: run equation replacement per page and paint the spans.

    Reads the per-page info from ``json_path`` (keys ``page_<n>``), runs the
    same de-duplication / insertion / inline replacement steps as
    ``replace_equations_in_textblock`` on every page of ``pdf_path`` and draws
    a translucent rectangle over every span: blue for ``_type == "first"``,
    green for ``"tail"``, black for inline equations, no fill otherwise.

    Side effects:
        * writes ``<stem>.step3-消除行内公式text_block.pdf`` next to the
          input PDF (an existing file is removed first);
        * dumps the loaded JSON object to ``equations_test/final_json.json``.

    Returns:
        Path of the annotated PDF that was written.
    """
    new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf"
    with open(json_path, "r", encoding="utf-8") as f:
        obj = json.loads(f.read())

    if os.path.exists(new_pdf):
        os.remove(new_pdf)

    # Only one document handle is needed; it is closed even if drawing fails.
    new_doc = fitz.open(pdf_path)
    try:
        for page_idx in range(len(new_doc)):
            page = new_doc[page_idx]
            page_info = obj[f"page_{page_idx}"]
            inline_equation_bboxes = page_info["inline_equations"]
            interline_equation_bboxes = page_info["interline_equations"]
            raw_text_blocks = page_info["preproc_blocks"]
            raw_text_blocks = remove_text_block_in_interline_equation_bbox(
                interline_equation_bboxes, raw_text_blocks
            )  # de-duplicate, step 1: text inside an equation
            raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(
                interline_equation_bboxes, raw_text_blocks
            )  # de-duplicate, step 2: text overlapping an equation
            insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
            raw_text_blocks = replace_inline_equations(
                inline_equation_bboxes, raw_text_blocks
            )

            # Tint every span so duplicated equations are easy to spot.
            for blk in raw_text_blocks:
                for line in blk["lines"]:
                    for span in line["spans"]:
                        shape_page = page.new_shape()
                        span_type = span.get("_type")
                        if span_type == "first":
                            color = fitz.pdfcolor["blue"]
                        elif span_type == "tail":
                            color = fitz.pdfcolor["green"]
                        elif span_type == TYPE_INLINE_EQUATION:
                            color = fitz.pdfcolor["black"]
                        else:
                            color = None

                        shape_page.draw_rect(span["bbox"])
                        shape_page.finish(color=None, fill=color, fill_opacity=0.3)
                        shape_page.commit()

        new_doc.save(new_pdf)
    finally:
        new_doc.close()

    logger.info(f"save ok {new_pdf}")
    final_json = json.dumps(obj, ensure_ascii=False, indent=2)
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be explicit.
    with open("equations_test/final_json.json", "w", encoding="utf-8") as f:
        f.write(final_json)

    return new_pdf
if __name__ == "__main__":
    # Manual debugging entry point; the call below is intentionally disabled.
    # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
    pass
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.textbase import get_text_block_base_info
def fix_image_vertical(image_bboxes: list, text_blocks: list):
    """
    Trim image boxes vertically so they stop cutting into text blocks.

    An image is adjusted against a text block only when the two partially
    overlap and one of them spans the other horizontally. Text starting above
    the image top pushes the image top below the text; otherwise text ending
    below the image bottom pulls the image bottom above the text.
    Only the vertical edges are touched.

    The image boxes are modified in place and the same list is returned.
    """
    for img in image_bboxes:
        for blk in text_blocks:
            txt = blk["bbox"]
            if not _is_part_overlap(txt, img):
                continue
            text_within_image = txt[0] >= img[0] and txt[2] <= img[2]
            image_within_text = txt[0] <= img[0] and txt[2] >= img[2]
            if not (text_within_image or image_within_text):
                continue
            if txt[1] < img[1]:  # text sits above the image
                img[1] = txt[3] + 1
            elif txt[3] > img[3]:  # text sits below the image
                img[3] = txt[1] - 1
    return image_bboxes
def __merge_if_common_edge(bbox1, bbox2):
    """Return the union of two boxes that share an edge coordinate, else None.

    The boxes are merged when they have the same top or bottom y and their
    x ranges touch/overlap, or the same left or right x and their y ranges
    touch/overlap.
    """
    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2

    same_horizontal_edge = ay0 == by0 or ay1 == by1
    x_ranges_meet = max(ax0, bx0) <= min(ax1, bx1)
    same_vertical_edge = ax0 == bx0 or ax1 == bx1
    y_ranges_meet = max(ay0, by0) <= min(ay1, by1)

    if (same_horizontal_edge and x_ranges_meet) or (same_vertical_edge and y_ranges_meet):
        return [min(ax0, bx0), min(ay0, by0), max(ax1, bx1), max(ay1, by1)]
    return None
def fix_seperated_image(image_bboxes: list):
    """
    Merge pairs of images that share an edge.

    Each image is paired with the first later image it can be merged with
    (see ``__merge_if_common_edge``). The result lists the merged boxes first,
    followed by the images that took part in no merge, in input order.
    """
    merged = []
    consumed = []
    total = len(image_bboxes)
    for i, first in enumerate(image_bboxes):
        for j in range(i + 1, total):
            combined = __merge_if_common_edge(first, image_bboxes[j])
            if combined is None:
                continue
            merged.append(combined)
            consumed.extend((i, j))
            break

    untouched = [box for idx, box in enumerate(image_bboxes) if idx not in consumed]
    return merged + untouched
def __check_img_title_pattern(text):
    """
    Tell whether a piece of text looks like a figure caption.

    After stripping surrounding whitespace, the text must start with
    "fig"/"figure" or "scheme" (case-insensitive).
    """
    stripped = text.strip()
    return any(
        re.match(pattern, stripped, re.IGNORECASE) is not None
        for pattern in (r"^(fig|figure).*", r"^(scheme).*")
    )
def __get_fig_caption_text(text_block):
    """Return ``(text, line_count)`` for a text block.

    The text is every span's text joined with single spaces, with each
    occurrence of the artefact "Ž . " removed; the count is the number of
    lines in the block.
    """
    lines = text_block['lines']
    pieces = []
    for line in lines:
        for span in line['spans']:
            pieces.append(span['text'])
    caption = " ".join(pieces).replace("Ž . ", '')
    return caption, len(lines)
def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
    """
    Keep walking downwards from an already-found caption and absorb following
    text blocks whose font colour, size and type all equal the caption's.

    ``text_block`` is the caption found so far (it may be incomplete when a
    multi-line caption was split over several pymu blocks). ``image_box`` is
    grown in place to cover the combined caption area, and ``text_block`` is
    tagged with ``_image_caption = True``.
    """
    caption_box = list(text_block.copy()['bbox'])
    base_color, base_size, base_type = get_text_block_base_info(text_block)

    neighbour = find_bottom_nearest_text_bbox(pymu_blocks, caption_box)
    while neighbour:
        color, size, font_type = get_text_block_base_info(neighbour)
        if not (color == base_color and size == base_size and font_type == base_type):
            break
        neighbour_bbox = neighbour['bbox']
        caption_box[0] = min(caption_box[0], neighbour_bbox[0])
        caption_box[2] = max(caption_box[2], neighbour_bbox[2])
        caption_box[3] = neighbour_bbox[3]
        neighbour = find_bottom_nearest_text_bbox(pymu_blocks, caption_box)

    image_box[0] = min(image_box[0], caption_box[0])
    image_box[1] = min(image_box[1], caption_box[1])
    image_box[2] = max(image_box[2], caption_box[2])
    image_box[3] = max(image_box[3], caption_box[3])
    text_block['_image_caption'] = True
def include_img_title(pymu_blocks, image_bboxes: list):
    """
    Look below and above every image for a text block that matches the figure
    caption pattern and merge it into the image bbox.
    When both sides have a caption, the one closest to the image is used.
    ---
    Captions to the left and to the right of the image are searched as well.

    Each bbox in ``image_bboxes`` is grown in place; text blocks are tagged
    with ``_image_caption = True`` as captions are consumed. The same
    ``image_bboxes`` list is returned.
    """
    for tb in image_bboxes:
        # Search below the image first.
        max_find_cnt = 3  # look at no more than 3 blocks in each direction
        temp_box = tb.copy()
        while max_find_cnt>0:
            text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
            if text_block_btn:
                txt, line_cnt = __get_fig_caption_text(text_block_btn)
                if len(txt.strip())>0:
                    if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:  # line_cnt < 3 lets the search step over sub-titles, or short text under the image that the image detector did not include in the image
                        max_find_cnt = max_find_cnt - 1
                        temp_box[3] = text_block_btn['bbox'][3]
                        continue
                    else:
                        # Either a caption was found or the block is too long to skip.
                        break
                else:
                    temp_box[3] = text_block_btn['bbox'][3]  # keep the width, extend downwards past the empty block
                    max_find_cnt = max_find_cnt - 1
            else:
                break

        # Same search, upwards.
        max_find_cnt = 3  # look at no more than 3 blocks in each direction
        temp_box = tb.copy()
        while max_find_cnt>0:
            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
            if text_block_top:
                txt, line_cnt = __get_fig_caption_text(text_block_top)
                if len(txt.strip())>0:
                    if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
                        max_find_cnt = max_find_cnt - 1
                        temp_box[1] = text_block_top['bbox'][1]
                        continue
                    else:
                        break
                else:
                    b = text_block_top['bbox']
                    temp_box[1] = b[1]  # keep the width, extend upwards past the empty block
                    max_find_cnt = max_find_cnt - 1
            else:
                break

        # Both neighbours exist and neither has been claimed as a caption yet.
        if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
            btn_text, _ = __get_fig_caption_text(text_block_btn)
            top_text, _ = __get_fig_caption_text(text_block_top)
            if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
                # Pick the caption closest to the image.
                btn_text_distance = text_block_btn['bbox'][1] - tb[3]
                top_text_distance = tb[1] - text_block_top['bbox'][3]
                if btn_text_distance<top_text_distance:  # caption is below
                    __find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
                else:
                    text_block = text_block_top
                    tb[0] = min(tb[0], text_block['bbox'][0])
                    tb[1] = min(tb[1], text_block['bbox'][1])
                    tb[2] = max(tb[2], text_block['bbox'][2])
                    tb[3] = max(tb[3], text_block['bbox'][3])
                    # NOTE(review): the top block was merged here, yet the bottom
                    # block is the one tagged as consumed - confirm this is intended.
                    text_block_btn['_image_caption'] = True
                continue

        text_block = text_block_btn  # find_bottom_nearest_text_bbox(pymu_blocks, tb)
        if text_block and text_block.get("_image_caption", False) is False:
            first_text_line, _ = __get_fig_caption_text(text_block)
            if __check_img_title_pattern(first_text_line):
                # Once a caption is recognised, keep looking in the same direction
                # for text blocks with the same colour, size and font.
                __find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
                continue

        text_block = text_block_top  # find_top_nearest_text_bbox(pymu_blocks, tb)
        if text_block and text_block.get("_image_caption", False) is False:
            first_text_line, _ = __get_fig_caption_text(text_block)
            if __check_img_title_pattern(first_text_line):
                tb[0] = min(tb[0], text_block['bbox'][0])
                tb[1] = min(tb[1], text_block['bbox'][1])
                tb[2] = max(tb[2], text_block['bbox'][2])
                tb[3] = max(tb[3], text_block['bbox'][3])
                text_block['_image_caption'] = True
                continue

        # Search to the left and to the right; for now only one block per side.
        """向左、向右寻找,暂时只寻找一次"""
        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
        if left_text_block and left_text_block.get("_image_caption", False) is False:
            first_text_line, _ = __get_fig_caption_text(left_text_block)
            if __check_img_title_pattern(first_text_line):
                tb[0] = min(tb[0], left_text_block['bbox'][0])
                tb[1] = min(tb[1], left_text_block['bbox'][1])
                tb[2] = max(tb[2], left_text_block['bbox'][2])
                tb[3] = max(tb[3], left_text_block['bbox'][3])
                left_text_block['_image_caption'] = True
                continue

        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
        if right_text_block and right_text_block.get("_image_caption", False) is False:
            first_text_line, _ = __get_fig_caption_text(right_text_block)
            if __check_img_title_pattern(first_text_line):
                tb[0] = min(tb[0], right_text_block['bbox'][0])
                tb[1] = min(tb[1], right_text_block['bbox'][1])
                tb[2] = max(tb[2], right_text_block['bbox'][2])
                tb[3] = max(tb[3], right_text_block['bbox'][3])
                right_text_block['_image_caption'] = True
                continue

    return image_bboxes
def combine_images(image_bboxes: list):
    """
    Merge images that overlap (contained or partially overlapping).

    When a later image overlaps an earlier one, the earlier box is grown in
    place to the union of both and the later one is dropped from the result.
    Returns a new list with the surviving boxes in input order.
    """
    absorbed = []
    total = len(image_bboxes)
    for i in range(total):
        keeper = image_bboxes[i]
        for j in range(i + 1, total):
            if j in absorbed:
                continue
            other = image_bboxes[j]
            if not _is_in_or_part_overlap(keeper, other):
                continue
            # Grow the earlier box to the union of both.
            union = (
                min(keeper[0], other[0]),
                min(keeper[1], other[1]),
                max(keeper[2], other[2]),
                max(keeper[3], other[3]),
            )
            keeper[0], keeper[1], keeper[2], keeper[3] = union
            absorbed.append(j)

    return [box for idx, box in enumerate(image_bboxes) if idx not in absorbed]
\ No newline at end of file
from magic_pdf.libs.commons import fitz # pyMuPDF库
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
## version 2
def get_merged_line(page):
    """
    Pick the horizontal rules out of the vector drawings of a page and merge
    broken segments that belong to the same rule.

    :param page: page object providing ``get_drawings()``; each drawing's
                 ``"rect"`` must expose an ``irect`` that unpacks to (L, U, R, D)
    :return: list of (L, U, R, D) tuples, one per merged horizontal line

    Drawings at most 3 units high count as horizontal. Lines whose combined
    vertical extent is within 5 units are grouped; inside a group, segments
    sorted left-to-right are fused while the gap to the running right edge is
    below 5 units.
    """
    rects = [drawing["rect"].irect for drawing in page.get_drawings()]  # (L, U, R, D)
    horizontals = [(L, U, R, D) for L, U, R, D in rects if abs(D - U) <= 3]

    # Group the lines that sit at (almost) the same height.
    groups = []
    visited = [False] * len(horizontals)
    for i, (L1, U1, R1, D1) in enumerate(horizontals):
        if visited[i]:
            continue
        group = [(L1, U1, R1, D1)]
        for j, (L2, U2, R2, D2) in enumerate(horizontals):
            if i == j or visited[j]:
                continue
            if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5:
                group.append((L2, U2, R2, D2))
                visited[j] = True
        groups.append(group)

    merged = []
    for group in groups:
        group.sort(key=lambda seg: (seg[0], seg[2]))
        LL, UU, RR, DD = group[0]
        for L1, U1, R1, D1 in group[1:]:
            if (L1 - RR) >= 5:
                # Real gap: emit the finished line and start a new one from this
                # segment. The right edge has to restart as well, otherwise the
                # new line would keep the previous line's (smaller) right edge.
                merged.append((LL, UU, RR, DD))
                LL, UU, RR, DD = L1, U1, R1, D1
            else:
                RR = max(RR, R1)
        merged.append((LL, UU, RR, DD))
    return merged
def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
    """
    Snap table boxes to nearby horizontal rules and optionally pull the table
    title into the box.

    :param page: the current page as read by fitz
    :param table_bboxes: list of table boxes, each unpacking to (L, U, R, D)
    :param include_table_title: whether the table title should be enclosed too
    :param scan_line_num: how many of the nearest text blocks above the table
                          are scanned for a title
    :return: list of ``[L-2, U-2, R+2, D+2]`` boxes, one per input table
    """
    merged_lines = get_merged_line(page)
    adjusted = []
    for L, U, R, D in table_bboxes:
        x_tol = (R - L) * 0.1  # only rules within 10% of the table width count
        y_tol = (D - U) * 0.1  # only rules within 10% of the table height count
        tops, bottoms, lefts, rights = [], [], [], []

        for seg in merged_lines:
            seg_left, seg_y, seg_right = seg[0], seg[1], seg[2]
            spans_table_width = (
                (L - x_tol) <= seg_left <= (L + x_tol)
                and (R - x_tol) <= seg_right <= (R + x_tol)
            )
            if not spans_table_width:
                continue
            if (U - y_tol) < seg_y < (U + y_tol):  # close to the top border
                tops.append(seg_y)
            elif (D - y_tol) < seg_y < (D + y_tol):  # close to the bottom border
                bottoms.append(seg_y)
            else:
                continue
            lefts.append(seg_left)
            rights.append(seg_right)

        if tops:
            U = min(tops)
        if bottoms:
            D = max(bottoms)
        if lefts:
            L = min(lefts)
        if rights:
            R = max(rights)

        if include_table_title:
            text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
            above = []
            for blk in text_blocks:
                bx0, bx1 = blk['bbox'][0], blk['bbox'][2]
                # Skip text entirely to the left or right of the table
                # (e.g. text that belongs to another column).
                if (bx0 < L and bx1 < L) or (bx0 > R and bx1 > R):
                    continue
                if (U - blk['bbox'][3]) > 0:  # block ends above the table top
                    above.append(blk)
            # Nearest block first; blocks at the same height go left to right.
            above.sort(key=lambda blk: (U - blk['bbox'][3], blk['bbox'][0]))

            for candidate in above[:max(scan_line_num, 0)]:
                candidate_lines = candidate['lines']
                if not candidate_lines:
                    continue
                text = candidate_lines[0]['spans'][0]['text']  # text of the first span
                if re.match('Table', text) or re.match('表', text):
                    if candidate['bbox'][1] < D:  # guards against a negative-height box
                        U = candidate['bbox'][1]

        adjusted.append([L-2, U-2, R+2, D+2])
    return adjusted
def __check_table_title_pattern(text):
    """
    Tell whether a piece of text is a table title.

    The text must start (no leading whitespace is stripped) with "table",
    one whitespace character and a digit, case-insensitively.

    Every pattern in the list is consulted; the result is False only when
    none matches (including when the list is empty).
    """
    patterns = [r'^table\s\d+']
    return any(re.match(pattern, text, re.IGNORECASE) for pattern in patterns)
def fix_table_text_block(pymu_blocks, table_bboxes: list):
    """
    Adjust tables against the text blocks they intersect: a non-title text
    block that overlaps a table grows the table bbox (in place) to the union
    of both, and the block is tagged ``_table = True`` so that no other table
    claims it.
    Example document: tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf

    Returns the same ``table_bboxes`` list.
    """
    for tb in table_bboxes:
        # Snapshot of the table edges taken before any growth below.
        (L, U, R, D) = tb
        for block in pymu_blocks:
            if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
                txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
                if not __check_table_title_pattern(txt) and block.get("_table", False) is False:  # Table titles are not merged here: a later step handles titles uniformly, and merging now could make that step grab the title of another table when two tables follow each other.
                    tb[0] = min(tb[0], block['bbox'][0])
                    tb[1] = min(tb[1], block['bbox'][1])
                    tb[2] = max(tb[2], block['bbox'][2])
                    tb[3] = max(tb[3], block['bbox'][3])
                    block['_table'] = True  # placeholder so another table does not take this block again

                # A table title that partially overlaps the table is shrunk so
                # that the two no longer overlap.
                """如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠"""
                if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
                    block['bbox'] = list(block['bbox'])
                    # NOTE(review): both conditions below can hold for the same
                    # title, which would leave its bbox with top > bottom -
                    # confirm whether these were meant to be mutually exclusive.
                    if block['bbox'][3] > U:
                        block['bbox'][3] = U-1
                    if block['bbox'][1] < D:
                        block['bbox'][1] = D+1

    return table_bboxes
def __get_table_caption_text(text_block):
    """Return ``(text, line_count)`` for a text block.

    The text is the span texts joined by single spaces with every "Ž . "
    artefact removed; the count is the number of lines in the block.
    """
    span_texts = [
        span['text']
        for line in text_block['lines']
        for span in line['spans']
    ]
    cleaned = " ".join(span_texts).replace("Ž . ", '')
    return cleaned, len(text_block['lines'])
def include_table_title(pymu_blocks, table_bboxes: list):
    """
    Pull the table title into the table: each bbox in ``table_bboxes`` is
    grown in place to cover a neighbouring text block that matches the table
    title pattern. Blocks above and below are examined first, then one block
    to the left and one to the right.

    Returns the same ``table_bboxes`` list.
    """
    for tb in table_bboxes:
        max_find_cnt = 3  # look at no more than 3 blocks upwards
        temp_box = tb.copy()
        while max_find_cnt>0:
            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
            if text_block_top:
                txt, line_cnt = __get_table_caption_text(text_block_top)
                if len(txt.strip())>0:
                    # Short (< 3 lines) non-title blocks are stepped over.
                    if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
                        max_find_cnt = max_find_cnt -1
                        temp_box[1] = text_block_top['bbox'][1]
                        continue
                    else:
                        break
                else:
                    temp_box[1] = text_block_top['bbox'][1]  # keep the width, extend upwards past the empty block
                    max_find_cnt = max_find_cnt - 1
            else:
                break

        max_find_cnt = 3  # same search, downwards
        temp_box = tb.copy()
        while max_find_cnt>0:
            text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
            if text_block_bottom:
                txt, line_cnt = __get_table_caption_text(text_block_bottom)
                if len(txt.strip())>0:
                    if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
                        max_find_cnt = max_find_cnt - 1
                        temp_box[3] = text_block_bottom['bbox'][3]
                        continue
                    else:
                        break
                else:
                    temp_box[3] = text_block_bottom['bbox'][3]
                    max_find_cnt = max_find_cnt - 1
            else:
                break

        # Both neighbours exist and neither has been claimed as a caption yet.
        if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
            btn_text, _ = __get_table_caption_text(text_block_bottom)
            top_text, _ = __get_table_caption_text(text_block_top)
            if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text):  # a table caption on both sides
                # Use the closest one.
                btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
                top_text_distance = tb[1] - text_block_top['bbox'][3]
                text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
                tb[0] = min(tb[0], text_block['bbox'][0])
                tb[1] = min(tb[1], text_block['bbox'][1])
                tb[2] = max(tb[2], text_block['bbox'][2])
                tb[3] = max(tb[3], text_block['bbox'][3])
                # NOTE(review): the bottom block is tagged even when the top
                # block was the one merged - confirm this is intended.
                text_block_bottom['_table_caption'] = True
                continue

        # None of the above applied. (The original comment says "search
        # downwards", but the block above is the one examined first.)
        text_block = text_block_top
        if text_block and text_block.get("_table_caption", False) is False:
            first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
                tb[0] = min(tb[0], text_block['bbox'][0])
                tb[1] = min(tb[1], text_block['bbox'][1])
                tb[2] = max(tb[2], text_block['bbox'][2])
                tb[3] = max(tb[3], text_block['bbox'][3])
                text_block['_table_caption'] = True
                continue

        text_block = text_block_bottom
        if text_block and text_block.get("_table_caption", False) is False:
            first_text_line, _ = __get_table_caption_text(text_block)
            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
                tb[0] = min(tb[0], text_block['bbox'][0])
                tb[1] = min(tb[1], text_block['bbox'][1])
                tb[2] = max(tb[2], text_block['bbox'][2])
                tb[3] = max(tb[3], text_block['bbox'][3])
                text_block['_table_caption'] = True
                continue

        # Search to the left and to the right; for now only one block per side.
        # NOTE(review): the left/right branches read and write the
        # "_image_caption" key rather than "_table_caption" - confirm.
        """向左、向右寻找,暂时只寻找一次"""
        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
        if left_text_block and left_text_block.get("_image_caption", False) is False:
            first_text_line, _ = __get_table_caption_text(left_text_block)
            if __check_table_title_pattern(first_text_line):
                tb[0] = min(tb[0], left_text_block['bbox'][0])
                tb[1] = min(tb[1], left_text_block['bbox'][1])
                tb[2] = max(tb[2], left_text_block['bbox'][2])
                tb[3] = max(tb[3], left_text_block['bbox'][3])
                left_text_block['_image_caption'] = True
                continue

        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
        if right_text_block and right_text_block.get("_image_caption", False) is False:
            first_text_line, _ = __get_table_caption_text(right_text_block)
            if __check_table_title_pattern(first_text_line):
                tb[0] = min(tb[0], right_text_block['bbox'][0])
                tb[1] = min(tb[1], right_text_block['bbox'][1])
                tb[2] = max(tb[2], right_text_block['bbox'][2])
                tb[3] = max(tb[3], right_text_block['bbox'][3])
                right_text_block['_image_caption'] = True
                continue

    return table_bboxes
\ No newline at end of file
import collections
def get_main_text_font(pdf_docs):
    """Return the font that carries the most characters in the document.

    Fonts are weighted by the number of characters in each span rather than
    by the number of spans. Spans without a ``font`` key or with empty text
    are ignored, as are blocks without ``lines``.

    Args:
        pdf_docs: iterable of pages; each page must provide
            ``get_text('dict')`` returning a mapping with a ``'blocks'`` entry.

    Returns:
        The name of the dominant font, or None when no span with text and a
        font was found (e.g. an empty or image-only document).
    """
    font_names = collections.Counter()
    for page in pdf_docs:
        blocks = page.get_text('dict')['blocks']
        if blocks is None:
            continue
        for block in blocks:
            lines = block.get('lines')
            if lines is None:
                continue
            for line in lines:
                for span in line['spans']:
                    if 'font' in span and len(span['text']) > 0:
                        font_names[span['font']] += len(span['text'])

    if not font_names:
        # Nothing to rank; avoid indexing into an empty most_common() result.
        return None
    return font_names.most_common(1)[0][0]
from loguru import logger
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
calculate_iou, calculate_vertical_projection_overlap_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Collect all detected blocks into 13-slot bbox rows and clean them up.

    Each row is ``[x0, y0, x1, y1, None, None, None, block_type, None, None,
    None, None, score]``.

    Returns:
        ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where
        ``drop_reasons`` comes from ``remove_overlap_between_bbox_for_block``.
    """
    def _as_row(block, block_type):
        x0, y0, x1, y1 = block['bbox']
        return [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]]

    typed_sources = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    all_bboxes = [_as_row(block, kind) for blocks, kind in typed_sources for block in blocks]

    # --- resolve nested blocks ---
    # Text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equation vs. title/text conflicts come in two flavours:
    # 1. IoU with a text box close to 1 -> trust the interline equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. equation contained in a much larger text box -> trust the text box;
    #    the equation box is dropped by the big-box-contains-small-box step below.

    # Discarded blocks wider than 1/3 of the page, taller than 10 and located
    # in the lower half of the page are treated as footnotes.
    all_discarded_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append(_as_row(discarded, BlockType.Discarded))
        # Footnotes are added to all_bboxes so they take part in the layout.
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(_as_row(discarded, BlockType.Footnote))

    # Big boxes may still contain small ones: remove the small ones.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining boxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons
def add_bboxes(blocks, block_type, bboxes):
    """Append one bbox row per block to ``bboxes`` (in place).

    A row is ``[x0, y0, x1, y1, None, None, None, block_type, None, None,
    None, None, score]``; image/table body, caption and footnote types get the
    block's ``group_id`` appended as a 14th element.
    """
    for block in blocks:
        x0, y0, x1, y1 = block['bbox']
        carries_group_id = block_type in [
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
        ]
        row = [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]]
        if carries_group_id:
            row.append(block["group_id"])
        bboxes.append(row)
def ocr_prepare_bboxes_for_layout_split_v2(
    img_body_blocks, img_caption_blocks, img_footnote_blocks,
    table_body_blocks, table_caption_blocks, table_footnote_blocks,
    discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
):
    """Build and clean the bbox rows used for layout splitting (v2).

    Returns:
        ``(all_bboxes, all_discarded_blocks)``. Blocks found below a footnote
        are moved from the first list to the second.
    """
    all_bboxes = []
    typed_sources = (
        (img_body_blocks, BlockType.ImageBody),
        (img_caption_blocks, BlockType.ImageCaption),
        (img_footnote_blocks, BlockType.ImageFootnote),
        (table_body_blocks, BlockType.TableBody),
        (table_caption_blocks, BlockType.TableCaption),
        (table_footnote_blocks, BlockType.TableFootnote),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for blocks, kind in typed_sources:
        add_bboxes(blocks, kind, all_bboxes)

    # --- resolve nested blocks ---
    # Text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equation vs. title/text conflicts come in two flavours:
    # 1. IoU with a text box close to 1 -> trust the interline equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. equation contained in a much larger text box -> trust the text box;
    #    the equation box is dropped by the big-box-contains-small-box step below.

    # Discarded blocks.
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

    # Footnotes: wider than 1/3 of the page, taller than 10 and located in the
    # lower half of the page.
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            footnote_blocks.append([x0, y0, x1, y1])

    # Everything below a footnote is moved to the discarded list.
    for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(block)
        all_discarded_blocks.append(block)

    # Big boxes may still contain small ones: remove the small ones.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining boxes so the later layout split does not fail.
    all_bboxes, _drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Return the blocks that sit underneath a footnote.

    A block qualifies when its top (y0) is at or below a footnote's bottom
    (y1) and ``calculate_vertical_projection_overlap_ratio`` of the block
    against that footnote is at least 0.8. Each block is reported once.
    """
    below = []
    for candidate in all_bboxes:
        cx0, cy0, cx1, cy1 = candidate[:4]
        for footnote in footnote_blocks:
            _fx0, _fy0, _fx1, footnote_bottom = footnote
            starts_below = cy0 >= footnote_bottom
            if (
                starts_below
                and calculate_vertical_projection_overlap_ratio((cx0, cy0, cx1, cy1), footnote) >= 0.8
                and candidate not in below
            ):
                below.append(candidate)
                break
    return below
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text blocks that almost coincide with an interline equation.

    A text block whose IoU with any interline equation block exceeds 0.8 is
    removed from ``all_bboxes`` (in place); the same list is returned.
    """
    # Collect the text and interline-equation rows first.
    text_blocks = [row for row in all_bboxes if row[7] == BlockType.Text]
    equation_blocks = [row for row in all_bboxes if row[7] == BlockType.InterlineEquation]

    shadowed_text = []
    for equation in equation_blocks:
        for text in text_blocks:
            if calculate_iou(equation[:4], text[:4]) > 0.8 and text not in shadowed_text:
                shadowed_text.append(text)

    for text in shadowed_text:
        all_bboxes.remove(text)
    return all_bboxes
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title blocks that almost coincide with a text block.

    A title block whose IoU with any text block exceeds 0.8 is removed from
    ``all_bboxes`` (in place); the same list is returned.
    """
    # Collect the text and title rows first.
    text_blocks = [row for row in all_bboxes if row[7] == BlockType.Text]
    title_blocks = [row for row in all_bboxes if row[7] == BlockType.Title]

    shadowed_titles = []
    for text in text_blocks:
        for title in title_blocks:
            if calculate_iou(text[:4], title[:4]) > 0.8 and title not in shadowed_titles:
                shadowed_titles.append(title)

    for title in shadowed_titles:
        all_bboxes.remove(title)
    return all_bboxes
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove the blocks that are mostly covered by a discarded block.

    A block is removed from ``all_bboxes`` (in place) when
    ``calculate_overlap_area_in_bbox1_area_ratio`` of the block against a
    discarded block's bbox exceeds 0.6. The same list is returned.
    """
    doomed = []
    for block in all_bboxes:
        for discarded in discarded_blocks:
            covered = calculate_overlap_area_in_bbox1_area_ratio(block[:4], discarded['bbox']) > 0.6
            if covered and block not in doomed:
                doomed.append(block)
                break

    for block in doomed:
        all_bboxes.remove(block)
    return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
    """Merge each overlapped smaller block into its larger partner.

    For every ordered pair of unequal blocks,
    ``get_minbox_if_overlap_by_ratio(..., 0.8)`` is asked for a box; when it
    returns one, the first block in ``all_bboxes`` whose coordinates equal
    that box is scheduled for removal and the other block of the pair is grown
    (in place) to the union of both. The scheduled blocks are removed from
    ``all_bboxes`` at the end, and the same list is returned.
    """
    # An overlapped small block must not simply be deleted: it is merged with
    # the large one into an even larger box.
    # The smaller blocks of each overlap are then removed.
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            # Compared by value, so two rows with identical content are skipped.
            if block1 != block2:
                block1_bbox = block1[:4]
                block2_bbox = block2[:4]
                overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
                if overlap_box is not None:
                    # Map the returned coordinates back to a row of all_bboxes.
                    block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
                    if block_to_remove is not None and block_to_remove not in need_remove:
                        large_block = block1 if block1 != block_to_remove else block2
                        x1, y1, x2, y2 = large_block[:4]
                        sx1, sy1, sx2, sy2 = block_to_remove[:4]
                        x1 = min(x1, sx1)
                        y1 = min(y1, sy1)
                        x2 = max(x2, sx2)
                        y2 = max(y2, sy2)
                        # Grow the surviving row in place to the union of both.
                        large_block[:4] = [x1, y1, x2, y2]
                        need_remove.append(block_to_remove)

    if len(need_remove) > 0:
        for block in need_remove:
            all_bboxes.remove(block)

    return all_bboxes
import fitz
from magic_pdf.layout.layout_sort import get_bboxes_layout
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def get_center_point(bbox):
    """
    Compute the centre point of a bounding box.

    Args:
        bbox (list): box coordinates, indexed as [x0, y0, x1, y1] (top-left
            and bottom-right corners).

    Returns:
        list: ``[x, y]`` of the centre.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    return [center_x, center_y]
def get_area(bbox):
    """
    Compute the area of a bounding box.

    Args:
        bbox (list): box coordinates, indexed as [x0, y0, x1, y1] (top-left
            and bottom-right corners).

    Returns:
        The width multiplied by the height of the box.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
def adjust_layouts(layout_bboxes, page_boundry, page_id):
    """Separate partially overlapping layout boxes, then sort them.

    For every partially overlapping pair the larger box (by area) is shrunk,
    in place, so that it ends 1 unit short of the smaller one. The overlap is
    treated as horizontal when the centres differ more in x than in y, and as
    vertical otherwise.

    Returns:
        ``(layout_bboxes, layout_tree)`` as produced by ``get_bboxes_layout``
        from the adjusted boxes padded to 13-element rows.
    """
    for idx, first in enumerate(layout_bboxes):
        for second in layout_bboxes[idx + 1:]:
            if not _is_part_overlap(first, second):
                continue

            # Decide which box is shrunk: the one with the larger area.
            if get_area(first) > get_area(second):
                bigger, smaller = first, second
            else:
                bigger, smaller = second, first

            big_cx, big_cy = get_center_point(bigger)
            small_cx, small_cy = get_center_point(smaller)
            dx = big_cx - small_cx
            dy = big_cy - small_cy

            if abs(dx) > abs(dy):  # side-by-side overlap
                if dx > 0 and bigger[0] < smaller[2]:
                    bigger[0] = smaller[2]+1
                if dx < 0 and bigger[2] > smaller[0]:
                    bigger[2] = smaller[0]-1
            else:  # stacked overlap
                if dy > 0 and bigger[1] < smaller[3]:
                    bigger[1] = smaller[3]+1
                if dy < 0 and bigger[3] > smaller[1]:
                    bigger[3] = smaller[1]-1

    # Pad every box to the row shape used for layout sorting.
    padded = [[box[0], box[1], box[2], box[3]] + [None] * 9 for box in layout_bboxes]
    layout_bboxes, layout_tree = get_bboxes_layout(padded, page_boundry, page_id)
    return layout_bboxes, layout_tree
def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
    """Turn detected sub-layout polygons into sorted, non-overlapping layout boxes.

    Each ``poly`` is reduced to an ``[x0, y0, x1, y1]`` box scaled by the ratios
    from ``get_scale_ratio``, boxes contained in another box are dropped, and
    the survivors are passed to ``adjust_layouts``.

    Args:
        layout_info (list): dicts with an eight-value ``'poly'`` entry; values
            0, 1 are used as the top-left corner and values 4, 5 as the
            bottom-right corner.
        page (fitz.Page): page whose ``rect`` provides the page boundary.
        ocr_page_info (dict): must provide ``['page_info']['page_no']`` (1-based).

    Returns:
        tuple: ``(layout_bboxes, layout_tree)`` as returned by ``adjust_layouts``.
    """
    page_id = ocr_page_info['page_info']['page_no'] - 1
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)

    # Convert every polygon into an integer box in page coordinates.
    layout_bboxes = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        layout_bboxes.append([
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ])

    # Keep only boxes that are not inside any other box.
    new_layout_bboxes = [
        box_i
        for i, box_i in enumerate(layout_bboxes)
        if not any(
            _is_in(box_i, box_j)
            for j, box_j in enumerate(layout_bboxes)
            if i != j
        )
    ]

    page_boundry = [0, 0, page.rect.width, page.rect.height]
    layout_bboxes, layout_tree = adjust_layouts(new_layout_bboxes, page_boundry, page_id)
    return layout_bboxes, layout_tree
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
_is_in_or_part_overlap_with_area_ratio,
calculate_overlap_area_in_bbox1_area_ratio)
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
    """Sort the spans of every line by x0 and wrap each line in a dict.

    Args:
        lines (list): list of lines, each a list of span dicts with a ``'bbox'``.
            Every inner list is sorted in place.

    Returns:
        list: one ``{'bbox': [x0, y0, x1, y1], 'spans': line}`` dict per line,
        where the bbox is the union of the span bboxes.
    """
    line_objects = []
    for line in lines:
        line.sort(key=lambda span: span['bbox'][0])
        boxes = [span['bbox'] for span in line]
        union_bbox = [
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        ]
        line_objects.append({'bbox': union_bbox, 'spans': line})
    return line_objects
def merge_spans_to_line(spans):
    """Group spans into lines by vertical position.

    Spans are sorted in place by y0. A span joins the current line when it
    overlaps the line's last span on the y axis (threshold 0.5). Interline
    equations, images and tables always sit on a line of their own.

    Args:
        spans (list): span dicts with ``'bbox'`` and ``'type'``; sorted in place.

    Returns:
        list: list of lines, each a list of spans. Empty input gives ``[]``.
    """
    if len(spans) == 0:
        return []

    def _is_standalone(candidate):
        # Span types that must never share a line with anything else.
        return candidate['type'] in [
            ContentType.InterlineEquation, ContentType.Image,
            ContentType.Table
        ]

    spans.sort(key=lambda span: span['bbox'][1])
    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        must_break = _is_standalone(span) or any(
            _is_standalone(member) for member in current_line)
        if not must_break and __is_overlaps_y_exceeds_threshold(
                span['bbox'], current_line[-1]['bbox'], 0.5):
            current_line.append(span)
        else:
            # Close the running line and start a new one with this span.
            lines.append(current_line)
            current_line = [span]

    # The last running line is never empty, so it is always flushed.
    lines.append(current_line)
    return lines
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layouts, build lines per layout, and tag the leftovers.

    A span belongs to the first layout that covers more than 60% of its area.
    Spans that match no layout are tagged ``DropTag.NOT_IN_LAYOUT``.

    Args:
        spans (list): span dicts; matched spans are removed from this list.
        layout_bboxes (list): dicts carrying a ``'layout_bbox'`` entry.

    Returns:
        tuple: ``(lines, dropped_spans)`` - line dicts produced by
        ``line_sort_spans_by_left_to_right`` and the spans outside every layout.
    """
    spans_per_layout = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']
        layout_sapns = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(
                span['bbox'], layout_bbox) > 0.6
        ]
        if len(layout_sapns) > 0:
            spans_per_layout.append(layout_sapns)
            # A span is consumed by the first layout that claims it.
            for claimed in layout_sapns:
                spans.remove(claimed)

    lines = []
    for layout_sapns in spans_per_layout:
        lines.extend(merge_spans_to_line(layout_sapns))

    lines = line_sort_spans_by_left_to_right(lines)

    dropped_spans = []
    for span in spans:
        span['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(span)
    return lines, dropped_spans
def merge_lines_to_block(lines):
    """Wrap every line in its own block.

    No real merging is done: each block holds exactly one line and reuses
    that line's bbox.

    Args:
        lines (list): line dicts with a ``'bbox'`` entry.

    Returns:
        list: ``{'bbox': ..., 'lines': [line]}`` dicts, one per input line.
    """
    return [{'bbox': line['bbox'], 'lines': [line]} for line in lines]
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order blocks layout by layout, top to bottom inside each layout.

    A block belongs to the first layout that covers more than 80% of its area.
    Footnote blocks (``block[7] == BlockType.Footnote``) are never assigned.

    Args:
        all_bboxes (list): block rows whose first four items are the bbox and
            whose item 7 is the block type; assigned rows are removed in place.
        layout_bboxes (list): dicts carrying a ``'layout_bbox'`` entry.

    Returns:
        list: the assigned blocks in reading order.
    """
    blocks_per_layout = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']
        layout_blocks = []
        for block in all_bboxes:
            if block[7] == BlockType.Footnote:
                continue
            if calculate_overlap_area_in_bbox1_area_ratio(
                    block[:4], layout_bbox) > 0.8:
                layout_blocks.append(block)
        if layout_blocks:
            blocks_per_layout.append(layout_blocks)
            # Each block is consumed by the first layout that claims it.
            for claimed in layout_blocks:
                all_bboxes.remove(claimed)

    sort_blocks = []
    for layout_blocks in blocks_per_layout:
        # Inside one layout, order blocks by y0 (top to bottom).
        layout_blocks.sort(key=lambda row: row[1])
        sort_blocks.extend(layout_blocks)
    return sort_blocks
def fill_spans_in_blocks(blocks, spans, radio):
    """Place spans into the blocks that contain them.

    Args:
        blocks (list): block rows; items 0-3 are the bbox, item 7 the block
            type and, for image/table sub-blocks, the last item the group id.
        spans (list): span dicts; spans placed in a block are removed in place.
        radio (float): minimum share of a span's area that must lie inside a
            block for the span to be assigned to it.

    Returns:
        tuple: ``(block_with_spans, spans)`` - one dict per block with its
        ``'spans'``, and the list of spans that matched no block.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]
        block_dict = {
            'type': block_type,
            'bbox': block_bbox,
        }
        if block_type in [
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
        ]:
            block_dict["group_id"] = block[-1]

        block_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(
                span['bbox'], block_bbox) > radio
        ]
        # NOTE: span bboxes are deliberately not de-overlapped here; the
        # original notes that doing so alters the bboxes and breaks later fills.
        block_dict['spans'] = block_spans
        block_with_spans.append(block_dict)

        # A span can only ever belong to one block.
        for placed in block_spans:
            spans.remove(placed)

    return block_with_spans, spans
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Convert span-carrying blocks into their final line/sub-block form.

    Image and table blocks are nested: their caption and footnote text spans
    are moved into dedicated sub-blocks. Every handler also removes the
    temporary ``'spans'`` key. Blocks of any other type are dropped.

    Args:
        block_with_spans (list): block dicts with ``'type'`` and ``'spans'``.
        img_blocks (list): image descriptors passed to ``fix_image_block``.
        table_blocks (list): table descriptors passed to ``fix_table_block``.

    Returns:
        list: the fixed blocks, in input order.
    """
    fix_blocks = []
    for block in block_with_spans:
        kind = block['type']
        if kind == BlockType.Image:
            fix_blocks.append(fix_image_block(block, img_blocks))
        elif kind == BlockType.Table:
            fix_blocks.append(fix_table_block(block, table_blocks))
        elif kind in [BlockType.Text, BlockType.Title]:
            fix_blocks.append(fix_text_block(block))
        elif kind == BlockType.InterlineEquation:
            fix_blocks.append(fix_interline_block(block))
        # Any other block type is skipped.
    return fix_blocks
def fix_block_spans_v2(block_with_spans):
    """Convert span-carrying blocks into line form (flat block variant).

    Text-like blocks (text, title, image/table captions and footnotes) go
    through ``fix_text_block``; interline equations and image/table bodies go
    through ``fix_interline_block``. Both remove the temporary ``'spans'`` key.
    Blocks of any other type are dropped.

    Args:
        block_with_spans (list): block dicts with ``'type'`` and ``'spans'``.

    Returns:
        list: the fixed blocks, in input order.
    """
    fix_blocks = []
    for block in block_with_spans:
        kind = block['type']
        if kind in [BlockType.Text, BlockType.Title,
                    BlockType.ImageCaption, BlockType.ImageFootnote,
                    BlockType.TableCaption, BlockType.TableFootnote
                    ]:
            fix_blocks.append(fix_text_block(block))
        elif kind in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
            fix_blocks.append(fix_interline_block(block))
        # Any other block type is skipped.
    return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Run every discarded block through ``fix_text_block``.

    Args:
        discarded_block_with_spans (list): block dicts with a ``'spans'`` entry.

    Returns:
        list: the blocks returned by ``fix_text_block``, in input order.
    """
    return [fix_text_block(block) for block in discarded_block_with_spans]
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a block from the spans that lie inside ``block_bbox``.

    Args:
        spans (list): candidate span dicts; not modified.
        block_bbox (list): ``[x0, y0, x1, y1]`` of the block to build.
        block_type (str): value stored under the block's ``'type'`` key.

    Returns:
        tuple: ``(block, block_spans)`` - the new block dict with its sorted
        ``'lines'``, and the spans that were placed in it (more than 60% of
        their area inside ``block_bbox``).
    """
    block_spans = [
        span for span in spans
        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6
    ]
    # Group the selected spans into lines, then order each line left to right.
    sort_block_lines = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block_spans))
    block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
    return block, block_spans
def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Create a body block that holds a single span on a single line.

    Args:
        span (dict): the span to wrap.
        block_bbox (list): bbox used for both the block and its only line.
        block_type (str): value stored under the block's ``'type'`` key.

    Returns:
        dict: ``{'bbox', 'type', 'lines'}`` with exactly one line.
    """
    return {
        'bbox': block_bbox,
        'type': block_type,
        'lines': [{'bbox': block_bbox, 'spans': [span]}],
    }
def fix_image_block(block, img_blocks):
    """Split an image block into body, caption and footnote sub-blocks.

    The first entry of ``img_blocks`` matching ``block['bbox']`` (area ratio
    0.95) supplies the body, caption and footnote bboxes. The temporary
    ``'spans'`` key is always removed.

    Args:
        block (dict): image block with ``'bbox'`` and ``'spans'``; modified in place.
        img_blocks (list): dicts with ``'bbox'``, ``'img_body_bbox'``,
            ``'img_caption_bbox'`` and ``'img_footnote_bbox'``.

    Returns:
        dict: the same ``block`` with a ``'blocks'`` list and no ``'spans'``.
    """
    block['blocks'] = []
    img_block = next(
        (candidate for candidate in img_blocks
         if _is_in_or_part_overlap_with_area_ratio(
             block['bbox'], candidate['bbox'], 0.95)),
        None,
    )
    if img_block is not None:
        # The body is the image span whose bbox equals the detected body bbox.
        body_span = next(
            (span for span in block['spans']
             if span['type'] == ContentType.Image
             and img_block['img_body_bbox'] == span['bbox']),
            None,
        )
        if body_span is not None:
            block['blocks'].append(make_body_block(
                body_span, img_block['img_body_bbox'], BlockType.ImageBody))
            # The body span must not be reused for caption/footnote.
            block['spans'].remove(body_span)

        if img_block['img_caption_bbox'] is not None:
            img_caption_block, _ = merge_spans_to_block(
                block['spans'], img_block['img_caption_bbox'],
                BlockType.ImageCaption)
            block['blocks'].append(img_caption_block)

        if img_block['img_footnote_bbox'] is not None:
            img_footnote_block, _ = merge_spans_to_block(
                block['spans'], img_block['img_footnote_bbox'],
                BlockType.ImageFootnote)
            block['blocks'].append(img_footnote_block)

    del block['spans']
    return block
def fix_table_block(block, table_blocks):
    """Split a table block into body, caption and footnote sub-blocks.

    The first entry of ``table_blocks`` matching ``block['bbox']`` (area ratio
    0.95) supplies the body, caption and footnote bboxes. Spans used for the
    caption are removed before the footnote is built. The temporary
    ``'spans'`` key is always removed.

    Args:
        block (dict): table block with ``'bbox'`` and ``'spans'``; modified in place.
        table_blocks (list): dicts with ``'bbox'``, ``'table_body_bbox'``,
            ``'table_caption_bbox'`` and ``'table_footnote_bbox'``.

    Returns:
        dict: the same ``block`` with a ``'blocks'`` list and no ``'spans'``.
    """
    block['blocks'] = []
    table_block = next(
        (candidate for candidate in table_blocks
         if _is_in_or_part_overlap_with_area_ratio(
             block['bbox'], candidate['bbox'], 0.95)),
        None,
    )
    if table_block is not None:
        # The body is the table span whose bbox equals the detected body bbox.
        body_span = next(
            (span for span in block['spans']
             if span['type'] == ContentType.Table
             and table_block['table_body_bbox'] == span['bbox']),
            None,
        )
        if body_span is not None:
            block['blocks'].append(make_body_block(
                body_span, table_block['table_body_bbox'],
                BlockType.TableBody))
            # The body span must not be reused for caption/footnote.
            block['spans'].remove(body_span)

        if table_block['table_caption_bbox'] is not None:
            table_caption_block, table_caption_spans = merge_spans_to_block(
                block['spans'], table_block['table_caption_bbox'],
                BlockType.TableCaption)
            block['blocks'].append(table_caption_block)
            # Caption spans are consumed so the footnote cannot pick them up.
            for used in table_caption_spans:
                block['spans'].remove(used)

        if table_block['table_footnote_bbox'] is not None:
            table_footnote_block, _ = merge_spans_to_block(
                block['spans'], table_block['table_footnote_bbox'],
                BlockType.TableFootnote)
            block['blocks'].append(table_footnote_block)

    del block['spans']
    return block
def fix_text_block(block):
    """Replace a text block's ``'spans'`` with sorted ``'lines'``.

    Any interline-equation span inside a text block is retyped as an inline
    equation before the lines are built.

    Args:
        block (dict): block with a ``'spans'`` list; modified in place.

    Returns:
        dict: the same ``block`` with ``'lines'`` set and ``'spans'`` removed.
    """
    spans = block['spans']
    for span in spans:
        if span['type'] == ContentType.InterlineEquation:
            span['type'] = ContentType.InlineEquation
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(spans))
    del block['spans']
    return block
def fix_interline_block(block):
    """Replace a block's ``'spans'`` with sorted ``'lines'`` without retyping spans.

    Args:
        block (dict): block with a ``'spans'`` list; modified in place.

    Returns:
        dict: the same ``block`` with ``'lines'`` set and ``'spans'`` removed.
    """
    grouped = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped)
    del block['spans']
    return block
from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold, calculate_iou
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_low_confidence_spans(spans):
    """Drop the lower-scored span of every heavily overlapping pair.

    Two spans count as overlapping when their IoU exceeds 0.9. The span with
    the lower ``'score'`` is dropped (the second one on a tie). A span that is
    already dropped no longer takes part in comparisons.

    Args:
        spans (list): span dicts with ``'bbox'`` and ``'score'``; modified in place.

    Returns:
        tuple: ``(spans, dropped_spans)``; dropped spans are tagged
        ``DropTag.SPAN_OVERLAP``.
    """
    dropped_spans = []
    for span1 in spans:
        for span2 in spans:
            if span1 == span2:
                continue
            if span1 in dropped_spans or span2 in dropped_spans:
                continue
            if not calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                continue
            loser = span1 if span1['score'] < span2['score'] else span2
            if loser is not None and loser not in dropped_spans:
                dropped_spans.append(loser)

    # Removal happens after the scan so the list is not mutated while iterating.
    for loser in dropped_spans:
        spans.remove(loser)
        loser['tag'] = DropTag.SPAN_OVERLAP
    return spans, dropped_spans
def remove_overlaps_min_spans(spans):
    """Drop the smaller span of every overlapping pair.

    ``get_minbox_if_overlap_by_ratio`` (ratio 0.65) names the bbox to drop for
    a pair; the first span carrying exactly that bbox is the one removed.

    Args:
        spans (list): span dicts with a ``'bbox'``; modified in place.

    Returns:
        tuple: ``(spans, dropped_spans)``; dropped spans are tagged
        ``DropTag.SPAN_OVERLAP``.
    """
    dropped_spans = []
    for span1 in spans:
        for span2 in spans:
            if span1 == span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
            if overlap_box is None:
                continue
            victim = next(
                (candidate for candidate in spans if candidate['bbox'] == overlap_box),
                None)
            if victim is not None and victim not in dropped_spans:
                dropped_spans.append(victim)

    # Removal happens after the scan so the list is not mutated while iterating.
    for victim in dropped_spans:
        spans.remove(victim)
        victim['tag'] = DropTag.SPAN_OVERLAP
    return spans, dropped_spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove every span that lies mostly inside one of the given bboxes.

    Args:
        spans (list): span dicts with a ``'bbox'``; modified in place.
        need_remove_spans_bboxes (list): bboxes marking regions to clear; a
            span is removed when more than 50% of its area is inside one.

    Returns:
        list: the same ``spans`` list after removal.
    """
    need_remove_spans = []
    for span in spans:
        covered = any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
        if covered and span not in need_remove_spans:
            need_remove_spans.append(span)

    for span in need_remove_spans:
        spans.remove(span)
    return spans
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    """Remove spans region by region, tagging each with its drop reason.

    A span is removed when more than 50% of its area lies inside one of the
    bboxes of a tag. For ``DropTag.FOOTNOTE`` a span is also removed when its
    centre lies below a footnote bbox and horizontally inside it.

    Args:
        spans (list): span dicts with a ``'bbox'``; modified in place.
        need_remove_spans_bboxes_dict (dict): drop tag -> list of bboxes.

    Returns:
        tuple: ``(spans, dropped_spans)``; every dropped span has its
        ``'tag'`` set to the drop tag that removed it.
    """

    def _should_drop(span, removed_bbox, drop_tag):
        # Either the span sits inside the region, or (footnotes only) its
        # centre is under the region and within the region's x range.
        bbox = span['bbox']
        if calculate_overlap_area_in_bbox1_area_ratio(bbox, removed_bbox) > 0.5:
            return True
        return (
            drop_tag == DropTag.FOOTNOTE
            and (bbox[1] + bbox[3]) / 2 > removed_bbox[3]
            and removed_bbox[0] < (bbox[0] + bbox[2]) / 2 < removed_bbox[2]
        )

    dropped_spans = []
    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
        need_remove_spans = [
            span for span in spans
            if any(_should_drop(span, removed_bbox, drop_tag)
                   for removed_bbox in removed_bboxes)
        ]
        for span in need_remove_spans:
            spans.remove(span)
            span['tag'] = drop_tag
            dropped_spans.append(span)
    return spans, dropped_spans
def adjust_bbox_for_standalone_block(spans):
    """Lower the top edge of standalone spans to the text on their left.

    For every interline-equation, image or table span: whenever a text or
    inline-equation span lies vertically inside it and starts further left,
    the standalone span's y0 is set to that text span's y0.

    Args:
        spans (list): span dicts with ``'type'`` and a mutable ``'bbox'``.

    Returns:
        list: the same ``spans`` list, with bboxes adjusted in place.
    """
    for sb_span in spans:
        if sb_span['type'] not in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        for text_span in spans:
            if text_span['type'] not in [ContentType.Text, ContentType.InlineEquation]:
                continue
            box, text_box = sb_span['bbox'], text_span['bbox']
            # The text span must be vertically covered by the standalone span...
            if not (box[1] < text_box[1] and box[3] > text_box[3]):
                continue
            # ...and start to its left.
            if text_box[0] < box[0]:
                box[1] = text_box[1]
    return spans
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into lines and give every text line a uniform y range.

    Results are delivered through the two output lists; nothing is returned.

    Args:
        spans (list): span dicts with ``'type'`` and a mutable ``'bbox'``;
            sorted in place by y0. An empty list is a no-op.
        displayed_list (list): output - receives every interline-equation,
            image and table span, in scan order.
        text_inline_lines (list): output - receives ``(line, (y0, y1))``
            tuples for text lines. Afterwards each such line is sorted by x0
            and all of its spans get the line's ``y0``/``y1``.
    """
    if len(spans) == 0:
        return

    def _is_displayed(candidate):
        # Standalone content that always sits on a line of its own.
        return candidate["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]

    def _record_if_text_line(line, y_top, y_bottom):
        # Single standalone spans are not text lines and are not recorded.
        if len(line) > 1 or line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
            text_inline_lines.append((line, (y_top, y_bottom)))

    spans.sort(key=lambda span: span['bbox'][1])

    first = spans[0]
    current_line = [first]
    if _is_displayed(first):
        displayed_list.append(first)
    line_first_y0 = first["bbox"][1]
    line_first_y = first["bbox"][3]

    for span in spans[1:]:
        if _is_displayed(span) or any(_is_displayed(member) for member in current_line):
            if _is_displayed(span):
                displayed_list.append(span)
            # Standalone content forces a line break on both sides.
            _record_if_text_line(current_line, line_first_y0, line_first_y)
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            # A plain text span redefines the y range of the running line.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last running line (never empty at this point).
    _record_if_text_line(current_line, line_first_y0, line_first_y)

    # Order the spans of every text line from left to right.
    for line_spans, _ in text_inline_lines:
        line_spans.sort(key=lambda span: span['bbox'][0])

    # Give all spans of a text line the same vertical extent.
    for line_spans, (y_top, y_bottom) in text_inline_lines:
        for span in line_spans:
            span["bbox"][1] = y_top
            span["bbox"][3] = y_bottom
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Retype misdetected interline equations as inline equations.

    Walks ``displayed_list`` and ``text_inline_lines`` in parallel; the text
    line cursor only ever moves forward. An interline equation that
    overlaps a text line is converted when that line is the last one, or when
    it does not also overlap the next line and is less than three times as
    tall as the text line. Converted spans take the text line's y range.

    Args:
        spans (list): returned unchanged as a list; not read here.
        displayed_list (list): standalone spans with ``'type'`` and ``'bbox'``.
        text_inline_lines (list): ``(line, (y0, y1))`` tuples.

    Returns:
        list: the ``spans`` argument.
    """
    line_count = len(text_inline_lines)
    j = 0
    for span in displayed_list:
        span_y0, span_y = span["bbox"][1], span["bbox"][3]

        while j < line_count:
            y0, y1 = text_inline_lines[j][1]
            touches_line = (
                span_y0 < y0 < span_y
                or span_y0 < y1 < span_y
                or (span_y0 < y0 and span_y > y1)
            )
            if touches_line and __is_overlaps_y_exceeds_threshold(
                    span['bbox'], (0, y0, 0, y1)):
                if span["type"] == ContentType.InterlineEquation:
                    if j + 1 >= line_count:
                        # The matching text line is the last one.
                        convert = True
                    else:
                        # Do not convert when the equation also spans the next
                        # line or is at least three times taller than the text.
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        convert = (
                            not __is_overlaps_y_exceeds_threshold(
                                span['bbox'], (0, y0_next, 0, y1_next))
                            and 3 * (y1 - y0) > span_y - span_y0
                        )
                    if convert:
                        span["type"] = ContentType.InlineEquation
                        span["bbox"][1] = y0
                        span["bbox"][3] = y1
                break
            elif span_y < y0 or (
                    span_y0 < y0 < span_y
                    and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))):
                # The text line is already below this span: stop scanning.
                break
            else:
                j += 1
    return spans
def get_qa_need_list(blocks):
    """Collect image, table and equation spans from line-structured blocks.

    Args:
        blocks (list): block dicts with ``'lines'``, each line holding ``'spans'``.

    Returns:
        tuple: ``(images, tables, interline_equations, inline_equations)`` -
        four lists of span dicts, in document order.
    """
    images = []
    tables = []
    interline_equations = []
    inline_equations = []
    all_spans = (
        span
        for block in blocks
        for line in block["lines"]
        for span in line["spans"]
    )
    for span in all_spans:
        span_type = span["type"]
        if span_type == ContentType.Image:
            images.append(span)
        elif span_type == ContentType.Table:
            tables.append(span)
        elif span_type == ContentType.InlineEquation:
            inline_equations.append(span)
        elif span_type == ContentType.InterlineEquation:
            interline_equations.append(span)
    return images, tables, interline_equations, inline_equations
def get_qa_need_list_v2(blocks):
    """Collect image, table and interline-equation blocks.

    Args:
        blocks (list): block dicts with a ``'type'`` entry.

    Returns:
        tuple: ``(images, tables, interline_equations)`` - three lists of
        block dicts, in input order.
    """
    images, tables, interline_equations = [], [], []
    for block in blocks:
        block_type = block["type"]
        if block_type == BlockType.Image:
            images.append(block)
        elif block_type == BlockType.Table:
            tables.append(block)
        elif block_type == BlockType.InterlineEquation:
            interline_equations.append(block)
    return images, tables, interline_equations
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def __area(box):
    """Return the area of an ``[x0, y0, x1, y1]`` box."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
    """Tell whether the page has text sitting on a large coloured background box.

    The largest filled, non-white drawing rectangle that does not touch an
    image is looked up; the page qualifies when that rectangle is wider than
    20% of the page, taller than 10% of the page, and fully contains at least
    one text block (compared on integer-truncated coordinates).

    Args:
        page (fitz.Page): page providing ``rect`` and ``get_cdrawings()``.
        text_blocks (list): dicts with a ``'bbox'`` entry.
        image_bboxes (list): image bboxes; rectangles overlapping any of them
            are ignored so colour patches inside SVG images do not count.

    Returns:
        bool: True when such a coloured text box exists.
    """
    p_width, p_height = page.rect.width, page.rect.height

    color_bg_rect = []
    for block in page.get_cdrawings():
        # Drawings without a fill are transparent and cannot be a background.
        if 'fill' not in block or not block['fill']:
            continue
        # Channels are truncated to int, so only fully saturated channels
        # become 1. The comparison must be tuple-to-tuple: the previous
        # list-to-tuple comparison was always False, so white rectangles were
        # never skipped.
        fill = tuple(int(channel) for channel in block['fill'][:3])
        if fill == (1, 1, 1):
            continue
        rect = block['rect']
        # Ignore tiny rectangles.
        if __area(rect) < 10 * 10:
            continue
        # Ignore colour patches that belong to an image (e.g. SVG artwork).
        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
            continue
        color_bg_rect.append(rect)

    if len(color_bg_rect) == 0:
        return False

    max_rect = max(color_bg_rect, key=lambda candidate: __area(candidate))
    # TODO (carried over from the original): also require more than three
    # lines or fifty characters of text inside the rectangle.
    is_large_enough = (
        max_rect[2] - max_rect[0] > 0.2 * p_width
        and max_rect[3] - max_rect[1] > 0.1 * p_height
    )
    if not is_large_enough:
        return False

    max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
    for text_block in text_blocks:
        box = text_block['bbox']
        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        if _is_in(box_int, max_rect_int):
            return True
    return False
def __is_table_overlap_text_block(text_blocks, table_bbox):
    """Tell whether ``table_bbox`` contains or overlaps any text block.

    Args:
        text_blocks (list): dicts with a ``'bbox'`` entry.
        table_bbox (list): bbox of the table to test.

    Returns:
        bool: True as soon as one text block overlaps the table bbox.
    """
    for text_block in text_blocks:
        if _is_in_or_part_overlap(table_bbox, text_block['bbox']):
            return True
    return False
def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
    """Decide whether a page is acceptable for further processing.

    A page is rejected when it contains text on a coloured background box.
    ``table_bboxes`` is accepted but not used.

    Returns:
        tuple: ``(True, None)`` when the page is acceptable, otherwise
        ``(False, info)`` where ``info`` carries ``_need_drop`` and
        ``_drop_reason``.
    """
    has_colored_text_box = __is_contain_color_background_rect(page, text_blocks, image_bboxes)
    if not has_colored_text_box:
        return True, None
    return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
\ No newline at end of file
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Pull two partially overlapping boxes apart along the thinner overlap axis.

    The shared edge is moved to the (floor-divided) midpoint of the overlap,
    leaving a 0.5 wide gap between the boxes.

    Args:
        bbox1: first ``(x0, y0, x1, y1)`` box.
        bbox2: second ``(x0, y0, x1, y1)`` box.

    Returns:
        tuple: ``(bbox1, bbox2, drop_reason)``. When the boxes do not partially
        overlap they are returned untouched with ``None``. When separating
        them would leave a box without positive width or height, the original
        boxes are returned with ``DropReason.NEGATIVE_BBOX_AREA``.
    """
    if not _is_part_overlap(bbox1, bbox2):
        return bbox1, bbox2, None

    ix0, iy0, ix1, iy1 = bbox1
    x0, y0, x1, y1 = bbox2

    diff_x = min(x1, ix1) - max(x0, ix0)
    diff_y = min(y1, iy1) - max(y0, iy0)

    if diff_y > diff_x:
        # The overlap is taller than wide: separate horizontally.
        if x1 >= ix1:
            mid = (x0 + ix1) // 2
            ix1 = min(mid - 0.25, ix1)
            x0 = max(mid + 0.25, x0)
        else:
            mid = (ix0 + x1) // 2
            ix0 = max(mid + 0.25, ix0)
            x1 = min(mid - 0.25, x1)
    elif y1 >= iy1:
        # Otherwise separate vertically; bbox2 is the lower box here.
        mid = (y0 + iy1) // 2
        y0 = max(mid + 0.25, y0)
        iy1 = min(iy1, mid-0.25)
    else:
        mid = (iy0 + y1) // 2
        y1 = min(y1, mid-0.25)
        iy0 = max(mid + 0.25, iy0)

    both_valid = ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0
    if not both_valid:
        return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
    return [ix0, iy0, ix1, iy1], [x0, y0, x1, y1], None
def _remove_overlap_between_bboxes(arr):
    """Resolve containment and partial overlaps in a list of scored boxes.

    Boxes contained in another box are discarded first. Each remaining box is
    then separated from every box accepted so far; when a pair cannot be
    separated, the lower-scored box of the pair is discarded and the reason
    is recorded.

    Args:
        arr (list): dicts with ``'bbox'`` and ``'score'``; bboxes are
            rewritten in place when boxes are pulled apart.

    Returns:
        tuple: ``(res, drop_reasons)`` - ``res`` has one slot per input entry
        (the entry itself, or ``None`` when it was discarded) and
        ``drop_reasons`` lists the reason of every failed separation.
    """
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N

    # Pass 1: drop every box that is contained in another one.
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
                keeps[i] = False

    # Pass 2: separate each surviving box from the boxes already accepted.
    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue
            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
            if drop_reason is None:
                v["bbox"] = bbox1
                res[i]["bbox"] = bbox2
            else:
                if v["score"] > res[i]["score"]:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                # Record the reason itself; the list used to be appended to
                # itself here, producing a self-referential list.
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons
def remove_overlap_between_bbox_for_span(spans):
    """De-overlap span bboxes and drop the spans that could not be kept.

    Spans without a ``'score'`` take part with a score of 0.1.

    Args:
        spans (list): span dicts with a ``'bbox'``; surviving spans get their
            bbox rewritten in place.

    Returns:
        tuple: ``(kept_spans, drop_reasons)``.
    """
    candidates = [{"bbox": span["bbox"], "score": span.get("score", 0.1)} for span in spans ]
    res, drop_reasons = _remove_overlap_between_bboxes(candidates)
    ret = []
    for span, resolved in zip(spans, res):
        if resolved is None:
            continue
        span["bbox"] = resolved["bbox"]
        ret.append(span)
    return ret, drop_reasons
def remove_overlap_between_bbox_for_block(all_bboxes):
    """De-overlap block bboxes and drop the blocks that could not be kept.

    Args:
        all_bboxes (list): block rows whose first four items are the bbox and
            whose last item is used as the score; the bbox of surviving rows
            is rewritten in place.

    Returns:
        tuple: ``(kept_rows, drop_reasons)``.
    """
    candidates = [{"bbox": bbox[:4], "score": bbox[-1]} for bbox in all_bboxes ]
    res, drop_reasons = _remove_overlap_between_bboxes(candidates)
    ret = []
    for row, resolved in zip(all_bboxes, res):
        if resolved is None:
            continue
        row[:4] = resolved["bbox"]
        ret.append(row)
    return ret, drop_reasons
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
    """Return the area of an ``[x0, y0, x1, y1]`` box."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
def rectangle_position_determination(rect, p_width):
    """Tell whether a rectangle lies on or near the vertical centre line of the page.

    Args:
        rect (list): rectangle as ``[x1, y1, x2, y2]``.
        p_width (int): page width.

    Returns:
        bool: True when the rectangle straddles the centre line, or when its
        nearer side is less than 20% of the page width away from it.
    """
    x_axis = p_width / 2

    # A rectangle that crosses the centre line always qualifies.
    if rect[0] < x_axis and rect[2] > x_axis:
        return True

    # Otherwise measure from the side of the rectangle facing the centre line.
    if rect[0] > x_axis:
        distance = rect[0] - x_axis
    else:
        distance = x_axis - rect[2]
    return distance < p_width * 0.2
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Remove text blocks that sit on a coloured header strip.

    A strip is a filled, non-white drawing rectangle with an area above 100,
    on or near the page centre line, ending within the top 30% of the page
    and more than four times as wide as it is tall. A text block is removed
    when it lies inside a strip, or overlaps it with an overlap-to-smaller-box
    ratio above 0.6.

    Args:
        remain_text_blocks (list): text block dicts with a ``'bbox'``;
            modified in place.
        page (Page): page providing ``rect`` and ``get_cdrawings()``; only
            accessed when there are text blocks.

    Returns:
        tuple: ``(remain_text_blocks, colored_strip_textblocks)``; removed
        blocks are tagged ``COLOR_BG_HEADER_TXT_BLOCK``.
    """
    colored_strip_textblocks = []
    if len(remain_text_blocks) == 0:
        return remain_text_blocks, colored_strip_textblocks

    p_width, p_height = page.rect.width, page.rect.height

    # Step 1: collect every drawing rectangle that looks like a header strip.
    colored_strip_bg_rect = []
    for block in page.get_cdrawings():
        is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)
        rect = block['rect']
        checks = (
            is_filled,
            __area(rect) > 100,
            rectangle_position_determination(rect, p_width),
            rect[3] < p_height * 0.3,
            (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4,
        )
        if all(checks):
            colored_strip_bg_rect.append(rect)

    # Step 2: tag the text blocks that sit on one of those strips.
    for colored_strip_block_bbox in colored_strip_bg_rect:
        for text_block in remain_text_blocks:
            text_bbox = text_block['bbox']
            on_strip = _is_in(text_bbox, colored_strip_block_bbox) or (
                _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6
            )
            if on_strip:
                logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)

    # Step 3: take the tagged blocks out of the remaining list.
    for tagged in colored_strip_textblocks:
        if tagged in remain_text_blocks:
            remain_text_blocks.remove(tagged)

    return remain_text_blocks, colored_strip_textblocks
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Remove header, footer and page-number content from one page.

    Removal works at span level: spans in the header or footer band are
    removed, lines left without spans are removed, and blocks left without
    lines are tagged ``CONTENT_IN_FOOT_OR_HEADER`` and moved to the removal
    list. Page numbers are tagged ``PAGE_NO``; a block consisting of a single
    page-number span is removed as well. Without detected page-number boxes,
    the bottom-most block is treated as a page number when it is a single
    span containing a digit and no ASCII letters.

    Args:
        text_raw_blocks (list): block dicts with ``'bbox'`` and ``'lines'``
            (lines hold ``'spans'``); modified in place and possibly re-sorted.
        image_bboxes (list): image bboxes.
        table_bboxes (list): table bboxes.
        header_bboxs (list): detected header bboxes (may be empty).
        footer_bboxs (list): detected footer bboxes (may be empty).
        page_no_bboxs (list): detected page-number bboxes (may be empty).
        page_w: page width.
        page_h: page height.

    Returns:
        tuple: ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
    """
    # The header band ends at the lowest header edge, the footer band starts
    # at the highest footer edge.
    header_y0 = max(bbox[3] for bbox in header_bboxs) if header_bboxs else 0
    footer_y0 = min(bbox[1] for bbox in footer_bboxs) if footer_bboxs else page_h

    if page_no_bboxs:
        # Page numbers in the top / bottom half extend the respective band.
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h
        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    text_block_to_remove = []
    for blk in text_raw_blocks:
        # The removal lists are built per line / per block. They used to be
        # reset on every loop iteration, so only the last span of a line and
        # the last line of a block could ever be removed.
        line_del = []
        for line in blk['lines']:
            span_del = [
                span for span in line['spans']
                if span['bbox'][3] < header_y0
                or _is_in_or_part_overlap(span['bbox'], header)
                or _is_in_or_part_overlap(span['bbox'], footer)
            ]
            for span in span_del:
                line['spans'].remove(span)
            if not line['spans']:
                line_del.append(line)
        for line in line_del:
            blk['lines'].remove(line)
        # Checked after the removal so that blocks emptied above are dropped
        # too, not only blocks that arrived without lines.
        if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # Page numbers are small and may slightly overlap the content area, so
    # they are additionally handled at span granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = PAGE_NO
                            # A block made of just this span goes away entirely.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    elif len(text_raw_blocks) > 0:
        # Heuristic: the bottom-most block is a page number when it is one
        # line with one span whose text has a digit and no ASCII letters.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                text = last_span['text']
                if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
                    last_span['tag'] = PAGE_NO
                    page_no_block_2_remove.append(last_block)

    text_block_to_remove.extend(page_no_block_2_remove)

    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)
    text_block_remain = text_raw_blocks

    # Images and tables stay only when they touch the content area.
    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
import math
from magic_pdf.libs.boxbase import is_vbox_on_side
from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
def detect_non_horizontal_texts(result_dict):
    """
    Flag watermarks and vertical margin notes in the document.

    Blocks are grouped by identical ``(bbox, text)``. A group whose text
    direction is diagonal (between 5 and 85 degrees) and which occurs more
    often than half the number of entries in ``result_dict`` is a watermark.
    A group whose direction is vertical (between 85 and 105 degrees) and which
    occurs that often is a vertical margin note. Headers and footers usually
    change from page to page and therefore do not reach the threshold.

    Only keys starting with ``"page_"`` are treated as pages and only keys
    starting with ``"block_"`` as blocks; other entries are left untouched.

    Parameters
    ----------
    result_dict : dict
        The result dictionary. Block dicts need ``"bbox"`` and ``"text"``;
        ``"dir"`` is optional and blocks without it are never counted.

    Returns
    -------
    result_dict : dict
        The same dictionary; every block gains ``"is_watermark"`` and
        ``"is_vertical_margin_note"`` (0 or 1).
    """

    def _block_key(block_data):
        # bbox may be a list (e.g. after JSON loading), which is unhashable,
        # so it is normalised to a tuple before being used as a dict key.
        return (tuple(block_data["bbox"]), block_data["text"])

    potential_watermarks = {}
    potential_margin_notes = {}

    for page_id, page_content in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in page_content.items():
            if not block_id.startswith("block_") or "dir" not in block_data:
                continue
            coordinates_text = _block_key(block_data)
            angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))
            if angle > 5 and angle < 85:  # diagonal text: watermark candidate
                potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
            if angle > 85 and angle < 105:  # vertical text: margin note candidate
                potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1

    # A candidate is confirmed when it occurs more often than half the number
    # of entries in result_dict.
    watermark_threshold = len(result_dict) // 2
    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
    margin_note_threshold = len(result_dict) // 2
    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}

    for page_id, blocks in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in blocks.items():
            # Same filter as the counting pass: non-block page entries used to
            # be indexed like blocks here and could raise.
            if not block_id.startswith("block_"):
                continue
            coordinates_text = _block_key(block_data)
            block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
            block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0

    return result_dict
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import re
def __is_a_word(sentence):
    """Tell whether ``sentence`` counts as a single short "word".

    Returns True when the whole string is exactly one character in the
    range U+4E00..U+9FA5 (Chinese ideographs), or when it consists solely
    of ASCII letters/digits and is at most two characters long. Anything
    else -- including the empty string, punctuation and whitespace --
    yields False.
    """
    # Exactly one Chinese character.
    single_cjk_char = re.fullmatch(r'[\u4e00-\u9fa5]', sentence)
    if single_cjk_char is not None:
        return True
    # A run of ASCII letters/digits no longer than two characters.
    ascii_alnum = re.fullmatch(r'[a-zA-Z0-9]+', sentence)
    return ascii_alnum is not None and len(sentence) <= 2
def __get_text_color(num):
    """Split a packed integer colour into its (red, green, blue) components.

    Blue is read from the lowest 8 bits, green from the next 8 bits and
    red from bits 16-23; any higher bits are ignored.
    """
    byte_mask = 0xFF
    return (num >> 16) & byte_mask, (num >> 8) & byte_mask, num & byte_mask
def __is_empty_side_box(text_block):
    """Tell whether a text block carries no visible content.

    A block is considered empty when none of its spans is both non-blank
    (after stripping whitespace) and drawn in a colour other than pure
    white (255, 255, 255). A block without lines or spans is empty too.
    """
    white = (255, 255, 255)
    every_span = (piece for row in text_block['lines'] for piece in row['spans'])
    for piece in every_span:
        red, green, blue = __get_text_color(piece['color'])
        has_text = len(piece['text'].strip()) > 0
        # One non-blank, non-white span is enough to make the block visible.
        if has_text and (red, green, blue) != white:
            return False
    return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical and rotated text blocks found at the sides of the page.

    Only blocks for which ``is_vbox_on_side(bbox, page_width, page_height, 0.2)``
    is truthy are candidates (presumably blocks lying in the page's side
    margins -- confirm against ``is_vbox_on_side``). A candidate is removed
    when either:

    * it has more than one line, every line has exactly one span whose text
      satisfies ``__is_a_word``, and all those spans share the same integer
      x0 -- the block is tagged ``VERTICAL_TEXT``; or
    * at least one of its lines has a ``dir`` other than ``(1, 0)`` -- the
      block is tagged ``ROTATE_TEXT``.

    Args:
        pymu_text_block: list of text-block dicts; each needs ``'bbox'`` and
            ``'lines'``, every line needs ``'spans'`` and ``'dir'``. The list
            is modified in place.
        page_width: page width, forwarded to ``is_vbox_on_side``.
        page_height: page height, forwarded to ``is_vbox_on_side``.

    Returns:
        ``(pymu_text_block, removed_text_block)``: the same input list with
        the removed blocks filtered out, and the list of removed blocks, each
        carrying its ``'tag'``.
    """
    # ``dir`` is a tuple when it comes straight from the extractor but a list
    # after a JSON round-trip; accept both spellings of "horizontal".
    horizontal_dirs = ((1, 0), [1, 0])

    removed_text_block = []
    for block in pymu_text_block:
        lines = block['lines']
        # Only blocks at the sides of the page are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue

        # Vertical text: at least two lines, one short "word" per line,
        # all starting at the same (integer) x0, i.e. stacked in a column.
        one_span_per_line = len(lines) > 1 and all(
            len(line['spans']) == 1 for line in lines
        )
        if one_span_per_line and all(
            __is_a_word(line['spans'][0]["text"]) for line in lines
        ):
            x0_columns = {int(line['spans'][0]['bbox'][0]) for line in lines}
            if len(x0_columns) == 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        # A single non-horizontal line is enough to drop the whole block.
        if any(line['dir'] not in horizontal_dirs for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)

    if removed_text_block:
        # Filter by identity in one pass instead of repeated list.remove().
        removed_ids = {id(block) for block in removed_text_block}
        pymu_text_block[:] = [
            block for block in pymu_text_block if id(block) not in removed_ids
        ]
    return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left/right body-text boundaries from the given side boxes.

    Each item of ``rotate_bbox`` must provide a ``'bbox'`` indexable as
    ``(x0, y0, x1, y1)``. A box whose x1 lies left of the page's vertical
    midline pushes the left boundary out to its x1; any other box pulls
    the right boundary in to its x0. ``page_height`` is accepted but unused.

    Returns:
        ``(left_x + 1, right_x - 1)``, i.e. the boundaries moved one unit
        inwards; ``(1, page_width - 1)`` when ``rotate_bbox`` is empty.
    """
    midline = page_width / 2
    left_limit, right_limit = 0, page_width
    for entry in rotate_bbox:
        bbox = entry['bbox']
        box_right = bbox[2]
        if not box_right < midline:
            # Box belongs to the right half: its left edge caps the body text.
            right_limit = min(right_limit, bbox[0])
            continue
        # Box lies entirely in the left half: its right edge caps the body text.
        left_limit = max(left_limit, box_right)
    return left_limit + 1, right_limit - 1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank text blocks found at the sides of the page.

    Only blocks for which ``is_vbox_on_side(bbox, page_width, page_height, 0.2)``
    is truthy are candidates (presumably blocks lying in the page's side
    margins -- confirm against ``is_vbox_on_side``). A candidate for which
    ``__is_empty_side_box`` returns True is tagged ``EMPTY_SIDE_BLOCK`` and
    removed.

    Args:
        pymu_text_block: list of text-block dicts; each needs ``'bbox'`` and
            ``'lines'``. The list is modified in place.
        page_width: page width, forwarded to ``is_vbox_on_side``.
        page_height: page height, forwarded to ``is_vbox_on_side``.

    Returns:
        ``(pymu_text_block, removed_text_block)``: the same input list with
        the blank side blocks filtered out, and the list of removed blocks,
        each carrying its ``'tag'``.
    """
    removed_text_block = []
    for block in pymu_text_block:
        # Only blocks at the sides of the page are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue
        if __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)

    if removed_text_block:
        # Filter by identity in one pass instead of repeated list.remove().
        removed_ids = {id(block) for block in removed_text_block}
        pymu_text_block[:] = [
            block for block in pymu_text_block if id(block) not in removed_ids
        ]
    return pymu_text_block, removed_text_block
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment