Commit c2e5c36f authored by 赵小蒙

Initial commit
"""
去掉正文的引文引用marker
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import re
from loguru import logger
from libs.nlp_utils import NLPModels
__NLP_MODEL = NLPModels()
def check_1(spans, cur_span_i):
"""寻找前一个char,如果是句号,逗号,那么就是角标"""
if cur_span_i==0:
return False # not a citation marker
pre_span = spans[cur_span_i-1]
pre_char = pre_span['chars'][-1]['c']
if pre_char in ['。', ',', '.', ',']:
return True
return False
def check_2(spans, cur_span_i):
"""检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
if cur_span_i==0 and len(spans)>1:
next_span = spans[cur_span_i+1]
next_txt = "".join([c['c'] for c in next_span['chars']])
result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
if result in ["PERSON", "GPE", "ORG"]:
return True
if re.findall(pattern, next_txt):
return True
return False # not a citation marker
elif cur_span_i==0 and len(spans)==1: # the marker would occupy the whole line? delete with caution
return False
# if this span is the last span
if cur_span_i==len(spans)-1:
pre_span = spans[cur_span_i-1]
pre_txt = "".join([c['c'] for c in pre_span['chars']])
pre_word = pre_txt.split(' ')[-1]
result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
if result in ["PERSON", "GPE", "ORG"]:
return True
if re.findall(pattern, pre_txt):
return True
return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
else: # neither the first nor the last span: attach the marker to whichever neighboring word is closer
pre_span = spans[cur_span_i-1]
next_span = spans[cur_span_i+1]
cur_span = spans[cur_span_i]
# find the nearest word in the previous and next spans
pre_distance = 10000 # a very large number
next_distance = 10000 # a very large number
for c in pre_span['chars'][::-1]:
if c['c'].isalpha():
pre_distance = cur_span['bbox'][0] - c['bbox'][2]
break
for c in next_span['chars']:
if c['c'].isalpha():
next_distance = c['bbox'][0] - cur_span['bbox'][2]
break
if pre_distance<next_distance:
belong_to_span = pre_span
else:
belong_to_span = next_span
txt = "".join([c['c'] for c in belong_to_span['chars']])
pre_word = txt.split(' ')[-1]
result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
if result in ["PERSON", "GPE", "ORG"]:
return True
if re.findall(pattern, txt):
return True
return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
def check_3(spans, cur_span_i):
"""上标里有[], 有*, 有-, 有逗号"""
# 如[2-3],[22]
# 如 2,3,4
cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip()
bad_char = ['[', ']', '*', ',']
if any([c in cur_span_txt for c in bad_char]) and any(character.isdigit() for character in cur_span_txt):
return True
# e.g. 2-3, a-b
patterns = [r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]']
for pattern in patterns:
match = re.match(pattern, cur_span_txt)
if match is not None:
return True
return False
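# Illustrative sketch (not part of the original code): a few superscript strings and
# whether check_3-style matching would flag them as citation markers. The _demo_spans
# helper below is hypothetical and only builds the minimal span/char structure check_3 expects.
def _demo_check_3():
    def _demo_spans(txt):
        return [{'chars': [{'c': ch} for ch in txt]}]
    samples = ["[2-3]", "2,3,4", "2-3", "a-b", "hello"]
    return {s: check_3(_demo_spans(s), 0) for s in samples}
    # expected roughly: {'[2-3]': True, '2,3,4': True, '2-3': True, 'a-b': True, 'hello': False}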
def remove_citation_marker(with_char_text_blcoks):
for blk in with_char_text_blcoks:
for line in blk['lines']:
# if the line has fewer than 2 spans, skip it; a citation marker never occupies a whole line by itself
if len(line['spans'])<=1:
continue
# use the tallest span in the line as the baseline for position comparison
max_hi_span = line['spans'][0]['bbox']
min_font_sz = 10000
for s in line['spans']:
if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
max_hi_span = s['bbox']
if min_font_sz>s['size']:
min_font_sz = s['size']
base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
span_to_del = []
for i, span in enumerate(line['spans']):
span_hi = span['bbox'][3]-span['bbox'][1]
span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
span_font_sz = span['size']
if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
"""
1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式
2. 如果这个角标的前面是一个单词(长度大于5)而不是任何大写或小写的短字母的话 应该也是角标
3. 上标里有数字和逗号或者数字+星号的组合,方括号,一般肯定就是角标了
4. 这个角标属于前文还是后文要根据距离来判断,如果距离前面的文本太近,那么就是前面的角标,否则就是后面的角标
"""
if check_1(line['spans'], i) or check_2(line['spans'], i) or check_3(line['spans'], i):
"""删除掉这个角标:删除这个span, 同时还要更新line的text"""
span_to_del.append(span)
if len(span_to_del)>0:
for span in span_to_del:
line['spans'].remove(span)
line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']])
return with_char_text_blcoks
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info, interline_eq_info, raw_pymu_blocks,
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,layout_tree,
page_w, page_h, footnote_bboxes_tmp):
"""
"""
return_dict = {}
return_dict['para_blocks'] = {}
return_dict['preproc_blocks'] = text_blocks_preproc
return_dict['images'] = image_info
return_dict['tables'] = table_info
return_dict['interline_equations'] = interline_eq_info
return_dict['inline_equations'] = inline_eq_info
return_dict['layout_bboxes'] = layout_bboxes
return_dict['pymu_raw_blocks'] = raw_pymu_blocks
return_dict['global_statistic'] = {}
return_dict['droped_text_block'] = removed_text_blocks
return_dict['droped_image_block'] = removed_image_blocks
return_dict['droped_table_block'] = []
return_dict['image_backup'] = images_backup
return_dict['table_backup'] = []
return_dict['page_idx'] = page_id
return_dict['page_size'] = [page_w, page_h]
return_dict['_layout_tree'] = layout_tree # kept to help analyze and debug the layout
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
return return_dict
from collections import defaultdict
from loguru import logger
from libs.boxbase import _is_in, calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
def is_single_line_block(block):
# Determine based on the width and height of the block
block_width = block["X1"] - block["X0"]
block_height = block["bbox"][3] - block["bbox"][1]
# If the height of the block is close to the average character height and the width is large, it is considered a single line
return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
"""
This function gets the most common bboxes from the bboxes
Parameters
----------
bboxes : list
bboxes
page_height : float
height of the page
position : str, optional
"top" or "bottom", by default "top"
threshold : float, optional
threshold, by default 0.25
num_bboxes : int, optional
number of bboxes to return, by default 3
min_frequency : int, optional
minimum frequency of the bbox, by default 2
Returns
-------
common_bboxes : list
common bboxes
"""
# Filter bbox by position
if position == "top":
filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
else:
filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
# Find the most common bbox
bbox_count = defaultdict(int)
for bbox in filtered_bboxes:
bbox_count[tuple(bbox)] += 1
# Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
common_bboxes = [
bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
][:num_bboxes]
return common_bboxes
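# Minimal usage sketch (illustrative only, numbers are made up): with synthetic bboxes on a
# page of height 1000, a bbox repeated near the top is returned as a common header candidate.
def _demo_get_most_common_bboxes():
    bboxes = [[50, 20, 500, 40]] * 3 + [[50, 700, 500, 720]]
    return get_most_common_bboxes(bboxes, page_height=1000, position="top")
    # expected: [(50, 20, 500, 40)] -- it appears 3 times within the top 25% of the page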
def detect_footer_header2(result_dict, similarity_threshold=0.5):
"""
This function detects the header and footer of the document.
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
result_dict : dict
result dictionary
"""
# Traverse all blocks in the document
single_line_blocks = 0
total_blocks = 0
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
total_blocks += 1
if is_single_line_block(block):
single_line_blocks += 1
# If there are no blocks, skip the header and footer detection
if total_blocks == 0:
print("No blocks found. Skipping header/footer detection.")
return result_dict
# If most of the blocks are single-line, skip the header and footer detection
if single_line_blocks / total_blocks > 0.5: # 50% of the blocks are single-line
# print("Skipping header/footer detection for text-dense document.")
return result_dict
# Collect the bounding boxes of all blocks
all_bboxes = []
all_texts = []
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
all_bboxes.append(block["bbox"])
# Get the height of the page
page_height = max(bbox[3] for bbox in all_bboxes)
# Get the most common bbox lists for headers and footers
common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
# Detect and mark headers and footers
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
bbox = block["bbox"]
text = block["text"]
is_header = compare_bbox_with_list(bbox, common_header_bboxes)
is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
block["is_header"] = int(is_header)
block["is_footer"] = int(is_footer)
return result_dict
def __get_page_size(page_sizes:list):
"""
页面大小可能不一样
"""
w = sum([w for w,h in page_sizes])/len(page_sizes)
h = sum([h for w,h in page_sizes])/len(page_sizes)
return w, h
def __calculate_iou(bbox1, bbox2):
iou = calculate_iou(bbox1, bbox2)
return iou
def __is_same_pos(box1, box2, iou_threshold):
iou = __calculate_iou(box1, box2)
return iou >= iou_threshold
def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
"""
common bbox必须大于page_cnt的1/3
"""
min_occurance_cnt = max(3, page_cnt//4)
header_det_bbox = []
footer_det_bbox = []
hdr_same_pos_group = []
btn_same_pos_group = []
page_w, page_h = __get_page_size(page_size)
top_y, bottom_y = page_h*page_range_threshold, page_h*(1-page_range_threshold)
top_bbox = [b for b in bboxes if b[3]<top_y]
bottom_bbox = [b for b in bboxes if b[1]>bottom_y]
# group the bboxes by position: two bboxes count as the same position if their IOU > iou_threshold
for i in range(0, len(top_bbox)):
hdr_same_pos_group.append([top_bbox[i]])
for j in range(i+1, len(top_bbox)):
if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold):
#header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])]
hdr_same_pos_group[i].append(top_bbox[j])
for i in range(0, len(bottom_bbox)):
btn_same_pos_group.append([bottom_bbox[i]])
for j in range(i+1, len(bottom_bbox)):
if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold):
#footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])]
btn_same_pos_group[i].append(bottom_bbox[j])
# keep only the groups that appear on enough pages
hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt]
btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt]
# flatten the two list-of-lists
hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g]
btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g]
# take the maximum box[3] in hdr_same_pos_group and the minimum box[1] in btn_same_pos_group
hdr_same_pos_group.sort(key=lambda b:b[3])
btn_same_pos_group.sort(key=lambda b:b[1])
hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0
btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h
header_det_bbox = [0, 0, page_w, hdr_y]
footer_det_bbox = [0, btn_y, page_w, page_h]
# logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}")
return header_det_bbox, footer_det_bbox, page_w, page_h
def drop_footer_header(pdf_info_dict:dict):
"""
启用规则探测,在全局的视角上通过统计的方法。
"""
header = []
footer = []
all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
page_cnt = len(pdf_info_dict.keys()) # total number of pages
header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)
""""
把范围扩展到页面水平的整个方向上
"""
if header:
header = [0, 0, page_w, header[3]+1]
if footer:
footer = [0, footer[1]-1, page_w, page_h]
# once the header/footer ranges are known, remove text and image content inside them on every page
# remove text blocks
for _, page_info in pdf_info_dict.items():
header_text_blk = []
footer_text_blk = []
for blk in page_info['preproc_blocks']:
blk_bbox = blk['bbox']
if header and blk_bbox[3]<=header[3]:
blk['tag'] = "header"
header_text_blk.append(blk)
elif footer and blk_bbox[1]>=footer[1]:
blk['tag'] = "footer"
footer_text_blk.append(blk)
# move them into droped_text_block
page_info['droped_text_block'].extend(header_text_blk)
page_info['droped_text_block'].extend(footer_text_blk)
for blk in header_text_blk:
page_info['preproc_blocks'].remove(blk)
for blk in footer_text_blk:
page_info['preproc_blocks'].remove(blk)
"""接下来把footer、header上的图片也删除掉。图片包括正常的和backup的"""
header_image = []
footer_image = []
for image_info in page_info['images']:
img_bbox = image_info['bbox']
if header and img_bbox[3]<=header[3]:
image_info['tag'] = "header"
header_image.append(image_info)
elif footer and img_bbox[1]>=footer[1]:
image_info['tag'] = "footer"
footer_image.append(image_info)
page_info['droped_image_block'].extend(header_image)
page_info['droped_image_block'].extend(footer_image)
for img in header_image:
page_info['images'].remove(img)
for img in footer_image:
page_info['images'].remove(img)
"""接下来吧backup的图片也删除掉"""
header_image = []
footer_image = []
for image_info in page_info['image_backup']:
img_bbox = image_info['bbox']
if header and img_bbox[3]<=header[3]:
image_info['tag'] = "header"
header_image.append(image_info)
elif footer and img_bbox[1]>=footer[1]:
image_info['tag'] = "footer"
footer_image.append(image_info)
page_info['droped_image_block'].extend(header_image)
page_info['droped_image_block'].extend(footer_image)
for img in header_image:
page_info['image_backup'].remove(img)
for img in footer_image:
page_info['image_backup'].remove(img)
return header, footer
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
import fitz
import json
import os
from pathlib import Path
from loguru import logger
TYPE_INLINE_EQUATION = "inline-equation"
TYPE_INTERLINE_EQUATION = "interline-equation"
def combine_chars_to_pymudict(block_dict, char_dict):
"""
把block级别的pymupdf 结构里加入char结构
"""
# 因为block_dict 被裁剪过,因此先把他和char_dict文字块对齐,才能进行补充
char_map = {tuple(item['bbox']):item for item in char_dict}
for i in range(len(block_dict)): # block
block = block_dict[i]
key = block['bbox']
char_dict_item = char_map[tuple(key)]
char_dict_map = {tuple(item['bbox']):item for item in char_dict_item['lines']}
for j in range(len(block['lines'])):
lines = block['lines'][j]
with_char_lines = char_dict_map[lines['bbox']]
for k in range(len(lines['spans'])):
spans = lines['spans'][k]
try:
spans['chars'] = with_char_lines['spans'][k]['chars']
except Exception:
# on a mismatch, log the offending line and leave this span without char data
logger.error(char_dict[i]['lines'][j])
return block_dict
def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
"""
计算box1和box2的重叠面积占最小面积的box的比例
"""
# Determine the coordinates of the intersection rectangle
x_left = max(bbox1[0], min_bbox[0])
y_top = max(bbox1[1], min_bbox[1])
x_right = min(bbox1[2], min_bbox[2])
y_bottom = min(bbox1[3], min_bbox[3])
if x_right < x_left or y_bottom < y_top:
return 0.0
# The area of overlap area
intersection_area = (x_right - x_left) * (y_bottom - y_top)
min_box_area = (min_bbox[3]-min_bbox[1])*(min_bbox[2]-min_bbox[0])
if min_box_area==0:
return 0
else:
return intersection_area / min_box_area
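# Illustrative sketch (coordinates are made up): a small box fully inside a big box gives
# ratio 1.0; a box shifted halfway out of it gives 0.5.
def _demo_overlap_ratio():
    big = [0, 0, 100, 100]
    small_inside = [10, 10, 20, 20]   # fully contained -> 1.0
    half_out = [90, 0, 110, 100]      # only half of it overlaps big -> 0.5
    return (calculate_overlap_area_2_minbox_area_ratio(big, small_inside),
            calculate_overlap_area_2_minbox_area_ratio(big, half_out))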
def _is_xin(bbox1, bbox2):
area1 = abs(bbox1[2]-bbox1[0])*abs(bbox1[3]-bbox1[1])
area2 = abs(bbox2[2]-bbox2[0])*abs(bbox2[3]-bbox2[1])
if area1<area2:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox2, bbox1)
else:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
return ratio>0.6
def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
"""消除掉整个块都在行间公式块内部的文本块"""
for eq_bbox in interline_bboxes:
removed_txt_blk = []
for text_blk in text_blocks:
text_bbox = text_blk['bbox']
if calculate_overlap_area_2_minbox_area_ratio(eq_bbox['bbox'], text_bbox)>=0.7:
removed_txt_blk.append(text_blk)
for blk in removed_txt_blk:
text_blocks.remove(blk)
return text_blocks
def _is_in_or_part_overlap(box1, box2) -> bool:
"""
两个bbox是否有部分重叠或者包含
"""
if box1 is None or box2 is None:
return False
x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2
return not (x1_1 < x0_2 or # box1 is to the left of box2
x0_1 > x1_2 or # box1 is to the right of box2
y1_1 < y0_2 or # box1 is above box2
y0_1 > y1_2) # box1 is below box2
def remove_text_block_overlap_interline_equation_bbox(interline_eq_bboxes, pymu_block_list):
"""消除掉行行内公式有部分重叠的文本块的内容。
同时重新计算消除重叠之后文本块的大小"""
deleted_block = []
for text_block in pymu_block_list:
deleted_line = []
for line in text_block['lines']:
deleted_span = []
for span in line['spans']:
deleted_chars = []
for char in span['chars']:
if any([_is_in_or_part_overlap(char['bbox'], eq_bbox['bbox']) for eq_bbox in interline_eq_bboxes]):
deleted_chars.append(char)
# if the span has no chars left, delete the span
for char in deleted_chars:
span['chars'].remove(char)
# recompute the span's bbox
if len(span['chars'])==0: # delete this span
deleted_span.append(span)
else:
span['bbox'] = min([b['bbox'][0] for b in span['chars']]),min([b['bbox'][1] for b in span['chars']]),max([b['bbox'][2] for b in span['chars']]), max([b['bbox'][3] for b in span['chars']])
# drop the emptied spans
for span in deleted_span:
line['spans'].remove(span)
if len(line['spans'])==0: # delete this line
deleted_line.append(line)
else:
line['bbox'] = min([b['bbox'][0] for b in line['spans']]),min([b['bbox'][1] for b in line['spans']]),max([b['bbox'][2] for b in line['spans']]), max([b['bbox'][3] for b in line['spans']])
# check whether this block can be deleted
for line in deleted_line:
text_block['lines'].remove(line)
if len(text_block['lines'])==0: # delete the block
deleted_block.append(text_block)
else:
text_block['bbox'] = min([b['bbox'][0] for b in text_block['lines']]),min([b['bbox'][1] for b in text_block['lines']]),max([b['bbox'][2] for b in text_block['lines']]), max([b['bbox'][3] for b in text_block['lines']])
# finally remove the emptied text blocks
for block in deleted_block:
pymu_block_list.remove(block)
if len(pymu_block_list)==0:
return []
return pymu_block_list
def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
"""在行间公式对应的地方插上一个伪造的block"""
for eq in interline_eq_bboxes:
bbox = eq['bbox']
latex_content = eq['latex_text']
text_block = {
"number": len(pymu_block_list),
"type": 0,
"bbox": bbox,
"lines": [
{
"spans": [
{
"size": 9.962599754333496,
"_type": TYPE_INTERLINE_EQUATION,
"flags": 4,
"font": TYPE_INTERLINE_EQUATION,
"color": 0,
"ascender": 0.9409999847412109,
"descender": -0.3050000071525574,
"text": f"\n$$\n{latex_content}\n$$\n",
"origin": [
bbox[0],
bbox[1]
],
"bbox": bbox
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": bbox
}
]
}
pymu_block_list.append(text_block)
def x_overlap_ratio(box1, box2):
a, _, c, _ = box1
e, _, g, _ = box2
# overlap width
overlap_x = max(min(c, g) - max(a, e), 0)
# width of box2 (the denominator: the ratio measures how much of box2 is covered)
width1 = g - e
# overlap ratio
overlap_ratio = overlap_x / width1 if width1 != 0 else 0
return overlap_ratio
def __is_x_dir_overlap(bbox1, bbox2):
return not (bbox1[2]<bbox2[0] or bbox1[0]>bbox2[2])
def __y_overlap_ratio(box1, box2):
""""""
_, b, _, d = box1
_, f, _, h = box2
# 计算重叠高度
overlap_y = max(min(d, h) - max(b, f), 0)
# 计算box1的高度
height1 = d - b
# 计算重叠比例
overlap_ratio = overlap_y / height1 if height1 != 0 else 0
return overlap_ratio
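# Illustrative sketch (made-up coordinates): x_overlap_ratio measures how much of the second
# box is covered horizontally, __y_overlap_ratio how much of the first box is covered vertically.
def _demo_overlap_ratios():
    eq_box = [100, 50, 200, 70]
    char_box = [190, 50, 210, 70]   # only the left half of the char overlaps the equation box
    line_box = [100, 45, 400, 75]   # the line fully covers the equation box vertically
    return x_overlap_ratio(eq_box, char_box), __y_overlap_ratio(eq_box, line_box)
    # expected: (0.5, 1.0)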
def replace_line_v2(eqinfo, line):
"""
扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。
最后与这个x0,x1有相交的span0, span1内部进行分割。
"""
first_overlap_span = -1
first_overlap_span_idx = -1
last_overlap_span = -1
delete_chars = []
for i in range(0, len(line['spans'])):
if line['spans'][i].get("_type", None) is not None:
continue # skip: this is already an inserted fake equation span
for char in line['spans'][i]['chars']:
if __is_x_dir_overlap(eqinfo['bbox'], char['bbox']):
line_txt = ""
for span in line['spans']:
span_txt = "<span>"
for ch in span['chars']:
span_txt = span_txt + ch['c']
span_txt = span_txt + "</span>"
line_txt = line_txt + span_txt
if first_overlap_span_idx == -1:
first_overlap_span = line['spans'][i]
first_overlap_span_idx = i
last_overlap_span = line['spans'][i]
delete_chars.append(char)
# check the first and last char: do they belong more to the equation or to the normal span?
if len(delete_chars)>0:
ch0_bbox = delete_chars[0]['bbox']
if x_overlap_ratio(eqinfo['bbox'], ch0_bbox)<0.51:
delete_chars.remove(delete_chars[0])
if len(delete_chars)>0:
ch0_bbox = delete_chars[-1]['bbox']
if x_overlap_ratio(eqinfo['bbox'], ch0_bbox)<0.51:
delete_chars.remove(delete_chars[-1])
# compute the actual x0, x1 of the chars in the deleted x-range
if len(delete_chars):
x0, x1 = min([b['bbox'][0] for b in delete_chars]), max([b['bbox'][2] for b in delete_chars])
else:
logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}")
return False
# delete the spans located between x0 and x1
delete_span = []
for span in line['spans']:
span_box = span['bbox']
if x0<=span_box[0] and span_box[2]<=x1:
delete_span.append(span)
for span in delete_span:
line['spans'].remove(span)
equation_span = {
"size": 9.962599754333496,
"_type": TYPE_INLINE_EQUATION,
"flags": 4,
"font": TYPE_INLINE_EQUATION,
"color": 0,
"ascender": 0.9409999847412109,
"descender": -0.3050000071525574,
"text": "",
"origin": [
337.1410153102337,
216.0205245153934
],
"bbox": [
337.1410153102337,
216.0205245153934,
390.4496373892022,
228.50171037628277
]
}
#equation_span = line['spans'][0].copy()
equation_span['text'] = f" ${eqinfo['latex_text']}$ "
equation_span['bbox'] = [x0, equation_span['bbox'][1], x1, equation_span['bbox'][3]]
equation_span['origin'] = [equation_span['bbox'][0], equation_span['bbox'][1]]
equation_span['chars'] = delete_chars
equation_span['_type'] = TYPE_INLINE_EQUATION
equation_span['_eq_bbox'] = eqinfo['bbox']
line['spans'].insert(first_overlap_span_idx+1, equation_span) # insert the equation span
# logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】")
# split the first and last overlapping spans and put the remaining parts back in place
first_span_chars = [char for char in first_overlap_span['chars'] if (char['bbox'][2]+char['bbox'][0])/2<x0]
tail_span_chars = [char for char in last_overlap_span['chars'] if (char['bbox'][0]+char['bbox'][2])/2>x1]
if len(first_span_chars)>0:
first_overlap_span['chars'] = first_span_chars
first_overlap_span['text'] = ''.join([char['c'] for char in first_span_chars])
first_overlap_span['bbox'] = (first_overlap_span['bbox'][0], first_overlap_span['bbox'][1], max([chr['bbox'][2] for chr in first_span_chars]), first_overlap_span['bbox'][3])
# first_overlap_span['_type'] = "first"
else:
# drop it
if first_overlap_span not in delete_span:
line['spans'].remove(first_overlap_span)
if len(tail_span_chars)>0:
if last_overlap_span==first_overlap_span: # in this case a new span must be inserted
tail_span_txt = ''.join([char['c'] for char in tail_span_chars])
last_span_to_insert = last_overlap_span.copy()
last_span_to_insert['chars'] = tail_span_chars
last_span_to_insert['text'] = ''.join([char['c'] for char in tail_span_chars])
last_span_to_insert['bbox'] = (min([chr['bbox'][0] for chr in tail_span_chars]), last_overlap_span['bbox'][1], last_overlap_span['bbox'][2], last_overlap_span['bbox'][3])
# insert right after the equation span
equation_idx = line['spans'].index(equation_span)
line['spans'].insert(equation_idx+1, last_span_to_insert) # insert the tail span after the equation
else: # modify the original span in place
last_overlap_span['chars'] = tail_span_chars
last_overlap_span['text'] = ''.join([char['c'] for char in tail_span_chars])
last_overlap_span['bbox'] = (min([chr['bbox'][0] for chr in tail_span_chars]), last_overlap_span['bbox'][1], last_overlap_span['bbox'][2], last_overlap_span['bbox'][3])
else:
# drop it
if last_overlap_span not in delete_span and last_overlap_span!=first_overlap_span:
line['spans'].remove(last_overlap_span)
remain_txt = ""
for span in line['spans']:
span_txt = "<span>"
for char in span['chars']:
span_txt = span_txt + char['c']
span_txt = span_txt + "</span>"
remain_txt = remain_txt + span_txt
# logger.info(f"<== succ replace, text is 【{remain_txt}】, equation is 【{eqinfo['latex_text']}】")
return True
def replace_eq_blk(eqinfo, text_block):
"""替换行内公式"""
for line in text_block['lines']:
line_bbox = line['bbox']
if _is_xin(eqinfo['bbox'], line_bbox) or __y_overlap_ratio(eqinfo['bbox'], line_bbox)>0.6: # locate the line; the y-direction overlap ratio is used because sometimes a line is narrower than the equation box (tall line, narrow equation)
replace_succ = replace_line_v2(eqinfo, line)
if not replace_succ: # sometimes the line height reported by the API is wrong, so the span-level replacement fails and we retry on the next line
continue
else:
break
else:
return False
return True
def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
"""替换行内公式"""
for eqinfo in inline_equation_bboxes:
eqbox = eqinfo['bbox']
for blk in raw_text_blocks:
if _is_xin(eqbox, blk['bbox']):
if not replace_eq_blk(eqinfo, blk):
logger.error(f"行内公式没有替换成功:{eqinfo} ")
else:
break
return raw_text_blocks
def remove_chars_in_text_blocks(text_blocks):
"""删除text_blocks里的char"""
for blk in text_blocks:
for line in blk['lines']:
for span in line['spans']:
_ = span.pop("chars", "no such key")
return text_blocks
def replace_equations_in_textblock(raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes):
"""
替换行间和和行内公式为latex
"""
raw_text_blocks = remove_text_block_in_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 1: drop text blocks inside interline-equation boxes
raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 2: remove content covered by interline-equation boxes
insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
return raw_text_blocks
def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
"""
"""
new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf"
with open(json_path, "r", encoding='utf-8') as f:
obj = json.loads(f.read())
if os.path.exists(new_pdf):
os.remove(new_pdf)
new_doc = fitz.open('')
doc = fitz.open(pdf_path)
new_doc = fitz.open(pdf_path)
for i in range(len(new_doc)):
page = new_doc[i]
inline_equation_bboxes = obj[f"page_{i}"]['inline_equations']
interline_equation_bboxes = obj[f"page_{i}"]['interline_equations']
raw_text_blocks = obj[f'page_{i}']['preproc_blocks']
raw_text_blocks = remove_text_block_in_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 1: drop text blocks inside interline-equation boxes
raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 2: remove content covered by interline-equation boxes
insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
# to check whether equations are duplicated, color the background of each span according to its type
color_map = [fitz.pdfcolor['blue'],fitz.pdfcolor['green']]
j = 0
for blk in raw_text_blocks:
for i,line in enumerate(blk['lines']):
# line_box = line['bbox']
# shape = page.new_shape()
# shape.draw_rect(line_box)
# shape.finish(color=fitz.pdfcolor['red'], fill=color_map[j%2], fill_opacity=0.3)
# shape.commit()
# j = j+1
for i, span in enumerate(line['spans']):
shape_page = page.new_shape()
span_type = span.get('_type')
color = fitz.pdfcolor['blue']
if span_type=='first':
color = fitz.pdfcolor['blue']
elif span_type=='tail':
color = fitz.pdfcolor['green']
elif span_type==TYPE_INLINE_EQUATION:
color = fitz.pdfcolor['black']
else:
color = None
b = span['bbox']
shape_page.draw_rect(b)
shape_page.finish(color=None, fill=color, fill_opacity=0.3)
shape_page.commit()
new_doc.save(new_pdf)
logger.info(f"save ok {new_pdf}")
final_json = json.dumps(obj, ensure_ascii=False,indent=2)
with open("equations_test/final_json.json", "w") as f:
f.write(final_json)
return new_pdf
if __name__=="__main__":
# draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
pass
import re
from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, _is_in, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from loguru import logger
from libs.textbase import get_text_block_base_info
def fix_image_vertical(image_bboxes:list, text_blocks:list):
"""
修正图片的位置
如果图片与文字block发生一定重叠(也就是图片切到了一部分文字),那么减少图片边缘,让文字和图片不再重叠。
只对垂直方向进行。
"""
for image_bbox in image_bboxes:
for text_block in text_blocks:
text_bbox = text_block["bbox"]
if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]):
if text_bbox[1] < image_bbox[1]: # the text is above the image
image_bbox[1] = text_bbox[3]+1
elif text_bbox[3]>image_bbox[3]: # the text is below the image
image_bbox[3] = text_bbox[1]-1
return image_bboxes
def __merge_if_common_edge(bbox1, bbox2):
x_min_1, y_min_1, x_max_1, y_max_1 = bbox1
x_min_2, y_min_2, x_max_2, y_max_2 = bbox2
# check for a common horizontal edge
if y_min_1 == y_min_2 or y_max_1 == y_max_2:
# make sure the x ranges of the two boxes overlap
if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2):
return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
# check for a common vertical edge
if x_min_1 == x_min_2 or x_max_1 == x_max_2:
# make sure the y ranges of the two boxes overlap
if max(y_min_1, y_min_2) <= min(y_max_1, y_max_2):
return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
# no common edge
return None
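# Illustrative sketch (coordinates are made up): two boxes sharing the same top edge and
# overlapping in x are merged into their bounding box; disjoint boxes are not merged.
def _demo_merge_if_common_edge():
    merged = __merge_if_common_edge([0, 10, 50, 60], [40, 10, 90, 60])
    not_merged = __merge_if_common_edge([0, 10, 50, 60], [60, 100, 90, 150])
    return merged, not_merged
    # expected: ([0, 10, 90, 60], None)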
def fix_seperated_image(image_bboxes:list):
"""
如果2个图片有一个边重叠,那么合并2个图片
"""
new_images = []
droped_img_idx = []
for i in range(0, len(image_bboxes)):
for j in range(i+1, len(image_bboxes)):
new_img = __merge_if_common_edge(image_bboxes[i], image_bboxes[j])
if new_img is not None:
new_images.append(new_img)
droped_img_idx.append(i)
droped_img_idx.append(j)
break
for i in range(0, len(image_bboxes)):
if i not in droped_img_idx:
new_images.append(image_bboxes[i])
return new_images
def __check_img_title_pattern(text):
"""
检查文本段是否是表格的标题
"""
patterns = [r"^(fig|figure).*", r"^(scheme).*"]
text = text.strip()
for pattern in patterns:
match = re.match(pattern, text, re.IGNORECASE)
if match:
return True
return False
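# Illustrative sketch: strings that the figure-caption pattern accepts and rejects.
def _demo_check_img_title_pattern():
    return [__check_img_title_pattern(t) for t in
            ["Figure 3. Overview of the model", "Fig. 2: results", "Scheme 1", "Table 2"]]
    # expected: [True, True, True, False]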
def __get_fig_caption_text(text_block):
txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
line_cnt = len(text_block['lines'])
txt = txt.replace("Ž . ", '')
return txt, line_cnt
def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
"""
继续向下方寻找和图片caption字号,字体,颜色一样的文字框,合并入caption。
text_block是已经找到的图片catpion(这个caption可能不全,多行被划分到多个pymu block里了)
"""
combined_image_caption_text_block = list(text_block.copy()['bbox'])
base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block)
while True:
tb_add = find_bottom_nearest_text_bbox(pymu_blocks, combined_image_caption_text_block)
if not tb_add:
break
tb_font_color, tb_font_size, tb_font_type = get_text_block_base_info(tb_add)
if tb_font_color==base_font_color and tb_font_size==base_font_size and tb_font_type==base_font_type:
combined_image_caption_text_block[0] = min(combined_image_caption_text_block[0], tb_add['bbox'][0])
combined_image_caption_text_block[2] = max(combined_image_caption_text_block[2], tb_add['bbox'][2])
combined_image_caption_text_block[3] = tb_add['bbox'][3]
else:
break
image_box[0] = min(image_box[0], combined_image_caption_text_block[0])
image_box[1] = min(image_box[1], combined_image_caption_text_block[1])
image_box[2] = max(image_box[2], combined_image_caption_text_block[2])
image_box[3] = max(image_box[3], combined_image_caption_text_block[3])
text_block['_image_caption'] = True
def include_img_title(pymu_blocks, image_bboxes: list):
"""
向上方和下方寻找符合图片title的文本block,合并到图片里
如果图片上下都有fig的情况怎么办?寻找标题距离最近的那个。
---
增加对左侧和右侧图片标题的寻找
"""
for tb in image_bboxes:
# prefer the caption below the image
max_find_cnt = 3 # look at most 3 blocks in each direction before giving up
temp_box = tb.copy()
while max_find_cnt>0:
text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_btn:
txt, line_cnt = __get_fig_caption_text(text_block_btn)
if len(txt.strip())>0:
if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: # line_cnt<3 skips sub-captions, or text below the image that the detection model failed to include in the image box
max_find_cnt = max_find_cnt - 1
temp_box[3] = text_block_btn['bbox'][3]
continue
else:
break
else:
temp_box[3] = text_block_btn['bbox'][3] # keep the width, extend downward
max_find_cnt = max_find_cnt - 1
else:
break
max_find_cnt = 3 # look at most 3 blocks upward before giving up
temp_box = tb.copy()
while max_find_cnt>0:
text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_top:
txt, line_cnt = __get_fig_caption_text(text_block_top)
if len(txt.strip())>0:
if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
max_find_cnt = max_find_cnt - 1
temp_box[1] = text_block_top['bbox'][1]
continue
else:
break
else:
b = text_block_top['bbox']
temp_box[1] = b[1] # keep the width, extend upward
max_find_cnt = max_find_cnt - 1
else:
break
if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
btn_text, _ = __get_fig_caption_text(text_block_btn)
top_text, _ = __get_fig_caption_text(text_block_top)
if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
# pick the one closer to the image
btn_text_distance = text_block_btn['bbox'][1] - tb[3]
top_text_distance = tb[1] - text_block_top['bbox'][3]
if btn_text_distance<top_text_distance: # the caption is below the image
__find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
else:
text_block = text_block_top
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block_btn['_image_caption'] = True
continue
text_block = text_block_btn # find_bottom_nearest_text_bbox(pymu_blocks, tb)
if text_block and text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(text_block)
if __check_img_title_pattern(first_text_line):
# after finding the caption pattern, keep searching in the same direction for textblocks with the same color, size and font
__find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
continue
text_block = text_block_top # find_top_nearest_text_bbox(pymu_blocks, tb)
if text_block and text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(text_block)
if __check_img_title_pattern(first_text_line):
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_image_caption'] = True
continue
"""向左、向右寻找,暂时只寻找一次"""
left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
if left_text_block and left_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(left_text_block)
if __check_img_title_pattern(first_text_line):
tb[0] = min(tb[0], left_text_block['bbox'][0])
tb[1] = min(tb[1], left_text_block['bbox'][1])
tb[2] = max(tb[2], left_text_block['bbox'][2])
tb[3] = max(tb[3], left_text_block['bbox'][3])
left_text_block['_image_caption'] = True
continue
right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
if right_text_block and right_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(right_text_block)
if __check_img_title_pattern(first_text_line):
tb[0] = min(tb[0], right_text_block['bbox'][0])
tb[1] = min(tb[1], right_text_block['bbox'][1])
tb[2] = max(tb[2], right_text_block['bbox'][2])
tb[3] = max(tb[3], right_text_block['bbox'][3])
right_text_block['_image_caption'] = True
continue
return image_bboxes
def combine_images(image_bboxes:list):
"""
合并图片,如果图片有重叠,那么合并
"""
new_images = []
droped_img_idx = []
for i in range(0, len(image_bboxes)):
for j in range(i+1, len(image_bboxes)):
if j not in droped_img_idx and _is_in_or_part_overlap(image_bboxes[i], image_bboxes[j]):
# merge
image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
droped_img_idx.append(j)
for i in range(0, len(image_bboxes)):
if i not in droped_img_idx:
new_images.append(image_bboxes[i])
return new_images
import collections
def get_main_text_font(pdf_docs):
font_names = collections.Counter()
for page in pdf_docs:
blocks = page.get_text('dict')['blocks']
if blocks is not None:
for block in blocks:
lines = block.get('lines')
if lines is not None:
for line in lines:
span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
'font' in span and len(span['text']) > 0]
if span_font:
# main_text_font should be based on the font with the most characters, not on span-level counts
# font_names.append(font_name for font_name in span_font)
# block_fonts.append(font_name for font_name in span_font)
for font, count in span_font:
font_names[font] += count
main_text_font = font_names.most_common(1)[0][0]
return main_text_font
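# Hypothetical usage sketch ("sample.pdf" is a placeholder path, not from this repo); the
# function expects an iterable of fitz (PyMuPDF) pages supporting get_text('dict'):
#   doc = fitz.open("sample.pdf")
#   main_font = get_main_text_font(doc)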
from libs.commons import fitz
from libs.boxbase import _is_in, _is_in_or_part_overlap
from libs.drop_reason import DropReason
def __area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
"""
检查page是包含有颜色背景的矩形
"""
color_bg_rect = []
p_width, p_height = page.rect.width, page.rect.height
# first collect the candidate colored background rectangles
blocks = page.get_cdrawings()
for block in blocks:
if 'fill' in block and block['fill']: # skip transparent (unfilled) blocks
fill = list(block['fill'])
fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2])
if (fill[0], fill[1], fill[2]) == (1, 1, 1): # skip pure white fills
continue
rect = block['rect']
# skip very small rectangles
if __area(rect) < 10*10:
continue
# skip color patches that belong to svg images
if any([_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]):
continue
color_bg_rect.append(rect)
# find the largest background rectangle
if len(color_bg_rect) > 0:
max_rect = max(color_bg_rect, key=lambda x:__area(x))
max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
# TODO: check whether the largest background rectangle contains more than 3 lines of text, or 50 characters
if max_rect[2]-max_rect[0] > 0.2*p_width and max_rect[3]-max_rect[1] > 0.1*p_height: # the rectangle is large enough
# check whether any text block falls inside this rectangle
for text_block in text_blocks:
box = text_block['bbox']
box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
if _is_in(box_int, max_rect_int):
return True
return False
def __is_table_overlap_text_block(text_blocks, table_bbox):
"""
检查table_bbox是否覆盖了text_blocks里的文本块
TODO
"""
for text_block in text_blocks:
box = text_block['bbox']
if _is_in_or_part_overlap(table_bbox, box):
return True
return False
def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
"""
return:(True|False, err_msg)
True, 如果pdf符合要求
False, 如果pdf不符合要求
"""
if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
return False, {"need_drop": True, "drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
return True, None
from libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def rectangle_position_determination(rect, p_width):
"""
判断矩形是否在页面中轴线附近。
Args:
rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。
p_width (int): 页面宽度。
Returns:
bool: 若矩形在页面中轴线附近则返回True,否则返回False。
"""
# x coordinate of the page's vertical center line
x_axis = p_width / 2
# does the rectangle span the center line?
is_span = rect[0] < x_axis and rect[2] > x_axis
if is_span:
return True
else:
# distance from the rectangle to the center line, measured from the nearer side
distance = rect[0] - x_axis if rect[0] > x_axis else x_axis - rect[2]
# is the distance to the center line less than 20% of the page width?
if distance < p_width * 0.2:
return True
else:
return False
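# Illustrative sketch (page width 600, so the center line is at x=300): a rectangle spanning
# the center line and one close to it pass; one far to the left does not.
def _demo_rectangle_position_determination():
    return (rectangle_position_determination([250, 0, 350, 20], 600),   # spans the center line
            rectangle_position_determination([310, 0, 400, 20], 600),   # 10 px to the right of it
            rectangle_position_determination([0, 0, 100, 20], 600))     # 200 px away
    # expected: (True, True, False)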
def remove_colored_strip_textblock(remain_text_blocks, page):
"""
根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_strip_textblock。
Args:
remain_text_blocks (list): 剩余文本块列表。
page (Page): 页面对象。
Returns:
tuple: 剩余文本块列表和移除的文本块列表。
"""
colored_strip_textblocks = [] # 先构造一个空的返回
if len(remain_text_blocks) > 0:
p_width, p_height = page.rect.width, page.rect.height
blocks = page.get_cdrawings()
colored_strip_bg_rect = []
for block in blocks:
is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0) # skip transparent or white fills
rect = block['rect']
area_is_large_enough = __area(rect) > 100 # skip very small rectangles
rectangle_position_determination_result = rectangle_position_determination(rect, p_width)
in_upper_half_page = rect[3] < p_height * 0.3 # the rectangle sits in the upper part of the page: its bottom edge is above 30% of the page height
aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4 # the width/height ratio exceeds 4
if is_filled and area_is_large_enough and rectangle_position_determination_result and in_upper_half_page and aspect_ratio_exceeds_4:
colored_strip_bg_rect.append(rect)
if len(colored_strip_bg_rect) > 0:
for colored_strip_block_bbox in colored_strip_bg_rect:
for text_block in remain_text_blocks:
text_bbox = text_block['bbox']
if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6):
logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
colored_strip_textblocks.append(text_block)
if len(colored_strip_textblocks) > 0:
for colored_strip_textblock in colored_strip_textblocks:
if colored_strip_textblock in remain_text_blocks:
remain_text_blocks.remove(colored_strip_textblock)
return remain_text_blocks, colored_strip_textblocks
import json
import math
from libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict):
"""
This function detects watermarks and vertical margin notes in the document.
Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
Parameters
----------
result_dict : dict
The result dictionary.
Returns
-------
result_dict : dict
The updated result dictionary.
"""
# Dictionary to store information about potential watermarks
potential_watermarks = {}
potential_margin_notes = {}
for page_id, page_content in result_dict.items():
if page_id.startswith("page_"):
for block_id, block_data in page_content.items():
if block_id.startswith("block_"):
if "dir" in block_data:
coordinates_text = (block_data["bbox"], block_data["text"]) # Tuple of coordinates and text
angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
angle = abs(math.degrees(angle))
if angle > 5 and angle < 85: # Check if the direction indicates a rotated (watermark) block
if coordinates_text in potential_watermarks:
potential_watermarks[coordinates_text] += 1
else:
potential_watermarks[coordinates_text] = 1
if angle > 85 and angle < 105: # Check if direction is vertical
if coordinates_text in potential_margin_notes:
potential_margin_notes[coordinates_text] += 1 # Increment count
else:
potential_margin_notes[coordinates_text] = 1 # Initialize count
# Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
watermark_threshold = len(result_dict) // 2
watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
# Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
margin_note_threshold = len(result_dict) // 2
margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
# Add watermark information to the result dictionary
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_id, block_data in blocks.items():
coordinates_text = (block_data["bbox"], block_data["text"])
if coordinates_text in watermarks:
block_data["is_watermark"] = 1
else:
block_data["is_watermark"] = 0
if coordinates_text in margin_notes:
block_data["is_vertical_margin_note"] = 1
else:
block_data["is_vertical_margin_note"] = 0
return result_dict
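# Illustrative note: the "dir" vector from pymupdf maps to an angle via
# abs(math.degrees(math.atan2(dir[1], dir[0]))); e.g. dir=(0.707, 0.707) -> 45 degrees
# (watermark candidate), dir=(0.0, 1.0) -> 90 degrees (vertical margin-note candidate).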
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import string, re
def __is_a_word(sentence):
# a single Chinese character returns True
if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
return True
# a short alphanumeric token (at most 2 characters) also counts as a single word
elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <=2:
return True
else:
return False
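# Illustrative sketch of __is_a_word: single Chinese characters and short alphanumeric
# tokens count as "a word"; longer tokens do not.
def _demo_is_a_word():
    return [__is_a_word(s) for s in ["中", "ab", "A1", "abc", "hello"]]
    # expected: [True, True, True, False, False]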
def __get_text_color(num):
"""获取字体的颜色RGB值"""
blue = num & 255
green = (num >> 8) & 255
red = (num >> 16) & 255
return red, green, blue
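# Illustrative sketch: this helper assumes the span color integer is packed as 0xRRGGBB,
# so 0xFF0000 decodes to red and 0xFFFFFF to white.
def _demo_get_text_color():
    return __get_text_color(0xFF0000), __get_text_color(0xFFFFFF)
    # expected: ((255, 0, 0), (255, 255, 255))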
def __is_empty_side_box(text_block):
"""
是否是边缘上的空白没有任何内容的block
"""
for line in text_block['lines']:
for span in line['spans']:
font_color = span['color']
r,g,b = __get_text_color(font_color)
if len(span['text'].strip())>0 and (r,g,b)!=(255,255,255):
return False
return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
"""
返回删除了垂直,水印,旋转的textblock
删除的内容打上tag返回
"""
removed_text_block = []
for i, block in enumerate(pymu_text_block): # format: see test/assets/papre/pymu_textblocks.json
lines = block['lines']
block_bbox = block['bbox']
if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # only consider boxes on the page margins
continue
if all([__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans'])>0]) and len(lines)>1 and all([len(line['spans'])==1 for line in lines]):
is_box_valign = (len(set([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0]))==1) and (len([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0])>1) # check whether the lines all share the same x0, i.e. the characters are stacked vertically, and there are at least 2 of them
if is_box_valign:
block['tag'] = "vertical-text"
removed_text_block.append(block)
continue
for line in lines:
if line['dir']!=(1,0):
block['tag'] = "rotate"
removed_text_block.append(block) # if any line is not dir=(1,0), drop the whole block
break
for block in removed_text_block:
pymu_text_block.remove(block)
return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
"""
根据rotate_bbox,返回页面的左右正文边界
"""
left_x = 0
right_x = page_width
for x in rotate_bbox:
box = x['bbox']
if box[2]<page_width/2:
left_x = max(left_x, box[2])
else:
right_x = min(right_x, box[0])
return left_x+1, right_x-1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
"""
删除页面两侧的空白block
"""
removed_text_block = []
for i, block in enumerate(pymu_text_block): # format: see test/assets/papre/pymu_textblocks.json
block_bbox = block['bbox']
if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # only consider boxes on the page margins
continue
if __is_empty_side_box(block):
block['tag'] = "empty-side-block"
removed_text_block.append(block)
continue
for block in removed_text_block:
pymu_text_block.remove(block)
return pymu_text_block, removed_text_block
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字blcok上的图片bbox
"""
from libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap, calculate_iou, calculate_overlap_area_2_minbox_area_ratio
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
"""
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
当下采用一种粗暴的方式:
1. 去掉图片上的公式
2. 去掉table上的公式
2. 图片和文字block部分重叠,首先丢弃图片
3. 图片和图片重叠,修改图片的bbox,使得图片不重叠(暂时没这么做,先把图片都扔掉)
4. 去掉文字bbox里位于图片、表格上的文字(一定要完全在图、表内部)
5. 去掉表格上的文字
"""
text_block_removed = []
images_backup = []
# drop text blocks that lie on images
for image_box in images:
for text_block in text_raw_blocks:
text_bbox = text_block["bbox"]
if _is_in(text_bbox, image_box):
text_block['tag'] = "on-image"
text_block_removed.append(text_block)
# drop text blocks that lie on tables
for table_box in tables:
for text_block in text_raw_blocks:
text_bbox = text_block["bbox"]
if _is_in(text_bbox, table_box):
text_block['tag'] = "on-table"
text_block_removed.append(text_block)
for text_block in text_block_removed:
if text_block in text_raw_blocks:
text_raw_blocks.remove(text_block)
# step 1: drop equation boxes that appear on images
temp = []
for image_box in images:
for eq1 in interline_equations:
if _is_in_or_part_overlap(image_box, eq1[:4]):
temp.append(eq1)
for eq2 in inline_equations:
if _is_in_or_part_overlap(image_box, eq2[:4]):
temp.append(eq2)
for eq in temp:
if eq in interline_equations:
interline_equations.remove(eq)
if eq in inline_equations:
inline_equations.remove(eq)
# step 2: drop equation boxes that appear on tables
temp = []
for table_box in tables:
for eq1 in interline_equations:
if _is_in_or_part_overlap(table_box, eq1[:4]):
temp.append(eq1)
for eq2 in inline_equations:
if _is_in_or_part_overlap(table_box, eq2[:4]):
temp.append(eq2)
for eq in temp:
if eq in interline_equations:
interline_equations.remove(eq)
if eq in inline_equations:
inline_equations.remove(eq)
# if an image overlaps text, drop the image
for image_box in images:
for text_block in text_raw_blocks:
text_bbox = text_block["bbox"]
if _is_in_or_part_overlap(image_box, text_bbox):
images_backup.append(image_box)
break
for image_box in images_backup:
images.remove(image_box)
# if two images overlap, exclude both from layout computation for now
images_dup_index = []
for i in range(len(images)):
for j in range(i+1, len(images)):
if _is_in_or_part_overlap(images[i], images[j]):
images_dup_index.append(i)
images_dup_index.append(j)
dup_idx = set(images_dup_index)
for img_id in dup_idx:
images_backup.append(images[img_id])
images[img_id] = None
images = [img for img in images if img is not None]
# if an interline equation overlaps a text block, stash the text block temporarily so it does not affect layout computation; interline equations and text blocks are merged via IOU
# such text blocks are deleted while the interline equation keeps its size
# after the layout is computed, this part is merged back
text_block_removed_2 = []
# for text_block in text_raw_blocks:
# text_bbox = text_block["bbox"]
# for eq in interline_equations:
# ratio = calculate_overlap_area_2_minbox_area_ratio(text_bbox, eq[:4])
# if ratio>0.05:
# text_block['tag'] = "belong-to-interline-equation"
# text_block_removed_2.append(text_block)
# break
# for tb in text_block_removed_2:
# if tb in text_raw_blocks:
# text_raw_blocks.remove(tb)
# text_block_removed = text_block_removed + text_block_removed_2
return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> bool:
"""
检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
因为这种情况大概率发生了公式没有被检测出来。
"""
if len(text_blocks)==0:
return False
page_min_y = 0
page_max_y = max(yy['bbox'][3] for yy in text_blocks)
def __max_y(lst:list):
if len(lst)>0:
return max([item[1] for item in lst])
return page_min_y
def __min_y(lst:list):
if len(lst)>0:
return min([item[3] for item in lst])
return page_max_y
clip_y0 = __max_y(header)
clip_y1 = __min_y(footer)
txt_bboxes = []
for text_block in text_blocks:
bbox = text_block["bbox"]
if bbox[1]>=clip_y0 and bbox[3]<=clip_y1:
txt_bboxes.append(bbox)
for i in range(len(txt_bboxes)):
for j in range(i+1, len(txt_bboxes)):
if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]):
return True
return False
"""
统计处需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
import os
import collections # counting utilities
import re # regular expressions
from libs.commons import fitz # PyMuPDF
import json
from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
## version 2
def get_merged_line(page):
"""
这个函数是为了从pymuPDF中提取出的矢量里筛出水平的横线,并且将断开的线段进行了合并。
:param page :fitz读取的当前页的内容
"""
drawings_bbox = []
drawings_line = []
drawings = page.get_drawings() # extract all vector drawings
for p in drawings:
drawings_bbox.append(p["rect"].irect) # (L, U, R, D)
lines = []
for L, U, R, D in drawings_bbox:
if abs(D - U) <= 3: # keep (nearly) horizontal lines
lines.append((L, U, R, D))
U_groups = []
visited = [False for _ in range(len(lines))]
for i, (L1, U1, R1, D1) in enumerate(lines):
if visited[i] == True:
continue
tmp_g = [(L1, U1, R1, D1)]
for j, (L2, U2, R2, D2) in enumerate(lines):
if i == j:
continue
if visited[j] == True:
continue
if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5: # put lines at the same height into one group
tmp_g.append((L2, U2, R2, D2))
visited[j] = True
U_groups.append(tmp_g)
res = []
for group in U_groups:
group.sort(key = lambda LURD: (LURD[0], LURD[2]))
LL, UU, RR, DD = group[0]
for i, (L1, U1, R1, D1) in enumerate(group):
if (L1 - RR) >= 5:
cur_line = (LL, UU, RR, DD)
res.append(cur_line)
LL = L1
else:
RR = max(RR, R1)
cur_line = (LL, UU, RR, DD)
res.append(cur_line)
return res
def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
"""
:param page :fitz读取的当前页的内容
:param table_bboxes: list类型,每一个元素是一个元祖 (L, U, R, D)
:param include_table_title: 是否将表格的标题也圈进来
:param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题
"""
drawings_lines = get_merged_line(page)
fix_table_bboxes = []
for table in table_bboxes:
(L, U, R, D) = table
fix_table_L = []
fix_table_U = []
fix_table_R = []
fix_table_D = []
width = R - L
width_range = width * 0.1 # only consider lines whose x extent deviates by at most 10% of the table width
height = D - U
height_range = height * 0.1 # only consider lines whose y position deviates by at most 10% of the table height
for line in drawings_lines:
if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # similar width
if (U - height_range) < line[1] < (U + height_range): # upper boundary, within the height tolerance
fix_table_U.append(line[1])
fix_table_L.append(line[0])
fix_table_R.append(line[2])
elif (D - height_range) < line[1] < (D + height_range): # lower boundary, within the height tolerance
fix_table_D.append(line[1])
fix_table_L.append(line[0])
fix_table_R.append(line[2])
if fix_table_U:
U = min(fix_table_U)
if fix_table_D:
D = max(fix_table_D)
if fix_table_L:
L = min(fix_table_L)
if fix_table_R:
R = max(fix_table_R)
if include_table_title: # the table caption should be included
text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] # all text blocks
incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # filter out text that has no horizontal overlap with the table at all (e.g. text from another column)
upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # keep the text blocks above the table's top edge
sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # sort by the distance from the block's bottom edge to the table's top edge, ascending; at equal height, left before right
for idx in range(scan_line_num):
if idx+1 <= len(sorted_filtered_text_blocks):
line_temp = sorted_filtered_text_blocks[idx]['lines']
if line_temp:
text = line_temp[0]['spans'][0]['text'] # take the text of the first span
check_en = re.match('Table', text) # does it start with "Table" (English)?
check_ch = re.match('表', text) # does it start with "表" (Chinese)?
if check_en or check_ch:
if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # guard against producing a negative bbox
U = sorted_filtered_text_blocks[idx]['bbox'][1]
fix_table_bboxes.append([L-2, U-2, R+2, D+2])
return fix_table_bboxes
def __check_table_title_pattern(text):
"""
检查文本段是否是表格的标题
"""
patterns = [r'^table\s\d+']
for pattern in patterns:
match = re.match(pattern, text, re.IGNORECASE)
if match:
return True
else:
return False
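# Illustrative sketch of the caption pattern: only strings starting with "table <number>"
# (case-insensitive) are accepted.
def _demo_check_table_title_pattern():
    return [__check_table_title_pattern(t) for t in
            ["Table 2: Results", "table 10", "Tabular data", "Figure 1"]]
    # expected: [True, True, False, False]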
def fix_table_text_block(pymu_blocks, table_bboxes: list):
"""
调整table, 如果table和上下的text block有相交区域,则将table的上下边界调整到text block的上下边界
例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
"""
for tb in table_bboxes:
(L, U, R, D) = tb
for block in pymu_blocks:
if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。
tb[0] = min(tb[0], block['bbox'][0])
tb[1] = min(tb[1], block['bbox'][1])
tb[2] = max(tb[2], block['bbox'][2])
tb[3] = max(tb[3], block['bbox'][3])
block['_table'] = True # mark it so other tables do not claim this block again
"""如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠"""
if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
block['bbox'] = list(block['bbox'])
if block['bbox'][3] > U:
block['bbox'][3] = U-1
if block['bbox'][1] < D:
block['bbox'][1] = D+1
return table_bboxes
def __get_table_caption_text(text_block):
txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
line_cnt = len(text_block['lines'])
txt = txt.replace("Ž . ", '')
return txt, line_cnt
def include_table_title(pymu_blocks, table_bboxes: list):
"""
把表格的title也包含进来,扩展到table_bbox上
"""
for tb in table_bboxes:
max_find_cnt = 3 # look upward at most 3 times
temp_box = tb.copy()
while max_find_cnt>0:
text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_top:
txt, line_cnt = __get_table_caption_text(text_block_top)
if len(txt.strip())>0:
if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
max_find_cnt = max_find_cnt -1
temp_box[1] = text_block_top['bbox'][1]
continue
else:
break
else:
temp_box[1] = text_block_top['bbox'][1] # keep the width, extend the box upward
max_find_cnt = max_find_cnt - 1
else:
break
max_find_cnt = 3 # then search downward, again at most 3 times
temp_box = tb.copy()
while max_find_cnt>0:
text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_bottom:
txt, line_cnt = __get_table_caption_text(text_block_bottom)
if len(txt.strip())>0:
if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
max_find_cnt = max_find_cnt - 1
temp_box[3] = text_block_bottom['bbox'][3]
continue
else:
break
else:
temp_box[3] = text_block_bottom['bbox'][3]
max_find_cnt = max_find_cnt - 1
else:
break
if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
btn_text, _ = __get_table_caption_text(text_block_bottom)
top_text, _ = __get_table_caption_text(text_block_top)
if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # there is a table caption both above and below
# keep the closer one
btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
top_text_distance = tb[1] - text_block_top['bbox'][3]
text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_table_caption'] = True # mark the chosen caption block (top or bottom)
continue
# otherwise fall back to checking the top block first, then the bottom block
text_block = text_block_top
if text_block and text_block.get("_table_caption", False) is False:
first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_table_caption'] = True
continue
text_block = text_block_bottom
if text_block and text_block.get("_table_caption", False) is False:
first_text_line, _ = __get_table_caption_text(text_block)
if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_table_caption'] = True
continue
"""向左、向右寻找,暂时只寻找一次"""
left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
if left_text_block and left_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_table_caption_text(left_text_block)
if __check_table_title_pattern(first_text_line):
tb[0] = min(tb[0], left_text_block['bbox'][0])
tb[1] = min(tb[1], left_text_block['bbox'][1])
tb[2] = max(tb[2], left_text_block['bbox'][2])
tb[3] = max(tb[3], left_text_block['bbox'][3])
left_text_block['_image_caption'] = True
continue
right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
if right_text_block and right_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_table_caption_text(right_text_block)
if __check_table_title_pattern(first_text_line):
tb[0] = min(tb[0], right_text_block['bbox'][0])
tb[1] = min(tb[1], right_text_block['bbox'][1])
tb[2] = max(tb[2], right_text_block['bbox'][2])
tb[3] = max(tb[3], right_text_block['bbox'][3])
right_text_block['_image_caption'] = True
continue
return table_bboxes
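# A minimal, illustrative wiring of the passes above (it assumes `page` is a
# fitz.Page and `table_bboxes` already holds rough [L, U, R, D] candidates;
# nothing here is prescribed by the module itself):
#
#   pymu_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
#   table_bboxes = fix_table_text_block(pymu_blocks, table_bboxes)
#   table_bboxes = include_table_title(pymu_blocks, table_bboxes)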
import os
import sys
from pathlib import Path
import click
import json
from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
from mkcontent import mk_nlp_markdown
from pdf2md import main
from pdf_parse_by_model import parse_pdf_by_model
@click.command()
@click.option("--pdf-file-path", help="s3上pdf文件的路径")
@click.option("--pdf-name", help="pdf name")
def main_shell(pdf_file_path: str, pdf_name: str):
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f)
for sample in samples:
pdf_file_path = sample['s3_path']
pdf_bin_file_profile = "outsider"
pdf_name = sample['pdf_name']
pdf_model_dir = f"s3://llm-pdf-text/eval_1k/layout_res/{pdf_name}"
pdf_model_profile = "langchao"
p = Path(pdf_file_path)
pdf_file_name = p.name # pdf file name, including the extension
#pdf_model_dir = join_path(pdf_model_parent_dir, pdf_file_name)
main(
pdf_file_path,
pdf_bin_file_profile,
pdf_model_dir,
pdf_model_profile,
debug_mode=True,
)
if __name__ == "__main__":
main_shell()
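# Note on usage (as the script stands above): although --pdf-file-path and
# --pdf-name are declared as CLI options, main_shell overwrites both from the
# hardcoded sample JSON, so the options can simply be omitted:
#
#   python <this_script>.py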
# from app.common import s3
import boto3
from botocore.client import Config
from spark import s3_buckets, s3_clusters, get_cluster_name, s3_users
import re
import random
from typing import Dict, Iterator, List, Tuple, Union
__re_s3_path = re.compile("^s3a?://([^/]+)(?:/(.*))?$")
def get_s3_config(path: Union[str, List[str]], outside=False):
paths = [path] if isinstance(path, str) else path
bucket_config = None
for p in paths:
bc = __get_s3_bucket_config(p)
if bucket_config in [bc, None]:
bucket_config = bc
continue
raise Exception(f"{paths} have different s3 config, cannot read together.")
if not bucket_config:
raise Exception("path is empty.")
return __get_s3_config(bucket_config, outside, prefer_ip=True)
def __get_s3_config(
bucket_config: tuple,
outside: bool,
prefer_ip=False,
prefer_auto=False,
):
cluster, user = bucket_config
cluster_config = s3_clusters[cluster]
if outside:
endpoint_key = "outside"
elif prefer_auto and "auto" in cluster_config:
endpoint_key = "auto"
elif cluster_config.get("cluster") == get_cluster_name():
endpoint_key = "inside"
else:
endpoint_key = "outside"
if prefer_ip and f"{endpoint_key}_ips" in cluster_config:
endpoint_key = f"{endpoint_key}_ips"
endpoints = cluster_config[endpoint_key]
endpoint = random.choice(endpoints)
return {"endpoint": endpoint, **s3_users[user]}
def split_s3_path(path: str):
"split bucket and key from path"
m = __re_s3_path.match(path)
if m is None:
return "", ""
return m.group(1), (m.group(2) or "")
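# Illustrative behaviour of the regex above (the paths are placeholders):
#
#   split_s3_path("s3://llm-pdf-text/eval_1k/layout_res/a.pdf")
#       -> ("llm-pdf-text", "eval_1k/layout_res/a.pdf")
#   split_s3_path("s3a://some-bucket")   -> ("some-bucket", "")
#   split_s3_path("not-an-s3-path")      -> ("", "")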
def __get_s3_bucket_config(path: str):
bucket = split_s3_path(path)[0] if path else ""
bucket_config = s3_buckets.get(bucket)
if not bucket_config:
bucket_config = s3_buckets.get("[default]")
assert bucket_config is not None
return bucket_config
def get_s3_client(path: Union[str, List[str]], outside=False):
s3_config = get_s3_config(path, outside)
try:
return boto3.client(
"s3",
aws_access_key_id=s3_config["ak"],
aws_secret_access_key=s3_config["sk"],
endpoint_url=s3_config["endpoint"],
config=Config(s3={"addressing_style": "path"}, retries={"max_attempts": 8, "mode": "standard"}),
)
except Exception:
# older versions of boto3/botocore do not support the retries "mode" param.
return boto3.client(
"s3",
aws_access_key_id=s3_config["ak"],
aws_secret_access_key=s3_config["sk"],
endpoint_url=s3_config["endpoint"],
config=Config(s3={"addressing_style": "path"}, retries={"max_attempts": 8}),
)
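# A minimal usage sketch (bucket and key below are placeholders, and it assumes
# the target bucket is present in the spark s3_buckets config):
#
#   path = "s3://some-bucket/some/key.pdf"
#   client = get_s3_client(path, outside=True)
#   bucket, key = split_s3_path(path)
#   body = client.get_object(Bucket=bucket, Key=key)["Body"].read()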
scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178
scihub/scihub_07400000/libgen.scimag07481000-07481999.zip_10.1007/s003960050343
scihub/scihub_11400000/libgen.scimag11451000-11451999.zip_10.1017/s0009838811000231
scihub/scihub_24400000/libgen.scimag24401000-24401999.zip_10.1016/j.toxicon.2014.02.018
scihub/scihub_27400000/libgen.scimag27441000-27441999.zip_10.2307/30122482
scihub/scihub_28400000/libgen.scimag28413000-28413999.zip_10.2307/1316224
scihub/scihub_31200000/libgen.scimag31207000-31207999.zip_10.1080/03639040600920622
scihub/scihub_31800000/libgen.scimag31824000-31824999.zip_10.1109/med.2012.6265668
scihub/scihub_32500000/libgen.scimag32539000-32539999.zip_10.1080/09540121003721000
scihub/scihub_42500000/libgen.scimag42522000-42522999.zip_10.1016/S1365-6937(15)30162-3
scihub/scihub_45900000/libgen.scimag45914000-45914999.zip_10.1055/s-0030-1256333
scihub/scihub_50900000/libgen.scimag50902000-50902999.zip_10.1007/s12274-016-1035-8
scihub/scihub_63900000/libgen.scimag63921000-63921999.zip_10.1063/1.4938050
scihub/scihub_65800000/libgen.scimag65832000-65832999.zip_10.1016/s0166-4115(08)62165-2
scihub/scihub_67300000/libgen.scimag67369000-67369999.zip_10.1096/fj.201700997R
scihub/scihub_67900000/libgen.scimag67967000-67967999.zip_10.1038/s41598-018-21867-z
scihub/scihub_77400000/libgen.scimag77447000-77447999.zip_10.1016/j.jid.2019.06.094