and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
):
new_line["bbox"] = (
min(new_line["bbox"][0], raw_line_bbox[0]), # left
new_line["bbox"][1], # top
max(new_line["bbox"][2], raw_line_bbox[2]), # right
raw_line_bbox[3], # bottom
)
new_line["text"] += " " + raw_line_text
new_line["spans"].extend(raw_line_spans)
new_line["dir"] = (
new_line["dir"][0] + raw_line_dir[0],
new_line["dir"][1] + raw_line_dir[1],
)
else:
new_lines.append(new_line)
new_line = {
"bbox": raw_line_bbox,
"text": raw_line_text,
"dir": raw_line_dir if raw_line_dir else (0, 0),
"spans": raw_line_spans,
}
if new_line:
new_lines.append(new_line)
return new_lines
def __make_new_block(self, raw_block):
    """
    Build a normalized block dict from a single raw block.

    Parameters
    ----------
    raw_block : dict
        Raw block. The keys "number", "bbox" and "lines" are read; every
        entry of "lines" must provide a "spans" list whose items carry a
        "text" key.

    Returns
    -------
    new_block : dict
        Schema of new_block:
        {
            "block_id": raw_block["number"],  # passed through unchanged
            "bbox": raw_block["bbox"],        # passed through unchanged
            "text": "...",                    # every span text joined by " "
            "lines": [...],                   # result of __make_new_lines
        }
    """
    number = raw_block["number"]
    bbox = raw_block["bbox"]
    source_lines = raw_block["lines"]

    # Collect the text from the raw spans before the lines are handed to
    # __make_new_lines, so the text always reflects the raw input.
    joined_text = " ".join(
        span["text"] for line in source_lines for span in line["spans"]
    )

    return {
        "block_id": number,
        "bbox": bbox,
        "text": joined_text,
        "lines": self.__make_new_lines(source_lines),
    }
def batch_process_blocks(self, pdf_dic):
    """
    Rebuild the "para_blocks" list of every page entry, in place.

    Parameters
    ----------
    pdf_dic : dict
        Mapping of keys to per-page dicts. Only keys that start with
        "page_" are processed; every other entry is left untouched.
        For each page, the items of its "para_blocks" list (if present)
        are converted with __make_new_block.
        NOTE(review): the previous docstring pointed to the
        "preproc_blocks" value of
        app/pdf_toolbox/tests/preproc_2_parasplit_example.json as a
        schema example for the raw blocks - confirm it is still valid.

    Returns
    -------
    pdf_dic : dict
        The same object that was passed in. Every "page_*" entry ends up
        with a "para_blocks" list, which is empty when the page had none.
    """
    for page_key, page_info in pdf_dic.items():
        if not page_key.startswith("page_"):
            continue

        # A page without "para_blocks" still receives an (empty) list.
        source_blocks = page_info["para_blocks"] if "para_blocks" in page_info else []
        page_info["para_blocks"] = [
            self.__make_new_block(source_block) for source_block in source_blocks
        ]

    return pdf_dic
class DocStatisticsCalculator:
def __init__(self) -> None:
pass
def calc_stats_of_doc(self, pdf_dict):
"""
This function computes the statistics of the document
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
statistics : dict
statistics of the document
"""
total_text_length = 0
total_num_blocks = 0
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "para_blocks" in blocks.keys():
para_blocks = blocks["para_blocks"]
for para_block in para_blocks:
total_text_length += len(para_block["text"])
total_num_blocks += 1
avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
is_not_end_with_ending_puncs # not end with ending punctuation marks
and is_not_only_no_meaning_symbols # not only have no meaning symbols
and is_title_by_len # is a title by length, default max length is 200
and not is_equation # an interline equation should never be a title
and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
and (
(is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
or (
is_much_larger_font_than_doc_avg
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
or (
is_font_size_little_less_than_doc_avg
and is_bold_font
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
) # not the same font type as the document average font type, which includes the most common font type and the second most common font type
and (
(
not is_person_or_org_list_line_by_nlp
and (
is_much_larger_font_than_doc_avg
or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
)
)
or (
not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
and not is_a_left_inline_title
and not is_punctuation_heavy
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
or (
is_person_or_org_list_line_by_nlp
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
)
or (is_numbered_title and not is_a_left_inline_title)
)
)
# ) or (is_similar_to_pre_line and prev_line_is_title)
is_name_or_org_list_to_be_removed = (
(is_person_or_org_list_line_by_nlp)
and is_punctuation_heavy
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
) and not is_title
if is_name_or_org_list_to_be_removed:
is_author_or_org_list = True
# print curr_line_text to check
# print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
else:
is_author_or_org_list = False
"""
# print reason why the line is a title
if is_title:
print_green("This line is a title.")
print_green("↓" * 10)
print()
print("curr_line_text: ", curr_line_text)
print()
# print reason why the line is not a title
line_text = curr_line_text.strip()
test_text = "Career/Personal Life"
text_content_condition = line_text == test_text
if not is_title and text_content_condition: # Print specific line
is_not_end_with_ending_puncs # not end with ending punctuation marks
and is_not_only_no_meaning_symbols # not only have no meaning symbols
and is_title_by_len # is a title by length, default max length is 200
and not is_equation # an interline equation should never be a title
and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
and (
(is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
or (
is_much_larger_font_than_doc_avg
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
or (
is_font_size_little_less_than_doc_avg
and is_bold_font
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
) # Consider the following situations: bold font, much larger font than doc avg, not same font type as doc avg, sufficient spacing above and below
and (
(
not is_person_or_org_list_line_by_nlp
and (
is_much_larger_font_than_doc_avg
or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
)
)
or (
not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
and not is_a_left_inline_title
and not is_punctuation_heavy
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
)
or (
is_person_or_org_list_line_by_nlp
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
)
or (is_numbered_title and not is_a_left_inline_title)
) # Exclude the following situations: person/org list
)
# ) or (prev_line_is_title and is_consis_sub_title)
is_name_or_org_list_to_be_removed = (
(is_person_or_org_list_line_by_nlp)
and is_punctuation_heavy
and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
) and not is_title
if is_name_or_org_list_to_be_removed:
is_author_or_org_list = True
else:
is_author_or_org_list = False
# return is_title, is_author_or_org_list
"""
# print reason why the line is a title
if is_title:
print_green("This line is a title.")
print_green("↓" * 10)
print()
print("curr_line_text: ", curr_line_text)
print()
# print reason why the line is not a title
line_text = curr_line_text.strip()
test_text = "Career/Personal Life"
text_content_condition = line_text == test_text
if not is_title and text_content_condition: # Print specific line
This function processes the paragraphs, including:
1. Read raw input json file into pdf_dic
2. Detect and replace equations
3. Combine spans into a natural line
4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
5. Compute statistics for each block
6. Detect titles in the document
7. Detect paragraphs inside each block
8. Divide the level of the titles
9. Detect and combine paragraphs from different blocks into one paragraph
10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
11. Draw annotations on the pdf file
Parameters
----------
pdf_dic_json_fpath : str
path to the pdf dictionary json file.
Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
input_pdf_doc : str
path to the input pdf file
output_pdf_path : str
path to the output pdf file
Returns
-------
pdf_dict : dict
result dictionary
"""
error_info = None
output_json_file = ""
output_dir = ""
if input_pdf_path is not None:
input_pdf_path = os.path.abspath(input_pdf_path)
# print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]):
if text_bbox[1] < image_bbox[1]:#在图片上方
image_bbox[1] = text_bbox[3]+1
elif text_bbox[3]>image_bbox[3]:#在图片下方
image_bbox[3] = text_bbox[1]-1
return image_bboxes
def __merge_if_common_edge(bbox1, bbox2):
x_min_1, y_min_1, x_max_1, y_max_1 = bbox1
x_min_2, y_min_2, x_max_2, y_max_2 = bbox2
# 检查是否有公共的水平边
if y_min_1 == y_min_2 or y_max_1 == y_max_2:
# 确保一个框的x范围在另一个框的x范围内
if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2):
if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
max_find_cnt = max_find_cnt - 1
temp_box[1] = text_block_top['bbox'][1]
continue
else:
break
else:
b = text_block_top['bbox']
temp_box[1] = b[1] # 宽度不变,扩大
max_find_cnt = max_find_cnt - 1
else:
break
if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # 将与表格完全没有任何遮挡的文字筛除掉(比如另一栏的文字)
upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # 将在表格线以上的text block筛选出来
if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。
if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
max_find_cnt = max_find_cnt - 1
temp_box[3] = text_block_bottom['bbox'][3]
continue
else:
break
else:
temp_box[3] = text_block_bottom['bbox'][3]
max_find_cnt = max_find_cnt - 1
else:
break
if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :