Unverified commit 61e88cb2, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1086 from myhloli/dev

refactor(txt_spans_extract_v2): optimize span processing and OCR logic
parents 6c4040ac 160624bd
...@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2): ...@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
first_span = first_line['spans'][0] first_span = first_line['spans'][0]
if len(first_span['content']) > 0: if len(first_span['content']) > 0:
span_start_with_num = first_span['content'][0].isdigit() span_start_with_num = first_span['content'][0].isdigit()
span_start_with_big_char = first_span['content'][0].isupper()
if ( if (
abs(block2['bbox_fs'][2] - last_line['bbox'][2]) # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
< line_height abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
# 上一个block的最后一个span不是以特定符号结尾
and not last_span['content'].endswith(LINE_STOP_FLAG) and not last_span['content'].endswith(LINE_STOP_FLAG)
# 两个block宽度差距超过2倍也不合并 # 两个block宽度差距超过2倍也不合并
and abs(block1_weight - block2_weight) < min_block_weight and abs(block1_weight - block2_weight) < min_block_weight
# 下一个block的第一个字符是数字
and not span_start_with_num and not span_start_with_num
# 下一个block的第一个字符是大写字母
and not span_start_with_big_char
): ):
if block1['page_num'] != block2['page_num']: if block1['page_num'] != block2['page_num']:
for line in block1['lines']: for line in block1['lines']:
......
...@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str): ...@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
def chars_to_content(span): def chars_to_content(span):
# # 先给chars按char['bbox']的x坐标排序 # 检查span中的char是否为空
# span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0]) if len(span['chars']) == 0:
span['content'] = ''
else:
# 先给chars按char['bbox']的中心点的x坐标排序 # 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2) span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
content = ''
# 求char的平均宽度 # 求char的平均宽度
if len(span['chars']) == 0:
span['content'] = content
del span['chars']
return
else:
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']]) char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
char_avg_width = char_width_sum / len(span['chars']) char_avg_width = char_width_sum / len(span['chars'])
content = ''
for char in span['chars']: for char in span['chars']:
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格 # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width: if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
content += ' ' content += ' '
content += char['c'] content += char['c']
span['content'] = __replace_STX_ETX(content) span['content'] = __replace_STX_ETX(content)
del span['chars'] del span['chars']
...@@ -128,8 +125,13 @@ def fill_char_in_spans(spans, all_chars): ...@@ -128,8 +125,13 @@ def fill_char_in_spans(spans, all_chars):
span['chars'].append(char) span['chars'].append(char)
break break
empty_spans = []
for span in spans: for span in spans:
chars_to_content(span) chars_to_content(span)
if len(span['content']) == 0:
empty_spans.append(span)
return empty_spans
# 使用鲁棒性更强的中心点坐标判断 # 使用鲁棒性更强的中心点坐标判断
...@@ -162,48 +164,37 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -162,48 +164,37 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = []
for block in text_blocks:
for line in block['lines']:
for span in line['spans']:
all_pymu_chars.extend(span['chars'])
useful_spans = [] useful_spans = []
unuseful_spans = [] unuseful_spans = []
for span in spans: for span in spans:
for block in all_bboxes: for block in all_bboxes + all_discarded_blocks:
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]: if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue continue
else:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if block in all_bboxes:
useful_spans.append(span) useful_spans.append(span)
break else:
for block in all_discarded_blocks:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
unuseful_spans.append(span) unuseful_spans.append(span)
break break
text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = []
for block in text_blocks:
for line in block['lines']:
for span in line['spans']:
all_pymu_chars.extend(span['chars'])
new_spans = [] new_spans = []
for span in useful_spans: for span in useful_spans + unuseful_spans:
if span['type'] in [ContentType.Text]:
span['chars'] = []
new_spans.append(span)
for span in unuseful_spans:
if span['type'] in [ContentType.Text]: if span['type'] in [ContentType.Text]:
span['chars'] = [] span['chars'] = []
new_spans.append(span) new_spans.append(span)
fill_char_in_spans(new_spans, all_pymu_chars) empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
empty_spans = []
for span in new_spans:
if len(span['content']) == 0:
empty_spans.append(span)
if len(empty_spans) > 0: if len(empty_spans) > 0:
# 初始化ocr模型 # 初始化ocr模型
...@@ -216,18 +207,16 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -216,18 +207,16 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
) )
for span in empty_spans: for span in empty_spans:
spans.remove(span) # 对span的bbox截图再ocr
# 对span的bbox截图
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2") span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
ocr_res = ocr_model.ocr(span_img, det=False) ocr_res = ocr_model.ocr(span_img, det=False)
# logger.info(f"ocr_res: {ocr_res}")
# logger.info(f"empty_span: {span}")
if ocr_res and len(ocr_res) > 0: if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0: if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0] ocr_text, ocr_score = ocr_res[0][0]
if ocr_score > 0.5 and len(ocr_text) > 0: if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text span['content'] = ocr_text
spans.append(span) else:
spans.remove(span)
return spans return spans
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment