Unverified Commit 61e88cb2 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1086 from myhloli/dev

refactor(txt_spans_extract_v2): optimize span processing and OCR logic
parents 6c4040ac 160624bd
......@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
first_span = first_line['spans'][0]
if len(first_span['content']) > 0:
span_start_with_num = first_span['content'][0].isdigit()
span_start_with_big_char = first_span['content'][0].isupper()
if (
abs(block2['bbox_fs'][2] - last_line['bbox'][2])
< line_height
# 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
# 上一个block的最后一个span不是以特定符号结尾
and not last_span['content'].endswith(LINE_STOP_FLAG)
# 两个block宽度差距超过2倍也不合并
and abs(block1_weight - block2_weight) < min_block_weight
# 下一个block的第一个字符是数字
and not span_start_with_num
# 下一个block的第一个字符是大写字母
and not span_start_with_big_char
):
if block1['page_num'] != block2['page_num']:
for line in block1['lines']:
......
......@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
def chars_to_content(span):
# # 先给chars按char['bbox']的x坐标排序
# span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
# 检查span中的char是否为空
if len(span['chars']) == 0:
span['content'] = ''
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
content = ''
# 求char的平均宽度
if len(span['chars']) == 0:
span['content'] = content
del span['chars']
return
else:
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
char_avg_width = char_width_sum / len(span['chars'])
content = ''
for char in span['chars']:
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
content += ' '
content += char['c']
span['content'] = __replace_STX_ETX(content)
del span['chars']
......@@ -128,8 +125,13 @@ def fill_char_in_spans(spans, all_chars):
span['chars'].append(char)
break
empty_spans = []
for span in spans:
chars_to_content(span)
if len(span['content']) == 0:
empty_spans.append(span)
return empty_spans
# 使用鲁棒性更强的中心点坐标判断
......@@ -162,48 +164,37 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = []
for block in text_blocks:
for line in block['lines']:
for span in line['spans']:
all_pymu_chars.extend(span['chars'])
useful_spans = []
unuseful_spans = []
for span in spans:
for block in all_bboxes:
for block in all_bboxes + all_discarded_blocks:
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue
else:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if block in all_bboxes:
useful_spans.append(span)
break
for block in all_discarded_blocks:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
else:
unuseful_spans.append(span)
break
text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = []
for block in text_blocks:
for line in block['lines']:
for span in line['spans']:
all_pymu_chars.extend(span['chars'])
new_spans = []
for span in useful_spans:
if span['type'] in [ContentType.Text]:
span['chars'] = []
new_spans.append(span)
for span in unuseful_spans:
for span in useful_spans + unuseful_spans:
if span['type'] in [ContentType.Text]:
span['chars'] = []
new_spans.append(span)
fill_char_in_spans(new_spans, all_pymu_chars)
empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
empty_spans = []
for span in new_spans:
if len(span['content']) == 0:
empty_spans.append(span)
if len(empty_spans) > 0:
# 初始化ocr模型
......@@ -216,18 +207,16 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
)
for span in empty_spans:
spans.remove(span)
# 对span的bbox截图
# 对span的bbox截图再ocr
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
ocr_res = ocr_model.ocr(span_img, det=False)
# logger.info(f"ocr_res: {ocr_res}")
# logger.info(f"empty_span: {span}")
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0]
if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text
spans.append(span)
else:
spans.remove(span)
return spans
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment