"vscode:/vscode.git/clone" did not exist on "190527f4e6aa4b2ad02902e26409664aea3ced8c"
Commit 034c59a8 authored by myhloli
Browse files

refactor(txt_spans_extract_v2): optimize span processing and OCR logic

- Merge useful_spans and unuseful_spans handling
- Simplify overlap ratio calculation and block type checking
- Remove unnecessary span removal and re-addition
parent 0d3ef89f
...@@ -128,8 +128,13 @@ def fill_char_in_spans(spans, all_chars): ...@@ -128,8 +128,13 @@ def fill_char_in_spans(spans, all_chars):
span['chars'].append(char) span['chars'].append(char)
break break
empty_spans = []
for span in spans: for span in spans:
chars_to_content(span) chars_to_content(span)
if len(span['content']) == 0:
empty_spans.append(span)
return empty_spans
# 使用鲁棒性更强的中心点坐标判断 # 使用鲁棒性更强的中心点坐标判断
...@@ -162,21 +167,6 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -162,21 +167,6 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
useful_spans = []
unuseful_spans = []
for span in spans:
for block in all_bboxes:
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue
else:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
useful_spans.append(span)
break
for block in all_discarded_blocks:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
unuseful_spans.append(span)
break
text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍 # @todo: 拿到char之后把倾斜角度较大的先删一遍
...@@ -186,24 +176,29 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -186,24 +176,29 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
for span in line['spans']: for span in line['spans']:
all_pymu_chars.extend(span['chars']) all_pymu_chars.extend(span['chars'])
new_spans = [] useful_spans = []
unuseful_spans = []
for span in spans:
for block in all_bboxes + all_discarded_blocks:
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue
overlap_ratio = calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4])
if overlap_ratio > 0.5:
if block in all_bboxes:
useful_spans.append(span)
else:
unuseful_spans.append(span)
break
for span in useful_spans: new_spans = []
if span['type'] in [ContentType.Text]:
span['chars'] = []
new_spans.append(span)
for span in unuseful_spans: for span in useful_spans + unuseful_spans:
if span['type'] in [ContentType.Text]: if span['type'] in [ContentType.Text]:
span['chars'] = [] span['chars'] = []
new_spans.append(span) new_spans.append(span)
fill_char_in_spans(new_spans, all_pymu_chars) empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
empty_spans = []
for span in new_spans:
if len(span['content']) == 0:
empty_spans.append(span)
if len(empty_spans) > 0: if len(empty_spans) > 0:
# 初始化ocr模型 # 初始化ocr模型
...@@ -216,18 +211,14 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -216,18 +211,14 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
) )
for span in empty_spans: for span in empty_spans:
spans.remove(span) # 对span的bbox截图再ocr
# 对span的bbox截图
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2") span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
ocr_res = ocr_model.ocr(span_img, det=False) ocr_res = ocr_model.ocr(span_img, det=False)
# logger.info(f"ocr_res: {ocr_res}")
# logger.info(f"empty_span: {span}")
if ocr_res and len(ocr_res) > 0: if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0: if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0] ocr_text, ocr_score = ocr_res[0][0]
if ocr_score > 0.5 and len(ocr_text) > 0: if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text span['content'] = ocr_text
spans.append(span)
return spans return spans
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment