Commit 14656085 authored by myhloli
Browse files

refactor(pdf_parse): improve text content extraction from PDF spans

- Optimize character sorting for accurate text assembly
- Handle empty char scenarios to prevent errors
- Remove unnecessary comments and improve code readability
- Enhance OCR text content handling by removing low-confidence spans
parent 7964ae45
...@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str): ...@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
def chars_to_content(span):
    """Build ``span['content']`` from ``span['chars']``, then drop the chars.

    The characters are ordered by the horizontal centre of their bounding
    box and concatenated. A single space is inserted between two
    neighbouring characters when the gap between them is wider than the
    average character width of the span.

    Args:
        span: dict with a ``'chars'`` list. Each char is a dict with a
            ``'bbox'`` whose indices 0 and 2 are read as the left and right
            x coordinates, and a ``'c'`` entry holding the character text.

    Side effects:
        Sets ``span['content']`` (empty string when there are no chars) and
        deletes ``span['chars']``. Returns ``None``.
    """
    chars = span['chars']
    if not chars:
        # Nothing to assemble; also avoids dividing by zero below.
        span['content'] = ''
    else:
        # Order chars by the x coordinate of their bbox centre.
        chars = sorted(chars, key=lambda c: (c['bbox'][0] + c['bbox'][2]) / 2)

        # Average char width is the threshold for "this gap is a space".
        char_avg_width = sum(c['bbox'][2] - c['bbox'][0] for c in chars) / len(chars)

        pieces = []
        prev_x1 = None  # right edge of the previously emitted char
        for char in chars:
            # Track the previous char explicitly instead of looking it up with
            # list.index(): that was O(n) per char, resolved duplicate chars
            # to the wrong neighbour, and compared the first char to the last.
            if prev_x1 is not None and char['bbox'][0] - prev_x1 > char_avg_width:
                pieces.append(' ')
            pieces.append(char['c'])
            prev_x1 = char['bbox'][2]

        span['content'] = __replace_STX_ETX(''.join(pieces))

    del span['chars']
...@@ -218,6 +215,8 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -218,6 +215,8 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
ocr_text, ocr_score = ocr_res[0][0] ocr_text, ocr_score = ocr_res[0][0]
if ocr_score > 0.5 and len(ocr_text) > 0: if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text span['content'] = ocr_text
else:
spans.remove(span)
return spans return spans
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment