Commit ba6c17a9 authored by myhloli
Browse files

feat(pdf_parse): remove tilted lines for better text extraction

- Add remove_tilted_line function to filter out lines with angles between 2 and 88 degrees
- Integrate the new function into the text extraction process
- Improve the accuracy of text block processing by removing non-horizontal/vertical lines
parent 48a43370
import copy import copy
import math
import os import os
import re import re
import statistics import statistics
...@@ -173,6 +174,21 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): ...@@ -173,6 +174,21 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
return False return False
def remove_tilted_line(text_blocks):
    """Remove, in place, every line that is neither horizontal nor vertical.

    Each line's ``'dir'`` entry is unpacked as a ``(cosine, sine)`` pair and
    converted to an angle in degrees. A line is kept only when that angle is
    within 2 degrees of one of the axes (0, 90, 180 or -90 degrees); all
    other lines are dropped from their block's ``'lines'`` list.

    Args:
        text_blocks: Iterable of block dicts, each with a ``'lines'`` list
            whose items are dicts carrying a ``'dir'`` pair.

    Returns:
        None. Each ``block['lines']`` list object is mutated in place.
    """
    # Deviation (in degrees) from an axis that still counts as straight.
    tolerance = 2
    for block in text_blocks:
        kept_lines = []
        for line in block['lines']:
            cosine, sine = line['dir']
            # Convert the direction vector to an angle in degrees.
            angle_degrees = math.degrees(math.atan2(sine, cosine))
            # atan2 yields (-180, 180]; fold onto [0, 90) so tilts in every
            # quadrant (e.g. 135 or -135 degrees) are detected, not only
            # those between 0 and 90 degrees.
            folded = abs(angle_degrees) % 90
            if tolerance < folded < 90 - tolerance:
                continue
            kept_lines.append(line)
        # Slice-assign so the original list object is kept, and do it in a
        # single pass instead of one equality-based list.remove() per line.
        block['lines'][:] = kept_lines
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
# cid用0xfffd表示,连字符拆开 # cid用0xfffd表示,连字符拆开
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
...@@ -183,6 +199,10 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -183,6 +199,10 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
# 自定义flags出现较多0xfffd,可能是pymupdf可以自行处理内置字典的pdf,不再使用 # 自定义flags出现较多0xfffd,可能是pymupdf可以自行处理内置字典的pdf,不再使用
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] # text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# 移除所有角度不为0或90的line
remove_tilted_line(text_blocks_raw)
all_pymu_chars = [] all_pymu_chars = []
for block in text_blocks_raw: for block in text_blocks_raw:
for line in block['lines']: for line in block['lines']:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment