Commit 949d0867 authored by myhloli
Browse files

feat(pdf_parse): add line start flag detection and optimize line stop flag logic

- Add LINE_START_FLAG tuple to identify starting flags of a line
- Modify calculate_char_in_span function to handle both line start and stop flags
- Remove redundant char_is_line_stop_flag variable and simplify logic
- Improve line flag detection to enhance text extraction accuracy
parent ac888156
...@@ -90,6 +90,9 @@ def chars_to_content(span): ...@@ -90,6 +90,9 @@ def chars_to_content(span):
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',) LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
def fill_char_in_spans(spans, all_chars): def fill_char_in_spans(spans, all_chars):
# 简单从上到下排一下序 # 简单从上到下排一下序
...@@ -97,12 +100,7 @@ def fill_char_in_spans(spans, all_chars): ...@@ -97,12 +100,7 @@ def fill_char_in_spans(spans, all_chars):
for char in all_chars: for char in all_chars:
for span in spans: for span in spans:
# 判断char是否属于LINE_STOP_FLAG if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
if char['c'] in LINE_STOP_FLAG:
char_is_line_stop_flag = True
else:
char_is_line_stop_flag = False
if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
span['chars'].append(char) span['chars'].append(char)
break break
...@@ -119,7 +117,7 @@ def fill_char_in_spans(spans, all_chars): ...@@ -119,7 +117,7 @@ def fill_char_in_spans(spans, all_chars):
# 使用鲁棒性更强的中心点坐标判断 # 使用鲁棒性更强的中心点坐标判断
def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): def calculate_char_in_span(char_bbox, span_bbox, char):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2 char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2 char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2 span_center_y = (span_bbox[1] + span_bbox[3]) / 2
...@@ -134,7 +132,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -134,7 +132,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
else: else:
# 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致) # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
# 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近 # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
if char_is_line_stop_flag: if char in LINE_STOP_FLAG:
if ( if (
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2] (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
and char_center_x > span_bbox[0] and char_center_x > span_bbox[0]
...@@ -142,6 +140,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -142,6 +140,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
and abs(char_center_y - span_center_y) < span_height / 4 and abs(char_center_y - span_center_y) < span_height / 4
): ):
return True return True
elif char in LINE_START_FLAG:
if (
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
and char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4
):
return True
else: else:
return False return False
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment