Unverified Commit 68c45530 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1140 from myhloli/dev

refactor(pdf_parse): adjust character-axis alignment algorithm
parents 086b48b7 d4345b6e
...@@ -117,7 +117,7 @@ def fill_char_in_spans(spans, all_chars): ...@@ -117,7 +117,7 @@ def fill_char_in_spans(spans, all_chars):
# 使用鲁棒性更强的中心点坐标判断 # 使用鲁棒性更强的中心点坐标判断
def calculate_char_in_span(char_bbox, span_bbox, char): def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2 char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2 char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2 span_center_y = (span_bbox[1] + span_bbox[3]) / 2
...@@ -126,7 +126,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char): ...@@ -126,7 +126,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
if ( if (
span_bbox[0] < char_center_x < span_bbox[2] span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3] and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度 and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
): ):
return True return True
else: else:
...@@ -137,7 +137,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char): ...@@ -137,7 +137,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2] (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
and char_center_x > span_bbox[0] and char_center_x > span_bbox[0]
and span_bbox[1] < char_center_y < span_bbox[3] and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 and abs(char_center_y - span_center_y) < span_height * span_height_radio
): ):
return True return True
elif char in LINE_START_FLAG: elif char in LINE_START_FLAG:
...@@ -145,7 +145,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char): ...@@ -145,7 +145,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height) span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
and char_center_x < span_bbox[2] and char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3] and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 and abs(char_center_y - span_center_y) < span_height * span_height_radio
): ):
return True return True
else: else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment