Commit 99192002 authored by myhloli
Browse files

refactor: improve character span calculation and sorting logic

parent 546be00a
...@@ -344,7 +344,8 @@ def fill_char_in_spans(spans, all_chars): ...@@ -344,7 +344,8 @@ def fill_char_in_spans(spans, all_chars):
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',) LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',) LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): Span_Height_Radio = 0.33 # 字符的中轴和span的中轴高度差不能超过1/3span高度
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=Span_Height_Radio):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2 char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2 char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2 span_center_y = (span_bbox[1] + span_bbox[3]) / 2
...@@ -353,7 +354,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): ...@@ -353,7 +354,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
if ( if (
span_bbox[0] < char_center_x < span_bbox[2] span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3] and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度 and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过Span_Height_Radio
): ):
return True return True
else: else:
...@@ -385,7 +386,10 @@ def chars_to_content(span): ...@@ -385,7 +386,10 @@ def chars_to_content(span):
pass pass
else: else:
# 先给chars按char['bbox']的中心点的x坐标排序 # 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2) # span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
# 给chars按char_idx排序
span['chars'] = sorted(span['chars'], key=lambda x: x['char_idx'])
# Calculate the width of each character # Calculate the width of each character
char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']] char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
...@@ -393,7 +397,7 @@ def chars_to_content(span): ...@@ -393,7 +397,7 @@ def chars_to_content(span):
median_width = statistics.median(char_widths) median_width = statistics.median(char_widths)
# 通过x轴重叠比率移除一部分char # 通过x轴重叠比率移除一部分char
span = remove_x_overlapping_chars(span, median_width) # span = remove_x_overlapping_chars(span, median_width)
content = '' content = ''
for char in span['chars']: for char in span['chars']:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment