"eigen-master/bench/check_cache_queries.cpp" did not exist on "e7df86554156b36846008d8ddbcc4d8521a16554"
Commit 99192002 authored by myhloli
Browse files

refactor: improve character span calculation and sorting logic

parent 546be00a
......@@ -344,7 +344,8 @@ def fill_char_in_spans(spans, all_chars):
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
Span_Height_Radio = 0.33 # 字符的中轴和span的中轴高度差不能超过1/3span高度
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=Span_Height_Radio):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
......@@ -353,7 +354,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
if (
span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过Span_Height_Radio
):
return True
else:
......@@ -385,7 +386,10 @@ def chars_to_content(span):
pass
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
# span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
# 给chars按char_idx排序
span['chars'] = sorted(span['chars'], key=lambda x: x['char_idx'])
# Calculate the width of each character
char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
......@@ -393,7 +397,7 @@ def chars_to_content(span):
median_width = statistics.median(char_widths)
# 通过x轴重叠比率移除一部分char
span = remove_x_overlapping_chars(span, median_width)
# span = remove_x_overlapping_chars(span, median_width)
content = ''
for char in span['chars']:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment