"git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "c0d009044af7f643f08cc1373925c6ce7bbd0fce"
Commit 7964ae45 authored by myhloli's avatar myhloli
Browse files

refactor(pdf_parse): improve code readability and maintainability

parent 97bcc8b2
...@@ -89,29 +89,29 @@ def __replace_STX_ETX(text_str: str): ...@@ -89,29 +89,29 @@ def __replace_STX_ETX(text_str: str):
def chars_to_content(span):
    """Collapse ``span['chars']`` into a single ``span['content']`` string.

    The chars are ordered left-to-right by the horizontal centre of their
    bounding box. A space is inserted between two neighbouring chars when
    the gap between them is wider than the average char width of the span.
    The ``'chars'`` key is removed from ``span`` once the content is built.

    Args:
        span: dict with a ``'chars'`` list. Each char is indexed as
            ``char['bbox'][0]`` / ``char['bbox'][2]`` (used here as the left
            and right x coordinates) and ``char['c']`` (the text to append).

    Returns:
        None. ``span`` is modified in place.
    """
    # Sort by the x coordinate of each char's bbox centre.
    chars = sorted(span['chars'], key=lambda ch: (ch['bbox'][0] + ch['bbox'][2]) / 2)

    if not chars:
        # Nothing to join: the empty content is stored as-is.
        span['content'] = ''
        del span['chars']
        return

    # Average char width, used as the threshold for inserting a space.
    char_avg_width = sum(ch['bbox'][2] - ch['bbox'][0] for ch in chars) / len(chars)

    pieces = []
    prev = None
    for ch in chars:
        # Compare against the actual predecessor. Looking it up with
        # list.index() would return the first *equal* dict (wrong for
        # duplicate chars) and would wrap around to the last char for the
        # first element.
        if prev is not None and ch['bbox'][0] - prev['bbox'][2] > char_avg_width:
            pieces.append(' ')
        pieces.append(ch['c'])
        prev = ch

    span['content'] = __replace_STX_ETX(''.join(pieces))
    del span['chars']
# Characters collected under the name LINE_STOP_FLAG; presumably punctuation
# that marks the end of a text line — confirm against the code that uses it.
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment