"vscode:/vscode.git/clone" did not exist on "c9169a69e4adcde104c5805e9e6772f3a1bcd2e3"
Unverified Commit 380cb4d9 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1838 from myhloli/dev

refactor(magic_pdf): improve paragraph splitting logic and update dep…
parents 7d99c1f6 842483cc
......@@ -108,30 +108,33 @@ def __is_list_or_index_block(block):
):
multiple_para_flag = True
for line in block['lines']:
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if (
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
):
external_sides_not_close_num += 1
if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1
block_text = ''
for line in block['lines']:
line_text = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
# 添加所有文本,包括空行,保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
for line in block['lines']:
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if (
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
):
external_sides_not_close_num += 1
if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
left_close_num += 1
......
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect>=0.2.3
fast-langdetect>=0.2.3,<0.3.0
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pydantic>=2.7.2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment