Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
5ecafbfa
Unverified
Commit
5ecafbfa
authored
Nov 28, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 28, 2024
Browse files
Merge pull request #1134 from myhloli/dev
refactor(para): improve language detection and block splitting
parents
e22fa18b
f674b8d4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
5 deletions
+14
-5
magic_pdf/para/para_split_v3.py
magic_pdf/para/para_split_v3.py
+14
-5
No files found.
magic_pdf/para/para_split_v3.py
View file @
5ecafbfa
import
copy
from
loguru
import
logger
from
magic_pdf.config.constants
import
CROSS_PAGE
,
LINES_DELETED
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.libs.language
import
detect_lang
LINE_STOP_FLAG
=
(
'.'
,
...
...
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
# 添加所有文本,包括空行,保持与block['lines']长度一致
lines_text_list
.
append
(
line_text
)
block_text
=
''
.
join
(
lines_text_list
)
block_lang
=
detect_lang
(
block_text
)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if
abs
(
block
[
'bbox_fs'
][
0
]
-
line
[
'bbox'
][
0
])
<
line_height
/
2
:
...
...
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
if
abs
(
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
])
<
line_height
:
right_close_num
+=
1
else
:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些,block窄的阈值要大
if
block_weight_radio
>=
0.5
:
# 类中文没有超长单词的情况,可以用统一的阈值
if
block_lang
in
[
'zh'
,
'ja'
,
'ko'
]:
closed_area
=
0.26
*
block_weight
else
:
closed_area
=
0.36
*
block_weight
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些,block窄的阈值要大
if
block_weight_radio
>=
0.5
:
closed_area
=
0.26
*
block_weight
else
:
closed_area
=
0.36
*
block_weight
if
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
]
>
closed_area
:
right_not_close_num
+=
1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment