Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
4359b36f
Commit
4359b36f
authored
Jun 09, 2025
by
myhloli
Browse files
refactor: introduce SplitFlag class and update references in para_split and vlm_magic_model
parent
a29489ef
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
45 additions
and
12 deletions
+45
-12
mineru/backend/pipeline/para_split.py
mineru/backend/pipeline/para_split.py
+5
-7
mineru/backend/vlm/vlm_magic_model.py
mineru/backend/vlm/vlm_magic_model.py
+33
-3
mineru/backend/vlm/vlm_middle_json_mkcontent.py
mineru/backend/vlm/vlm_middle_json_mkcontent.py
+1
-1
mineru/utils/enum_class.py
mineru/utils/enum_class.py
+6
-1
No files found.
mineru/backend/pipeline/para_split.py
View file @
4359b36f
import
copy
import
copy
from
loguru
import
logger
from
loguru
import
logger
from
mineru.utils.enum_class
import
ContentType
,
BlockType
from
mineru.utils.enum_class
import
ContentType
,
BlockType
,
SplitFlag
from
mineru.utils.language
import
detect_lang
from
mineru.utils.language
import
detect_lang
CROSS_PAGE
=
'cross_page'
LINES_DELETED
=
'lines_deleted'
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
)
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
)
LIST_END_FLAG
=
(
'.'
,
'。'
,
';'
,
';'
)
LIST_END_FLAG
=
(
'.'
,
'。'
,
';'
,
';'
)
...
@@ -284,10 +282,10 @@ def __merge_2_text_blocks(block1, block2):
...
@@ -284,10 +282,10 @@ def __merge_2_text_blocks(block1, block2):
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
for
line
in
block1
[
'lines'
]:
for
line
in
block1
[
'lines'
]:
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
span
[
CROSS_PAGE
]
=
True
span
[
SplitFlag
.
CROSS_PAGE
]
=
True
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
block1
[
'lines'
]
=
[]
block1
[
'lines'
]
=
[]
block1
[
LINES_DELETED
]
=
True
block1
[
SplitFlag
.
LINES_DELETED
]
=
True
return
block1
,
block2
return
block1
,
block2
...
@@ -296,10 +294,10 @@ def __merge_2_list_blocks(block1, block2):
...
@@ -296,10 +294,10 @@ def __merge_2_list_blocks(block1, block2):
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
for
line
in
block1
[
'lines'
]:
for
line
in
block1
[
'lines'
]:
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
span
[
CROSS_PAGE
]
=
True
span
[
SplitFlag
.
CROSS_PAGE
]
=
True
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
block1
[
'lines'
]
=
[]
block1
[
'lines'
]
=
[]
block1
[
LINES_DELETED
]
=
True
block1
[
SplitFlag
.
LINES_DELETED
]
=
True
return
block1
,
block2
return
block1
,
block2
...
...
mineru/backend/vlm/vlm_magic_model.py
View file @
4359b36f
...
@@ -2,7 +2,7 @@ import re
...
@@ -2,7 +2,7 @@ import re
from
typing
import
Literal
from
typing
import
Literal
from
mineru.utils.boxbase
import
bbox_distance
,
is_in
from
mineru.utils.boxbase
import
bbox_distance
,
is_in
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.enum_class
import
ContentType
,
BlockType
,
SplitFlag
from
mineru.backend.vlm.vlm_middle_json_mkcontent
import
merge_para_with_text
from
mineru.backend.vlm.vlm_middle_json_mkcontent
import
merge_para_with_text
from
mineru.utils.format_utils
import
convert_otsl_to_html
from
mineru.utils.format_utils
import
convert_otsl_to_html
...
@@ -187,7 +187,7 @@ class MagicModel:
...
@@ -187,7 +187,7 @@ class MagicModel:
return
fix_title_blocks
(
self
.
title_blocks
)
return
fix_title_blocks
(
self
.
title_blocks
)
def
get_text_blocks
(
self
):
def
get_text_blocks
(
self
):
return
self
.
text_blocks
return
fix_text_blocks
(
self
.
text_blocks
)
def
get_interline_equation_blocks
(
self
):
def
get_interline_equation_blocks
(
self
):
return
self
.
interline_equation_blocks
return
self
.
interline_equation_blocks
...
@@ -440,4 +440,34 @@ def count_leading_hashes(text):
...
@@ -440,4 +440,34 @@ def count_leading_hashes(text):
def strip_leading_hashes(text):
    """Return ``text`` without its leading ``#`` run and the whitespace right after it.

    Text that does not start with ``#`` is returned unchanged.
    """
    # Match the leading '#' characters together with the whitespace that follows them.
    prefix = re.match(r'^#+\s*', text)
    if prefix is None:
        return text
    return text[prefix.end():]
\ No newline at end of file
def fix_text_blocks(blocks):
    """Merge text blocks whose content is marked as continued in the next block.

    When the last span of a block's last line has content ending with the
    ``<|txt_contd|>`` marker, the marker is removed and the lines of the next
    block not already flagged ``SplitFlag.LINES_DELETED`` are appended to the
    current block. The absorbed block stays in the list with ``lines == []``
    and ``SplitFlag.LINES_DELETED`` set to ``True``. The current block is then
    examined again, so a chain of continued blocks collapses into the first.

    Blocks without ``"lines"``, lines without ``"spans"``, and spans without a
    string ``"content"`` are left untouched instead of raising.

    Args:
        blocks: list of block dicts, modified in place.

    Returns:
        The same ``blocks`` list object.
    """
    contd_marker = '<|txt_contd|>'
    i = 0
    while i < len(blocks):
        block = blocks[i]
        lines = block.get("lines")
        last_line = lines[-1] if lines else None
        spans = last_line.get("spans") if last_line else None
        last_span = spans[-1] if spans else None
        content = last_span.get('content') if last_span else None

        # Nothing to merge unless the block ends with the continuation marker.
        if not (isinstance(content, str) and content.endswith(contd_marker)):
            i += 1
            continue

        last_span['content'] = content[:-len(contd_marker)]

        # Find the next block whose lines have not already been moved away.
        next_idx = i + 1
        while next_idx < len(blocks) and blocks[next_idx].get(SplitFlag.LINES_DELETED, False):
            next_idx += 1

        # No following block to absorb: the marker is stripped, move on.
        if next_idx >= len(blocks):
            i += 1
            continue

        next_block = blocks[next_idx]
        # Move the next block's lines into the current block.
        lines.extend(next_block.get("lines") or [])
        # Empty the absorbed block and flag it so later passes skip it.
        next_block["lines"] = []
        next_block[SplitFlag.LINES_DELETED] = True
        # Do not advance i: the current block now ends with the absorbed
        # content and may itself be continued again.
    return blocks
\ No newline at end of file
mineru/backend/vlm/vlm_middle_json_mkcontent.py
View file @
4359b36f
...
@@ -28,7 +28,7 @@ def merge_para_with_text(para_block):
...
@@ -28,7 +28,7 @@ def merge_para_with_text(para_block):
content
=
f
"
{
inline_left_delimiter
}{
span
[
'content'
]
}{
inline_right_delimiter
}
"
content
=
f
"
{
inline_left_delimiter
}{
span
[
'content'
]
}{
inline_right_delimiter
}
"
elif
span_type
==
ContentType
.
INTERLINE_EQUATION
:
elif
span_type
==
ContentType
.
INTERLINE_EQUATION
:
content
=
f
"
\n
{
display_left_delimiter
}
\n
{
span
[
'content'
]
}
\n
{
display_right_delimiter
}
\n
"
content
=
f
"
\n
{
display_left_delimiter
}
\n
{
span
[
'content'
]
}
\n
{
display_right_delimiter
}
\n
"
content
=
content
.
strip
()
#
content = content.strip()
if
content
:
if
content
:
if
span_type
in
[
ContentType
.
TEXT
,
ContentType
.
INLINE_EQUATION
]:
if
span_type
in
[
ContentType
.
TEXT
,
ContentType
.
INLINE_EQUATION
]:
if
j
==
len
(
line
[
'spans'
])
-
1
:
if
j
==
len
(
line
[
'spans'
])
-
1
:
...
...
mineru/utils/enum_class.py
View file @
4359b36f
...
@@ -54,4 +54,9 @@ class ModelPath:
...
@@ -54,4 +54,9 @@ class ModelPath:
pytorch_paddle
=
"models/OCR/paddleocr_torch"
pytorch_paddle
=
"models/OCR/paddleocr_torch"
layout_reader
=
"models/ReadingOrder/layout_reader"
layout_reader
=
"models/ReadingOrder/layout_reader"
vlm_root_hf
=
"opendatalab/MinerU-VLM-1.0"
vlm_root_hf
=
"opendatalab/MinerU-VLM-1.0"
vlm_root_modelscope
=
"OpenDataLab/MinerU-VLM-1.0"
vlm_root_modelscope
=
"OpenDataLab/MinerU-VLM-1.0"
\ No newline at end of file
class SplitFlag:
    """Names of the marker keys written onto spans and blocks when blocks are merged."""

    # Key set to True on spans of a block merged with a block from a different page.
    CROSS_PAGE = "cross_page"
    # Key set to True on a block whose lines were moved into another block.
    LINES_DELETED = "lines_deleted"
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment