Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b0fd7566
"docs/en/tutorials/customize_runtime.md" did not exist on "4fb544126e07e797e9120885abcba94a84523000"
Commit
b0fd7566
authored
Jun 06, 2025
by
myhloli
Browse files
refactor: update OCR handling and improve function parameters for clarity
parent
9bb25776
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
31 additions
and
15 deletions
+31
-15
mineru/backend/pipeline/model_json_to_middle_json.py
mineru/backend/pipeline/model_json_to_middle_json.py
+19
-8
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
+8
-3
mineru/cli/common.py
mineru/cli/common.py
+4
-4
No files found.
mineru/backend/pipeline/model_json_to_middle_json.py
View file @
b0fd7566
...
@@ -10,6 +10,7 @@ from mineru.utils.block_pre_proc import prepare_block_bboxes, process_groups
...
@@ -10,6 +10,7 @@ from mineru.utils.block_pre_proc import prepare_block_bboxes, process_groups
from
mineru.utils.block_sort
import
sort_blocks_by_bbox
from
mineru.utils.block_sort
import
sort_blocks_by_bbox
from
mineru.utils.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
mineru.utils.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
mineru.utils.cut_image
import
cut_image_and_table
from
mineru.utils.cut_image
import
cut_image_and_table
from
mineru.utils.enum_class
import
ContentType
from
mineru.utils.llm_aided
import
llm_aided_title
from
mineru.utils.llm_aided
import
llm_aided_title
from
mineru.utils.model_utils
import
clean_memory
from
mineru.utils.model_utils
import
clean_memory
from
mineru.backend.pipeline.pipeline_magic_model
import
MagicModel
from
mineru.backend.pipeline.pipeline_magic_model
import
MagicModel
...
@@ -20,7 +21,7 @@ from mineru.version import __version__
...
@@ -20,7 +21,7 @@ from mineru.version import __version__
from
mineru.utils.hash_utils
import
str_md5
from
mineru.utils.hash_utils
import
str_md5
def
page_model_info_to_page_info
(
page_model_info
,
image_dict
,
page
,
image_writer
,
page_index
,
ocr
=
Fals
e
):
def
page_model_info_to_page_info
(
page_model_info
,
image_dict
,
page
,
image_writer
,
page_index
,
ocr
_enable
=
False
,
formula_enabled
=
Tru
e
):
scale
=
image_dict
[
"scale"
]
scale
=
image_dict
[
"scale"
]
page_pil_img
=
image_dict
[
"img_pil"
]
page_pil_img
=
image_dict
[
"img_pil"
]
page_img_md5
=
str_md5
(
image_dict
[
"img_base64"
])
page_img_md5
=
str_md5
(
image_dict
[
"img_base64"
])
...
@@ -62,7 +63,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -62,7 +63,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
block_area
=
(
block
[
'bbox'
][
2
]
-
block
[
'bbox'
][
0
])
*
(
block
[
'bbox'
][
3
]
-
block
[
'bbox'
][
1
])
block_area
=
(
block
[
'bbox'
][
2
]
-
block
[
'bbox'
][
0
])
*
(
block
[
'bbox'
][
3
]
-
block
[
'bbox'
][
1
])
if
block_area
>
0
:
if
block_area
>
0
:
ratio
=
spans_area
/
block_area
ratio
=
spans_area
/
block_area
if
ratio
>
0.25
and
ocr
:
if
ratio
>
0.25
and
ocr
_enable
:
# 移除block的group_id
# 移除block的group_id
block
.
pop
(
'group_id'
,
None
)
block
.
pop
(
'group_id'
,
None
)
# 符合文本图的条件就把块加入到文本块列表中
# 符合文本图的条件就把块加入到文本块列表中
...
@@ -75,8 +76,18 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -75,8 +76,18 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
"""将所有区块的bbox整理到一起"""
"""将所有区块的bbox整理到一起"""
interline_equation_blocks
=
[]
if
formula_enabled
:
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
for
block
in
interline_equation_blocks
:
spans
.
append
({
"type"
:
ContentType
.
INTERLINE_EQUATION
,
'score'
:
block
[
'score'
],
"bbox"
:
block
[
'bbox'
],
})
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
prepare_block_bboxes
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
prepare_block_bboxes
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
...
@@ -109,7 +120,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -109,7 +120,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
spans
,
dropped_spans_by_span_overlap
=
remove_overlaps_min_spans
(
spans
)
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
if
ocr
:
if
ocr
_enable
:
pass
pass
else
:
else
:
"""使用新版本的混合ocr方案."""
"""使用新版本的混合ocr方案."""
...
@@ -125,9 +136,9 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -125,9 +136,9 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
if
len
(
all_bboxes
)
==
0
:
if
len
(
all_bboxes
)
==
0
:
return
None
return
None
"""对image
和
table截图"""
"""对image
/
table
/interline_equation
截图"""
for
span
in
spans
:
for
span
in
spans
:
if
span
[
'type'
]
in
[
'image'
,
'table'
]:
if
span
[
'type'
]
in
[
ContentType
.
IMAGE
,
ContentType
.
TABLE
,
ContentType
.
INTERLINE_EQUATION
]:
span
=
cut_image_and_table
(
span
=
cut_image_and_table
(
span
,
page_pil_img
,
page_img_md5
,
page_index
,
image_writer
,
scale
=
scale
span
,
page_pil_img
,
page_img_md5
,
page_index
,
image_writer
,
scale
=
scale
)
)
...
@@ -150,13 +161,13 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -150,13 +161,13 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
return
page_info
return
page_info
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
=
False
):
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
_enable
=
False
):
middle_json
=
{
"pdf_info"
:
[],
"_backend"
:
"pipeline"
,
"_version_name"
:
__version__
}
middle_json
=
{
"pdf_info"
:
[],
"_backend"
:
"pipeline"
,
"_version_name"
:
__version__
}
for
page_index
,
page_model_info
in
enumerate
(
model_list
):
for
page_index
,
page_model_info
in
enumerate
(
model_list
):
page
=
pdf_doc
[
page_index
]
page
=
pdf_doc
[
page_index
]
image_dict
=
images_list
[
page_index
]
image_dict
=
images_list
[
page_index
]
page_info
=
page_model_info_to_page_info
(
page_info
=
page_model_info_to_page_info
(
page_model_info
,
image_dict
,
page
,
image_writer
,
page_index
,
ocr
=
ocr
page_model_info
,
image_dict
,
page
,
image_writer
,
page_index
,
ocr
_enable
=
ocr_enable
)
)
if
page_info
is
None
:
if
page_info
is
None
:
page_w
,
page_h
=
map
(
int
,
page
.
get_size
())
page_w
,
page_h
=
map
(
int
,
page
.
get_size
())
...
...
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
View file @
b0fd7566
...
@@ -34,7 +34,10 @@ def make_blocks_to_markdown(paras_of_layout,
...
@@ -34,7 +34,10 @@ def make_blocks_to_markdown(paras_of_layout,
title_level
=
get_title_level
(
para_block
)
title_level
=
get_title_level
(
para_block
)
para_text
=
f
'
{
"#"
*
title_level
}
{
merge_para_with_text
(
para_block
)
}
'
para_text
=
f
'
{
"#"
*
title_level
}
{
merge_para_with_text
(
para_block
)
}
'
elif
para_type
==
BlockType
.
INTERLINE_EQUATION
:
elif
para_type
==
BlockType
.
INTERLINE_EQUATION
:
para_text
=
merge_para_with_text
(
para_block
)
if
para_block
[
'lines'
][
0
][
'spans'
][
0
].
get
(
'content'
,
''
):
para_text
=
merge_para_with_text
(
para_block
)
else
:
para_text
+=
f
""
elif
para_type
==
BlockType
.
IMAGE
:
elif
para_type
==
BlockType
.
IMAGE
:
if
mode
==
MakeMode
.
NLP_MD
:
if
mode
==
MakeMode
.
NLP_MD
:
continue
continue
...
@@ -200,9 +203,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
...
@@ -200,9 +203,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
elif
para_type
==
BlockType
.
INTERLINE_EQUATION
:
elif
para_type
==
BlockType
.
INTERLINE_EQUATION
:
para_content
=
{
para_content
=
{
'type'
:
'equation'
,
'type'
:
'equation'
,
'text'
:
merge_para_with_text
(
para_block
),
'img_path'
:
f
"
{
img_buket_path
}
/
{
para_block
[
'lines'
][
0
][
'spans'
][
0
].
get
(
'image_path'
,
''
)
}
"
,
'text_format'
:
'latex'
,
}
}
if
para_block
[
'lines'
][
0
][
'spans'
][
0
].
get
(
'content'
,
''
):
para_content
[
'text'
]
=
merge_para_with_text
(
para_block
)
para_content
[
'text_format'
]
=
'latex'
elif
para_type
==
BlockType
.
IMAGE
:
elif
para_type
==
BlockType
.
IMAGE
:
para_content
=
{
'type'
:
'image'
,
'img_path'
:
''
,
'img_caption'
:
[],
'img_footnote'
:
[]}
para_content
=
{
'type'
:
'image'
,
'img_path'
:
''
,
'img_caption'
:
[],
'img_footnote'
:
[]}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
...
...
mineru/cli/common.py
View file @
b0fd7566
...
@@ -110,8 +110,8 @@ def do_parse(
...
@@ -110,8 +110,8 @@ def do_parse(
images_list
=
all_image_lists
[
idx
]
images_list
=
all_image_lists
[
idx
]
pdf_doc
=
all_pdf_docs
[
idx
]
pdf_doc
=
all_pdf_docs
[
idx
]
_lang
=
lang_list
[
idx
]
_lang
=
lang_list
[
idx
]
_ocr
=
ocr_enabled_list
[
idx
]
_ocr
_enable
=
ocr_enabled_list
[
idx
]
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr
)
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr
_enable
)
pdf_info
=
middle_json
[
"pdf_info"
]
pdf_info
=
middle_json
[
"pdf_info"
]
...
@@ -215,8 +215,8 @@ def do_parse(
...
@@ -215,8 +215,8 @@ def do_parse(
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
#
pdf_path = "../../demo/pdfs/
hello-algo-1.1.0-zh-c-word转换的span有问题
.pdf"
pdf_path
=
"../../demo/pdfs/
demo2
.pdf"
pdf_path
=
"C:/Users/zhaoxiaomeng/Downloads/数学新星问题征解第一期(2014.03).pdf"
#
pdf_path = "C:/Users/zhaoxiaomeng/Downloads/数学新星问题征解第一期(2014.03).pdf"
try
:
try
:
do_parse
(
"./output"
,
[
Path
(
pdf_path
).
stem
],
[
read_fn
(
Path
(
pdf_path
))],[
"ch"
],
end_page_id
=
20
,)
do_parse
(
"./output"
,
[
Path
(
pdf_path
).
stem
],
[
read_fn
(
Path
(
pdf_path
))],[
"ch"
],
end_page_id
=
20
,)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment