Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
51393aa8
Commit
51393aa8
authored
Jun 03, 2025
by
myhloli
Browse files
refactor: update union_make import and adjust middle JSON structure for consistency
parent
6b1df419
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
21 additions
and
22 deletions
+21
-22
mineru/backend/pipeline/model_json_to_middle_json.py
mineru/backend/pipeline/model_json_to_middle_json.py
+1
-1
mineru/backend/pipeline/pipeline_analyze.py
mineru/backend/pipeline/pipeline_analyze.py
+0
-2
mineru/cli/common.py
mineru/cli/common.py
+20
-19
No files found.
mineru/backend/pipeline/model_json_to_middle_json.py
View file @
51393aa8
...
@@ -117,7 +117,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -117,7 +117,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
=
False
):
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
=
False
):
middle_json
=
{
"pdf_info"
:
[],
"_backend"
:
"
vlm
"
,
"_version_name"
:
__version__
}
middle_json
=
{
"pdf_info"
:
[],
"_backend"
:
"
pipeline
"
,
"_version_name"
:
__version__
}
for
page_index
,
page_model_info
in
enumerate
(
model_list
):
for
page_index
,
page_model_info
in
enumerate
(
model_list
):
page
=
pdf_doc
[
page_index
]
page
=
pdf_doc
[
page_index
]
image_dict
=
images_list
[
page_index
]
image_dict
=
images_list
[
page_index
]
...
...
mineru/backend/pipeline/pipeline_analyze.py
View file @
51393aa8
...
@@ -5,8 +5,6 @@ import torch
...
@@ -5,8 +5,6 @@ import torch
from
.model_init
import
MineruPipelineModel
from
.model_init
import
MineruPipelineModel
from
.config_reader
import
get_local_models_dir
,
get_device
,
get_formula_config
,
get_table_recog_config
from
.config_reader
import
get_local_models_dir
,
get_device
,
get_formula_config
,
get_table_recog_config
from
.model_json_to_middle_json
import
result_to_middle_json
from
...data.data_reader_writer
import
DataWriter
from
...utils.pdf_classify
import
classify
from
...utils.pdf_classify
import
classify
from
...utils.pdf_image_tools
import
load_images_from_pdf
from
...utils.pdf_image_tools
import
load_images_from_pdf
...
...
mineru/cli/common.py
View file @
51393aa8
...
@@ -2,13 +2,14 @@
...
@@ -2,13 +2,14 @@
import
io
import
io
import
json
import
json
import
os
import
os
import
copy
from
pathlib
import
Path
from
pathlib
import
Path
import
pypdfium2
as
pdfium
import
pypdfium2
as
pdfium
from
loguru
import
logger
from
loguru
import
logger
from
mineru.backend.pipeline.model_json_to_middle_json
import
result_to_middle_json
as
pipeline_result_to_middle_json
from
mineru.backend.pipeline.model_json_to_middle_json
import
result_to_middle_json
as
pipeline_result_to_middle_json
from
mineru.api.vlm_middle_json_mkcontent
import
union_make
from
mineru.api.vlm_middle_json_mkcontent
import
union_make
as
vlm_union_make
from
mineru.backend.vlm.vlm_analyze
import
doc_analyze
as
vlm_doc_analyze
from
mineru.backend.vlm.vlm_analyze
import
doc_analyze
as
vlm_doc_analyze
from
mineru.backend.pipeline.pipeline_analyze
import
doc_analyze
as
pipeline_doc_analyze
from
mineru.backend.pipeline.pipeline_analyze
import
doc_analyze
as
pipeline_doc_analyze
from
mineru.data.data_reader_writer
import
FileBasedDataWriter
from
mineru.data.data_reader_writer
import
FileBasedDataWriter
...
@@ -98,8 +99,8 @@ def do_parse(
...
@@ -98,8 +99,8 @@ def do_parse(
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
p_formula_enable
,
table_enable
=
p_table_enable
)
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
p_formula_enable
,
table_enable
=
p_table_enable
)
for
idx
,
model_list
in
enumerate
(
infer_results
):
for
idx
,
model_list
in
enumerate
(
infer_results
):
model_json
=
copy
.
deepcopy
(
model_list
)
pdf_file_name
=
pdf_file_names
[
idx
]
pdf_file_name
=
pdf_file_names
[
idx
]
model_json
=
infer_results
[
idx
]
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
image_writer
,
md_writer
=
FileBasedDataWriter
(
local_image_dir
),
FileBasedDataWriter
(
local_md_dir
)
image_writer
,
md_writer
=
FileBasedDataWriter
(
local_image_dir
),
FileBasedDataWriter
(
local_md_dir
)
...
@@ -124,21 +125,21 @@ def do_parse(
...
@@ -124,21 +125,21 @@ def do_parse(
pdf_bytes
,
pdf_bytes
,
)
)
if
f_dump_md
:
#
if f_dump_md:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
#
image_dir = str(os.path.basename(local_image_dir))
md_content_str
=
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
#
md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
md_writer
.
write_string
(
#
md_writer.write_string(
f
"
{
pdf_file_name
}
.md"
,
#
f"{pdf_file_name}.md",
md_content_str
,
#
md_content_str,
)
#
)
if
f_dump_content_list
:
#
if f_dump_content_list:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
#
image_dir = str(os.path.basename(local_image_dir))
content_list
=
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
#
content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
md_writer
.
write_string
(
#
md_writer.write_string(
f
"
{
pdf_file_name
}
_content_list.json"
,
#
f"{pdf_file_name}_content_list.json",
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
#
json.dumps(content_list, ensure_ascii=False, indent=4),
)
#
)
if
f_dump_middle_json
:
if
f_dump_middle_json
:
md_writer
.
write_string
(
md_writer
.
write_string
(
...
@@ -179,7 +180,7 @@ def do_parse(
...
@@ -179,7 +180,7 @@ def do_parse(
if
f_dump_md
:
if
f_dump_md
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
md_content_str
=
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_content_str
=
vlm_
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_writer
.
write_string
(
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
.md"
,
f
"
{
pdf_file_name
}
.md"
,
md_content_str
,
md_content_str
,
...
@@ -187,7 +188,7 @@ def do_parse(
...
@@ -187,7 +188,7 @@ def do_parse(
if
f_dump_content_list
:
if
f_dump_content_list
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
content_list
=
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
content_list
=
vlm_
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
md_writer
.
write_string
(
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_content_list.json"
,
f
"
{
pdf_file_name
}
_content_list.json"
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment