Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
51393aa8
Commit 51393aa8 authored Jun 03, 2025 by myhloli
Browse files
refactor: update union_make import and adjust middle JSON structure for consistency
parent
6b1df419
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 21 additions and 22 deletions
+21
-22
mineru/backend/pipeline/model_json_to_middle_json.py
mineru/backend/pipeline/model_json_to_middle_json.py
+1
-1
mineru/backend/pipeline/pipeline_analyze.py
mineru/backend/pipeline/pipeline_analyze.py
+0
-2
mineru/cli/common.py
mineru/cli/common.py
+20
-19
No files found.
mineru/backend/pipeline/model_json_to_middle_json.py
View file @
51393aa8
...
...
@@ -117,7 +117,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
=
False
):
middle_json
=
{
"pdf_info"
:
[],
"_backend"
:
"
vlm
"
,
"_version_name"
:
__version__
}
middle_json
=
{
"pdf_info"
:
[],
"_backend"
:
"
pipeline
"
,
"_version_name"
:
__version__
}
for
page_index
,
page_model_info
in
enumerate
(
model_list
):
page
=
pdf_doc
[
page_index
]
image_dict
=
images_list
[
page_index
]
...
...
mineru/backend/pipeline/pipeline_analyze.py
View file @
51393aa8
...
...
@@ -5,8 +5,6 @@ import torch
from
.model_init
import
MineruPipelineModel
from
.config_reader
import
get_local_models_dir
,
get_device
,
get_formula_config
,
get_table_recog_config
from
.model_json_to_middle_json
import
result_to_middle_json
from
...data.data_reader_writer
import
DataWriter
from
...utils.pdf_classify
import
classify
from
...utils.pdf_image_tools
import
load_images_from_pdf
...
...
mineru/cli/common.py
View file @
51393aa8
...
...
@@ -2,13 +2,14 @@
import
io
import
json
import
os
import
copy
from
pathlib
import
Path
import
pypdfium2
as
pdfium
from
loguru
import
logger
from
mineru.backend.pipeline.model_json_to_middle_json
import
result_to_middle_json
as
pipeline_result_to_middle_json
from
mineru.api.vlm_middle_json_mkcontent
import
union_make
from
mineru.api.vlm_middle_json_mkcontent
import
union_make
as
vlm_union_make
from
mineru.backend.vlm.vlm_analyze
import
doc_analyze
as
vlm_doc_analyze
from
mineru.backend.pipeline.pipeline_analyze
import
doc_analyze
as
pipeline_doc_analyze
from
mineru.data.data_reader_writer
import
FileBasedDataWriter
...
...
@@ -98,8 +99,8 @@ def do_parse(
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
p_formula_enable
,
table_enable
=
p_table_enable
)
for
idx
,
model_list
in
enumerate
(
infer_results
):
model_json
=
copy
.
deepcopy
(
model_list
)
pdf_file_name
=
pdf_file_names
[
idx
]
model_json
=
infer_results
[
idx
]
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
image_writer
,
md_writer
=
FileBasedDataWriter
(
local_image_dir
),
FileBasedDataWriter
(
local_md_dir
)
...
...
@@ -124,21 +125,21 @@ def do_parse(
pdf_bytes
,
)
if
f_dump_md
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
md_content_str
=
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
.md"
,
md_content_str
,
)
if
f_dump_content_list
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
content_list
=
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_content_list.json"
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
)
#
if f_dump_md:
#
image_dir = str(os.path.basename(local_image_dir))
#
md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
#
md_writer.write_string(
#
f"{pdf_file_name}.md",
#
md_content_str,
#
)
#
if f_dump_content_list:
#
image_dir = str(os.path.basename(local_image_dir))
#
content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
#
md_writer.write_string(
#
f"{pdf_file_name}_content_list.json",
#
json.dumps(content_list, ensure_ascii=False, indent=4),
#
)
if
f_dump_middle_json
:
md_writer
.
write_string
(
...
...
@@ -179,7 +180,7 @@ def do_parse(
if
f_dump_md
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
md_content_str
=
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_content_str
=
vlm_
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
.md"
,
md_content_str
,
...
...
@@ -187,7 +188,7 @@ def do_parse(
if
f_dump_content_list
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
content_list
=
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
content_list
=
vlm_
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_content_list.json"
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment