Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3b7342b8
Commit
3b7342b8
authored
Jun 18, 2024
by
赵小蒙
Browse files
update cli output files
parent
9dc5033c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
19 additions
and
3 deletions
+19
-3
magic_pdf/cli/magicpdf.py
magic_pdf/cli/magicpdf.py
+19
-3
No files found.
magic_pdf/cli/magicpdf.py
View file @
3b7342b8
...
@@ -100,18 +100,34 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
...
@@ -100,18 +100,34 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
'''写markdown'''
md_writer
.
write
(
md_writer
.
write
(
content
=
md_content
,
path
=
f
"
{
pdf_file_name
}
.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
content
=
md_content
,
path
=
f
"
{
pdf_file_name
}
.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
)
'''写middle_json'''
md_writer
.
write
(
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"
{
pdf_file_name
}
.json"
,
path
=
f
"
{
pdf_file_name
}
_middle
.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
'''写model_json'''
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
model_list
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"
{
pdf_file_name
}
_model.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
'''写源pdf'''
md_writer
.
write
(
content
=
pdf_bytes
,
path
=
f
"
{
pdf_file_name
}
_origin.json"
,
mode
=
AbsReaderWriter
.
MODE_BIN
,
)
content_list
=
pipe
.
pipe_mk_uni_format
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
content_list
=
pipe
.
pipe_mk_uni_format
(
image_dir
,
drop_mode
=
DropMode
.
NONE
)
'''写content_list'''
md_writer
.
write
(
md_writer
.
write
(
str
(
content_list
),
f
"
{
pdf_file_name
}
.txt"
,
AbsReaderWriter
.
MODE_TXT
content
=
json_parse
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"
{
pdf_file_name
}
_content_list.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment