Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3e8ee23e
"docs/vscode:/vscode.git/clone" did not exist on "5a533e79b4635c2b62176d415a8ae2a38dab46f1"
Commit
3e8ee23e
authored
Apr 03, 2025
by
icecraft
Browse files
fix: convert image with pymupdf
parent
3379f3b3
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
3 deletions
+10
-3
magic_pdf/tools/cli.py
magic_pdf/tools/cli.py
+10
-3
No files found.
magic_pdf/tools/cli.py
View file @
3e8ee23e
...
...
@@ -137,10 +137,17 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
doc_paths
=
[]
for
doc_path
in
Path
(
path
).
glob
(
'*'
):
if
doc_path
.
suffix
in
pdf_suffixes
+
image_suffixes
+
ms_office_suffixes
:
if
doc_path
.
suffix
not
in
ms_office_suffixes
:
basename
=
Path
(
doc_path
).
stem
if
doc_path
.
suffix
in
ms_office_suffixes
:
convert_file_to_pdf
(
str
(
doc_path
),
temp_dir
)
doc_path
=
Path
(
os
.
path
.
join
(
temp_dir
,
f
'
{
basename
}
.pdf'
))
doc_path
=
Path
(
os
.
path
.
join
(
temp_dir
,
f
'
{
doc_path
.
stem
}
.pdf'
))
elif
doc_path
.
suffix
in
image_suffixes
:
with
open
(
str
(
doc_path
),
'rb'
)
as
f
:
bits
=
f
.
read
()
pdf_bytes
=
fitz
.
open
(
stream
=
bits
).
convert_to_pdf
()
fn
=
os
.
path
.
join
(
temp_dir
,
f
'
{
doc_path
.
stem
}
.pdf'
)
with
open
(
fn
,
'wb'
)
as
f
:
f
.
write
(
pdf_bytes
)
doc_path
=
Path
(
fn
)
doc_paths
.
append
(
doc_path
)
datasets
=
batch_build_dataset
(
doc_paths
,
4
,
lang
)
batch_do_parse
(
output_dir
,
[
str
(
doc_path
.
stem
)
for
doc_path
in
doc_paths
],
datasets
,
method
,
debug_able
,
lang
=
lang
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment