Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
ef0129ad
Commit
ef0129ad
authored
Apr 23, 2024
by
kernel.h@qq.com
Browse files
修改pdf的路径
parent
ed40e1d5
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
11 deletions
+16
-11
magic_pdf/cli/magicpdf.py
magic_pdf/cli/magicpdf.py
+16
-11
No files found.
magic_pdf/cli/magicpdf.py
View file @
ef0129ad
...
@@ -23,9 +23,9 @@ python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloa
...
@@ -23,9 +23,9 @@ python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloa
import
os
import
os
import
json
as
json_parse
import
json
as
json_parse
from
datetime
import
datetime
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
...
@@ -44,9 +44,9 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
...
@@ -44,9 +44,9 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
def
prepare_env
():
def
prepare_env
(
pdf_file_name
):
local_parent_dir
=
os
.
path
.
join
(
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
().
strftime
(
"%Y-%m-%d"
)
get_local_dir
(),
"magic-pdf"
,
pdf_file_name
)
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
...
@@ -56,7 +56,7 @@ def prepare_env():
...
@@ -56,7 +56,7 @@ def prepare_env():
return
local_image_dir
,
local_md_dir
return
local_image_dir
,
local_md_dir
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
if
parse_method
==
"auto"
:
if
parse_method
==
"auto"
:
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
elif
parse_method
==
"txt"
:
elif
parse_method
==
"txt"
:
...
@@ -70,13 +70,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
...
@@ -70,13 +70,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
pipe
.
pipe_classify
()
pipe
.
pipe_classify
()
pipe
.
pipe_parse
()
pipe
.
pipe_parse
()
md_content
=
pipe
.
pipe_mk_markdown
()
md_content
=
pipe
.
pipe_mk_markdown
()
part_file_name
=
datetime
.
now
().
strftime
(
"%H-%M-%S"
)
#
part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
md_writer
.
write
(
content
=
md_content
,
path
=
f
"
{
p
art
_file_name
}
.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
content
=
md_content
,
path
=
f
"
{
p
df
_file_name
}
.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
)
md_writer
.
write
(
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"
{
p
art
_file_name
}
.json"
,
path
=
f
"
{
p
df
_file_name
}
.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
# try:
# try:
...
@@ -127,14 +127,17 @@ def json_command(json, method):
...
@@ -127,14 +127,17 @@ def json_command(json, method):
)
)
jso
=
json_parse
.
loads
(
read_s3_path
(
json
).
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_s3_path
(
json
).
decode
(
"utf-8"
))
pdf_data
=
read_s3_path
(
jso
[
"file_location"
])
s3_file_path
=
jso
[
"file_location"
]
local_image_dir
,
local_md_dir
=
prepare_env
()
pdf_file_name
=
Path
(
s3_file_path
).
stem
pdf_data
=
read_s3_path
(
s3_file_path
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
local_md_dir
)
)
_do_parse
(
_do_parse
(
pdf_file_name
,
pdf_data
,
pdf_data
,
jso
[
"doc_layout_result"
],
jso
[
"doc_layout_result"
],
method
,
method
,
...
@@ -169,11 +172,13 @@ def pdf_command(pdf, model, method):
...
@@ -169,11 +172,13 @@ def pdf_command(pdf, model, method):
pdf_data
=
read_fn
(
pdf
)
pdf_data
=
read_fn
(
pdf
)
jso
=
json_parse
.
loads
(
read_fn
(
model
).
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_fn
(
model
).
decode
(
"utf-8"
))
local_image_dir
,
local_md_dir
=
prepare_env
()
pdf_file_name
=
Path
(
pdf
).
stem
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
local_md_dir
)
)
_do_parse
(
_do_parse
(
pdf_file_name
,
pdf_data
,
pdf_data
,
jso
,
jso
,
method
,
method
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment