Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
bfabafff
Commit
bfabafff
authored
Apr 16, 2024
by
许瑞
Browse files
feat: update cli
parent
64e5d1b0
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
54 additions
and
23 deletions
+54
-23
magic_pdf/cli/magicpdf.py
magic_pdf/cli/magicpdf.py
+54
-23
No files found.
magic_pdf/cli/magicpdf.py
View file @
bfabafff
...
@@ -21,7 +21,11 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
...
@@ -21,7 +21,11 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
"""
import
os
import
json
as
json_parse
from
datetime
import
datetime
import
click
import
click
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.libs.config_reader
import
get_s3_config
from
magic_pdf.libs.config_reader
import
get_s3_config
from
magic_pdf.libs.path_utils
import
(
from
magic_pdf.libs.path_utils
import
(
parse_s3path
,
parse_s3path
,
...
@@ -29,25 +33,14 @@ from magic_pdf.libs.path_utils import (
...
@@ -29,25 +33,14 @@ from magic_pdf.libs.path_utils import (
remove_non_official_s3_args
,
remove_non_official_s3_args
,
)
)
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.libs.config_reader
import
get_local_dir
from
magic_pdf.io.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
from
magic_pdf.io.S3ReaderWriter
import
S3ReaderWriter
,
MODE_BIN
,
MODE_TXT
from
magic_pdf.io.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.io.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.spark.spark_api
import
parse_union_pdf
,
parse_txt_pdf
,
parse_ocr_pdf
from
magic_pdf.libs.json_compressor
import
JsonCompressor
import
os
import
json
as
json_parse
from
datetime
import
datetime
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
def
get_pdf_parse_method
(
method
):
if
method
==
"ocr"
:
return
parse_ocr_pdf
elif
method
==
"txt"
:
return
parse_txt_pdf
return
parse_union_pdf
def
prepare_env
():
def
prepare_env
():
local_parent_dir
=
os
.
path
.
join
(
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
().
strftime
(
"%Y-%m-%d"
)
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
().
strftime
(
"%Y-%m-%d"
)
...
@@ -60,6 +53,28 @@ def prepare_env():
...
@@ -60,6 +53,28 @@ def prepare_env():
return
local_image_dir
,
local_md_dir
return
local_image_dir
,
local_md_dir
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
uni_pipe
=
UNIPipe
()
jso_useful_key
=
{
"_pdf_type"
:
"txt"
,
"model_list"
:
model_list
,
}
if
parse_method
==
"ocr"
:
jso_useful_key
[
"_pdf_type"
]
=
"ocr"
pdf_mid_data
=
uni_pipe
.
parse
(
pdf_bytes
,
image_writer
,
jso_useful_key
)
md_content
=
UNIPipe
.
mk_markdown
(
pdf_mid_data
,
image_dir
)
part_file_name
=
datetime
.
now
().
strftime
(
"%H-%M-%S"
)
md_writer
.
write
(
content
=
md_content
,
path
=
f
"
{
part_file_name
}
.md"
,
mode
=
MODE_TXT
)
md_writer
.
write
(
content
=
json_parse
.
dumps
(
JsonCompressor
.
decompress_json
(
pdf_mid_data
),
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"
{
part_file_name
}
.json"
,
mode
=
MODE_TXT
,
)
@
click
.
group
()
@
click
.
group
()
def
cli
():
def
cli
():
pass
pass
...
@@ -96,11 +111,20 @@ def json_command(json, method):
...
@@ -96,11 +111,20 @@ def json_command(json, method):
jso
=
json_parse
.
loads
(
read_s3_path
(
json
).
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_s3_path
(
json
).
decode
(
"utf-8"
))
pdf_data
=
read_s3_path
(
jso
[
"file_location"
])
pdf_data
=
read_s3_path
(
jso
[
"file_location"
])
local_image_dir
,
_
=
prepare_env
()
local_image_dir
,
local_md_dir
=
prepare_env
()
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
_do_parse
(
parse
=
get_pdf_parse_method
(
method
)
pdf_data
,
parse
(
pdf_data
,
jso
[
"doc_layout_result"
],
local_image_rw
,
is_debug
=
True
)
jso
[
"doc_layout_result"
],
method
,
local_image_rw
,
local_md_rw
,
local_image_dir
,
)
@
cli
.
command
()
@
cli
.
command
()
...
@@ -128,15 +152,22 @@ def pdf_command(pdf, model, method):
...
@@ -128,15 +152,22 @@ def pdf_command(pdf, model, method):
pdf_data
=
read_fn
(
pdf
)
pdf_data
=
read_fn
(
pdf
)
jso
=
json_parse
.
loads
(
read_fn
(
model
).
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_fn
(
model
).
decode
(
"utf-8"
))
local_image_dir
,
local_md_dir
=
prepare_env
()
local_image_dir
,
_
=
prepare_env
()
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_image_rw
=
DiskReaderWriter
(
local_image_dir
)
local_md_dir
parse
=
get_pdf_parse_method
(
method
)
)
parse
(
pdf_data
,
jso
,
local_image_rw
,
is_debug
=
True
)
_do_parse
(
pdf_data
,
jso
[
"doc_layout_result"
],
method
,
local_image_rw
,
local_md_rw
,
local_image_dir
,
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
"""
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/
format
/v0
7
0/part-660
28dd46437
-0000
76
.jsonl?bytes=0,
308393
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/
manual
/v00
1
/part-660
407a28beb
-0000
02
.jsonl?bytes=0,
63551
"""
"""
cli
()
cli
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment