Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
a02a356e
Commit
a02a356e
authored
Apr 23, 2024
by
liukaiwen
Browse files
Merge branch 'master' of github.com:papayalove/Magic-PDF
parents
778b1fb7
7f27dd12
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
11 deletions
+17
-11
magic_pdf/cli/magicpdf.py
magic_pdf/cli/magicpdf.py
+16
-11
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+1
-0
No files found.
magic_pdf/cli/magicpdf.py
View file @
a02a356e
...
@@ -23,9 +23,9 @@ python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloa
...
@@ -23,9 +23,9 @@ python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloa
import
os
import
os
import
json
as
json_parse
import
json
as
json_parse
from
datetime
import
datetime
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
from
pathlib
import
Path
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
...
@@ -44,9 +44,9 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
...
@@ -44,9 +44,9 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
parse_pdf_methods
=
click
.
Choice
([
"ocr"
,
"txt"
,
"auto"
])
def
prepare_env
():
def
prepare_env
(
pdf_file_name
):
local_parent_dir
=
os
.
path
.
join
(
local_parent_dir
=
os
.
path
.
join
(
get_local_dir
(),
"magic-pdf"
,
datetime
.
now
().
strftime
(
"%Y-%m-%d"
)
get_local_dir
(),
"magic-pdf"
,
pdf_file_name
)
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
local_image_dir
=
os
.
path
.
join
(
local_parent_dir
,
"images"
)
...
@@ -56,7 +56,7 @@ def prepare_env():
...
@@ -56,7 +56,7 @@ def prepare_env():
return
local_image_dir
,
local_md_dir
return
local_image_dir
,
local_md_dir
def
_do_parse
(
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
def
_do_parse
(
pdf_file_name
,
pdf_bytes
,
model_list
,
parse_method
,
image_writer
,
md_writer
,
image_dir
):
if
parse_method
==
"auto"
:
if
parse_method
==
"auto"
:
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
pipe
=
UNIPipe
(
pdf_bytes
,
model_list
,
image_writer
,
image_dir
,
is_debug
=
True
)
elif
parse_method
==
"txt"
:
elif
parse_method
==
"txt"
:
...
@@ -70,13 +70,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
...
@@ -70,13 +70,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
pipe
.
pipe_classify
()
pipe
.
pipe_classify
()
pipe
.
pipe_parse
()
pipe
.
pipe_parse
()
md_content
=
pipe
.
pipe_mk_markdown
()
md_content
=
pipe
.
pipe_mk_markdown
()
part_file_name
=
datetime
.
now
().
strftime
(
"%H-%M-%S"
)
#
part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer
.
write
(
md_writer
.
write
(
content
=
md_content
,
path
=
f
"
{
p
art
_file_name
}
.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
content
=
md_content
,
path
=
f
"
{
p
df
_file_name
}
.md"
,
mode
=
AbsReaderWriter
.
MODE_TXT
)
)
md_writer
.
write
(
md_writer
.
write
(
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
content
=
json_parse
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
),
path
=
f
"
{
p
art
_file_name
}
.json"
,
path
=
f
"
{
p
df
_file_name
}
.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
# try:
# try:
...
@@ -127,14 +127,17 @@ def json_command(json, method):
...
@@ -127,14 +127,17 @@ def json_command(json, method):
)
)
jso
=
json_parse
.
loads
(
read_s3_path
(
json
).
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_s3_path
(
json
).
decode
(
"utf-8"
))
pdf_data
=
read_s3_path
(
jso
[
"file_location"
])
s3_file_path
=
jso
[
"file_location"
]
local_image_dir
,
local_md_dir
=
prepare_env
()
pdf_file_name
=
Path
(
s3_file_path
).
stem
pdf_data
=
read_s3_path
(
s3_file_path
)
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
local_md_dir
)
)
_do_parse
(
_do_parse
(
pdf_file_name
,
pdf_data
,
pdf_data
,
jso
[
"doc_layout_result"
],
jso
[
"doc_layout_result"
],
method
,
method
,
...
@@ -169,11 +172,13 @@ def pdf_command(pdf, model, method):
...
@@ -169,11 +172,13 @@ def pdf_command(pdf, model, method):
pdf_data
=
read_fn
(
pdf
)
pdf_data
=
read_fn
(
pdf
)
jso
=
json_parse
.
loads
(
read_fn
(
model
).
decode
(
"utf-8"
))
jso
=
json_parse
.
loads
(
read_fn
(
model
).
decode
(
"utf-8"
))
local_image_dir
,
local_md_dir
=
prepare_env
()
pdf_file_name
=
Path
(
pdf
).
stem
local_image_dir
,
local_md_dir
=
prepare_env
(
pdf_file_name
)
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_image_rw
,
local_md_rw
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
local_md_dir
)
)
_do_parse
(
_do_parse
(
pdf_file_name
,
pdf_data
,
pdf_data
,
jso
,
jso
,
method
,
method
,
...
...
magic_pdf/dict2md/ocr_mkcontent.py
View file @
a02a356e
...
@@ -94,6 +94,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
...
@@ -94,6 +94,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
mode
,
img_buket_path
=
""
):
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
mode
,
img_buket_path
=
""
):
page_markdown
=
[]
page_markdown
=
[]
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
para_text
=
''
para_type
=
para_block
.
get
(
'type'
)
para_type
=
para_block
.
get
(
'type'
)
if
para_type
==
BlockType
.
Text
:
if
para_type
==
BlockType
.
Text
:
para_text
=
merge_para_with_text
(
para_block
)
para_text
=
merge_para_with_text
(
para_block
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment