Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
9d689790
Unverified
Commit
9d689790
authored
Sep 19, 2024
by
linfeng
Committed by
GitHub
Sep 19, 2024
Browse files
Merge branch 'opendatalab:dev' into dev
parents
bcef0868
fb383ba6
Changes
59
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
149 additions
and
54 deletions
+149
-54
magic_pdf/pipe/AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+8
-3
magic_pdf/pipe/OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+6
-4
magic_pdf/pipe/TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+6
-4
magic_pdf/pipe/UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+11
-7
magic_pdf/resources/model_config/UniMERNet/demo.yaml
magic_pdf/resources/model_config/UniMERNet/demo.yaml
+7
-7
magic_pdf/resources/model_config/model_configs.yaml
magic_pdf/resources/model_config/model_configs.yaml
+1
-1
magic_pdf/tools/cli.py
magic_pdf/tools/cli.py
+14
-1
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+5
-4
magic_pdf/user_api.py
magic_pdf/user_api.py
+16
-5
projects/README.md
projects/README.md
+2
-0
projects/README_zh-CN.md
projects/README_zh-CN.md
+2
-0
projects/gradio_app/README.md
projects/gradio_app/README.md
+24
-0
projects/gradio_app/README_zh-CN.md
projects/gradio_app/README_zh-CN.md
+24
-0
projects/gradio_app/app.py
projects/gradio_app/app.py
+23
-18
projects/gradio_app/examples/academic_paper_formula.pdf
projects/gradio_app/examples/academic_paper_formula.pdf
+0
-0
projects/gradio_app/examples/academic_paper_img_formula.pdf
projects/gradio_app/examples/academic_paper_img_formula.pdf
+0
-0
projects/gradio_app/examples/garbled_formula.pdf
projects/gradio_app/examples/garbled_formula.pdf
+0
-0
projects/gradio_app/examples/garbled_formula2.pdf
projects/gradio_app/examples/garbled_formula2.pdf
+0
-0
projects/gradio_app/examples/garbled_img_formula.pdf
projects/gradio_app/examples/garbled_img_formula.pdf
+0
-0
projects/gradio_app/examples/scanned.pdf
projects/gradio_app/examples/scanned.pdf
+0
-0
No files found.
magic_pdf/pipe/AbsPipe.py
View file @
9d689790
...
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
...
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
PIP_TXT
=
"txt"
PIP_TXT
=
"txt"
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
start_page_id
=
0
,
end_page_id
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
self
.
pdf_bytes
=
pdf_bytes
self
.
pdf_bytes
=
pdf_bytes
self
.
model_list
=
model_list
self
.
model_list
=
model_list
self
.
image_writer
=
image_writer
self
.
image_writer
=
image_writer
...
@@ -25,6 +25,7 @@ class AbsPipe(ABC):
...
@@ -25,6 +25,7 @@ class AbsPipe(ABC):
self
.
is_debug
=
is_debug
self
.
is_debug
=
is_debug
self
.
start_page_id
=
start_page_id
self
.
start_page_id
=
start_page_id
self
.
end_page_id
=
end_page_id
self
.
end_page_id
=
end_page_id
self
.
lang
=
lang
def
get_compress_pdf_mid_data
(
self
):
def
get_compress_pdf_mid_data
(
self
):
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
...
@@ -94,7 +95,9 @@ class AbsPipe(ABC):
...
@@ -94,7 +95,9 @@ class AbsPipe(ABC):
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
lang
=
pdf_mid_data
.
get
(
"_lang"
,
None
)
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
,
parse_type
,
lang
)
return
content_list
return
content_list
@
staticmethod
@
staticmethod
...
@@ -104,7 +107,9 @@ class AbsPipe(ABC):
...
@@ -104,7 +107,9 @@ class AbsPipe(ABC):
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_buket_path
)
parse_type
=
pdf_mid_data
[
"_parse_type"
]
lang
=
pdf_mid_data
.
get
(
"_lang"
,
None
)
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_buket_path
,
parse_type
,
lang
)
return
md_content
return
md_content
magic_pdf/pipe/OCRPipe.py
View file @
9d689790
...
@@ -10,19 +10,21 @@ from magic_pdf.user_api import parse_ocr_pdf
...
@@ -10,19 +10,21 @@ from magic_pdf.user_api import parse_ocr_pdf
class
OCRPipe
(
AbsPipe
):
class
OCRPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
start_page_id
=
0
,
end_page_id
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
super
().
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
super
().
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
def
pipe_analyze
(
self
):
def
pipe_analyze
(
self
):
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
result
=
super
().
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
result
=
super
().
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
...
...
magic_pdf/pipe/TXTPipe.py
View file @
9d689790
...
@@ -11,19 +11,21 @@ from magic_pdf.user_api import parse_txt_pdf
...
@@ -11,19 +11,21 @@ from magic_pdf.user_api import parse_txt_pdf
class
TXTPipe
(
AbsPipe
):
class
TXTPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
model_list
:
list
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
start_page_id
=
0
,
end_page_id
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
super
().
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
super
().
__init__
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
def
pipe_classify
(
self
):
def
pipe_classify
(
self
):
pass
pass
def
pipe_analyze
(
self
):
def
pipe_analyze
(
self
):
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
self
.
pdf_mid_data
=
parse_txt_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
result
=
super
().
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
result
=
super
().
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
...
...
magic_pdf/pipe/UNIPipe.py
View file @
9d689790
...
@@ -14,9 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
...
@@ -14,9 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
class
UNIPipe
(
AbsPipe
):
class
UNIPipe
(
AbsPipe
):
def
__init__
(
self
,
pdf_bytes
:
bytes
,
jso_useful_key
:
dict
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
def
__init__
(
self
,
pdf_bytes
:
bytes
,
jso_useful_key
:
dict
,
image_writer
:
AbsReaderWriter
,
is_debug
:
bool
=
False
,
start_page_id
=
0
,
end_page_id
=
None
):
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
):
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
self
.
pdf_type
=
jso_useful_key
[
"_pdf_type"
]
super
().
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
)
super
().
__init__
(
pdf_bytes
,
jso_useful_key
[
"model_list"
],
image_writer
,
is_debug
,
start_page_id
,
end_page_id
,
lang
)
if
len
(
self
.
model_list
)
==
0
:
if
len
(
self
.
model_list
)
==
0
:
self
.
input_model_is_empty
=
True
self
.
input_model_is_empty
=
True
else
:
else
:
...
@@ -28,22 +28,26 @@ class UNIPipe(AbsPipe):
...
@@ -28,22 +28,26 @@ class UNIPipe(AbsPipe):
def
pipe_analyze
(
self
):
def
pipe_analyze
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
False
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
self
.
model_list
=
doc_analyze
(
self
.
pdf_bytes
,
ocr
=
True
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_parse
(
self
):
def
pipe_parse
(
self
):
if
self
.
pdf_type
==
self
.
PIP_TXT
:
if
self
.
pdf_type
==
self
.
PIP_TXT
:
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_union_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
input_model_is_empty
=
self
.
input_model_is_empty
,
is_debug
=
self
.
is_debug
,
input_model_is_empty
=
self
.
input_model_is_empty
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
elif
self
.
pdf_type
==
self
.
PIP_OCR
:
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
self
.
pdf_mid_data
=
parse_ocr_pdf
(
self
.
pdf_bytes
,
self
.
model_list
,
self
.
image_writer
,
is_debug
=
self
.
is_debug
,
is_debug
=
self
.
is_debug
,
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
)
start_page_id
=
self
.
start_page_id
,
end_page_id
=
self
.
end_page_id
,
lang
=
self
.
lang
)
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
def
pipe_mk_uni_format
(
self
,
img_parent_path
:
str
,
drop_mode
=
DropMode
.
NONE_WITH_REASON
):
result
=
super
().
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
result
=
super
().
pipe_mk_uni_format
(
img_parent_path
,
drop_mode
)
logger
.
info
(
"uni_pipe mk content list finished"
)
logger
.
info
(
"uni_pipe mk content list finished"
)
return
result
return
result
...
...
magic_pdf/resources/model_config/UniMERNet/demo.yaml
View file @
9d689790
...
@@ -2,13 +2,13 @@ model:
...
@@ -2,13 +2,13 @@ model:
arch
:
unimernet
arch
:
unimernet
model_type
:
unimernet
model_type
:
unimernet
model_config
:
model_config
:
model_name
:
./models
model_name
:
./models
/unimernet_base
max_seq_len
:
1
024
max_seq_len
:
1
536
length_aware
:
False
load_pretrained
:
True
load_pretrained
:
True
pretrained
:
./models/pytorch_model.
bin
pretrained
:
'
./models/
unimernet_base/
pytorch_model.
pth'
tokenizer_config
:
tokenizer_config
:
path
:
./models
path
:
./models
/unimernet_base
datasets
:
datasets
:
formula_rec_eval
:
formula_rec_eval
:
...
@@ -18,7 +18,7 @@ datasets:
...
@@ -18,7 +18,7 @@ datasets:
image_size
:
image_size
:
-
192
-
192
-
672
-
672
run
:
run
:
runner
:
runner_iter
runner
:
runner_iter
task
:
unimernet_train
task
:
unimernet_train
...
@@ -43,4 +43,4 @@ run:
...
@@ -43,4 +43,4 @@ run:
distributed_type
:
ddp
# or fsdp when train llm
distributed_type
:
ddp
# or fsdp when train llm
generate_cfg
:
generate_cfg
:
temperature
:
0.0
temperature
:
0.0
\ No newline at end of file
magic_pdf/resources/model_config/model_configs.yaml
View file @
9d689790
...
@@ -10,6 +10,6 @@ config:
...
@@ -10,6 +10,6 @@ config:
weights
:
weights
:
layout
:
Layout/model_final.pth
layout
:
Layout/model_final.pth
mfd
:
MFD/weights.pt
mfd
:
MFD/weights.pt
mfr
:
MFR/
U
ni
MERNet
mfr
:
MFR/
u
ni
mernet_base
struct_eqtable
:
TabRec/StructEqTable
struct_eqtable
:
TabRec/StructEqTable
TableMaster
:
TabRec/TableMaster
TableMaster
:
TabRec/TableMaster
\ No newline at end of file
magic_pdf/tools/cli.py
View file @
9d689790
...
@@ -44,6 +44,18 @@ auto: automatically choose the best method for parsing pdf from ocr and txt.
...
@@ -44,6 +44,18 @@ auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default."""
,
without method specified, auto will be used by default."""
,
default
=
'auto'
,
default
=
'auto'
,
)
)
@
click
.
option
(
'-l'
,
'--lang'
,
'lang'
,
type
=
str
,
help
=
"""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input the language "Abbreviation" taken from the URL:
https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
"""
,
default
=
None
,
)
@
click
.
option
(
@
click
.
option
(
'-d'
,
'-d'
,
'--debug'
,
'--debug'
,
...
@@ -68,7 +80,7 @@ without method specified, auto will be used by default.""",
...
@@ -68,7 +80,7 @@ without method specified, auto will be used by default.""",
help
=
'The ending page for PDF parsing, beginning from 0.'
,
help
=
'The ending page for PDF parsing, beginning from 0.'
,
default
=
None
,
default
=
None
,
)
)
def
cli
(
path
,
output_dir
,
method
,
debug_able
,
start_page_id
,
end_page_id
):
def
cli
(
path
,
output_dir
,
method
,
lang
,
debug_able
,
start_page_id
,
end_page_id
):
model_config
.
__use_inside_model__
=
True
model_config
.
__use_inside_model__
=
True
model_config
.
__model_mode__
=
'full'
model_config
.
__model_mode__
=
'full'
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
...
@@ -90,6 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
...
@@ -90,6 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
debug_able
,
debug_able
,
start_page_id
=
start_page_id
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
)
except
Exception
as
e
:
except
Exception
as
e
:
...
...
magic_pdf/tools/common.py
View file @
9d689790
...
@@ -44,9 +44,10 @@ def do_parse(
...
@@ -44,9 +44,10 @@ def do_parse(
f_draw_model_bbox
=
False
,
f_draw_model_bbox
=
False
,
start_page_id
=
0
,
start_page_id
=
0
,
end_page_id
=
None
,
end_page_id
=
None
,
lang
=
None
,
):
):
if
debug_able
:
if
debug_able
:
logger
.
warning
(
"
debug mode is on
"
)
logger
.
warning
(
'
debug mode is on
'
)
f_dump_content_list
=
True
f_dump_content_list
=
True
f_draw_model_bbox
=
True
f_draw_model_bbox
=
True
...
@@ -61,13 +62,13 @@ def do_parse(
...
@@ -61,13 +62,13 @@ def do_parse(
if
parse_method
==
'auto'
:
if
parse_method
==
'auto'
:
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_list
}
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_list
}
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
,
is_debug
=
True
,
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
,
is_debug
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
elif
parse_method
==
'txt'
:
elif
parse_method
==
'txt'
:
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
pipe
=
TXTPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
elif
parse_method
==
'ocr'
:
elif
parse_method
==
'ocr'
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
pipe
=
OCRPipe
(
pdf_bytes
,
model_list
,
image_writer
,
is_debug
=
True
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
lang
=
lang
)
else
:
else
:
logger
.
error
(
'unknown parse method'
)
logger
.
error
(
'unknown parse method'
)
exit
(
1
)
exit
(
1
)
...
...
magic_pdf/user_api.py
View file @
9d689790
...
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"
...
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
def
parse_txt_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page_id
=
0
,
end_page_id
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
*
args
,
**
kwargs
):
*
args
,
**
kwargs
):
"""
"""
解析文本类pdf
解析文本类pdf
...
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
...
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
pdf_info_dict
[
"_version_name"
]
=
__version__
pdf_info_dict
[
"_version_name"
]
=
__version__
if
lang
is
not
None
:
pdf_info_dict
[
"_lang"
]
=
lang
return
pdf_info_dict
return
pdf_info_dict
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
def
parse_ocr_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
start_page_id
=
0
,
end_page_id
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
*
args
,
**
kwargs
):
*
args
,
**
kwargs
):
"""
"""
解析ocr类pdf
解析ocr类pdf
...
@@ -66,12 +69,15 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
...
@@ -66,12 +69,15 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
pdf_info_dict
[
"_version_name"
]
=
__version__
pdf_info_dict
[
"_version_name"
]
=
__version__
if
lang
is
not
None
:
pdf_info_dict
[
"_lang"
]
=
lang
return
pdf_info_dict
return
pdf_info_dict
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
def
parse_union_pdf
(
pdf_bytes
:
bytes
,
pdf_models
:
list
,
imageWriter
:
AbsReaderWriter
,
is_debug
=
False
,
input_model_is_empty
:
bool
=
False
,
input_model_is_empty
:
bool
=
False
,
start_page_id
=
0
,
end_page_id
=
None
,
start_page_id
=
0
,
end_page_id
=
None
,
lang
=
None
,
*
args
,
**
kwargs
):
*
args
,
**
kwargs
):
"""
"""
ocr和文本混合的pdf,全部解析出来
ocr和文本混合的pdf,全部解析出来
...
@@ -95,9 +101,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
...
@@ -95,9 +101,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
if
pdf_info_dict
is
None
or
pdf_info_dict
.
get
(
"_need_drop"
,
False
):
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
logger
.
warning
(
f
"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr"
)
if
input_model_is_empty
:
if
input_model_is_empty
:
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
,
pdf_models
=
doc_analyze
(
pdf_bytes
,
ocr
=
True
,
start_page_id
=
start_page_id
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
end_page_id
=
end_page_id
,
lang
=
lang
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
pdf_info_dict
=
parse_pdf
(
parse_pdf_by_ocr
)
if
pdf_info_dict
is
None
:
if
pdf_info_dict
is
None
:
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
raise
Exception
(
"Both parse_pdf_by_txt and parse_pdf_by_ocr failed."
)
...
@@ -108,4 +116,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
...
@@ -108,4 +116,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
pdf_info_dict
[
"_version_name"
]
=
__version__
pdf_info_dict
[
"_version_name"
]
=
__version__
if
lang
is
not
None
:
pdf_info_dict
[
"_lang"
]
=
lang
return
pdf_info_dict
return
pdf_info_dict
projects/README.md
View file @
9d689790
...
@@ -3,4 +3,6 @@
...
@@ -3,4 +3,6 @@
## Project List
## Project List
-
[
llama_index_rag
](
./llama_index_rag/README.md
)
: Build a lightweight RAG system based on llama_index
-
[
llama_index_rag
](
./llama_index_rag/README.md
)
: Build a lightweight RAG system based on llama_index
-
[
gradio_app
](
./gradio_app/README.md
)
: Build a web app based on gradio
projects/README_zh-CN.md
View file @
9d689790
...
@@ -3,3 +3,5 @@
...
@@ -3,3 +3,5 @@
## 项目列表
## 项目列表
-
[
llama_index_rag
](
./llama_index_rag/README_zh-CN.md
)
: 基于 llama_index 构建轻量级 RAG 系统
-
[
llama_index_rag
](
./llama_index_rag/README_zh-CN.md
)
: 基于 llama_index 构建轻量级 RAG 系统
-
[
gradio_app
](
./gradio_app/README_zh-CN.md
)
: 基于 Gradio 的 Web 应用
projects/gradio_app/README.md
0 → 100644
View file @
9d689790
## Installation
MinerU(>=0.8.0)
> If you already have a functioning MinerU environment, you can skip this step.
>
[
Deploy in CPU environment
](
https://github.com/opendatalab/MinerU?tab=readme-ov-file#quick-cpu-demo
)
[
Deploy in GPU environment
](
https://github.com/opendatalab/MinerU?tab=readme-ov-file#using-gpu
)
Third-party Software
```
bash
pip
install
gradio gradio-pdf
```
## Start Gradio App
```
bash
python app.py
```
## Use Gradio App
Open http://127.0.0.1:7860 in your web browser.
\ No newline at end of file
projects/gradio_app/README_zh-CN.md
0 → 100644
View file @
9d689790
## 安装
MinerU(>=0.8.0)
>如已有正常运行的MinerU环境则可以跳过此步骤
>
[
在CPU环境部署
](
https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8cpu%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C
)
[
在GPU环境部署
](
https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8gpu
)
第三方软件
```
bash
pip
install
gradio gradio-pdf
```
## 启动gradio应用
```
bash
python app.py
```
## 使用gradio应用
在浏览器中访问 http://127.0.0.1:7860
\ No newline at end of file
app.py
→
projects/gradio_app/
app.py
View file @
9d689790
...
@@ -14,8 +14,6 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
...
@@ -14,8 +14,6 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.tools.common
import
do_parse
,
prepare_env
from
magic_pdf.tools.common
import
do_parse
,
prepare_env
os
.
system
(
"pip install gradio"
)
os
.
system
(
"pip install gradio-pdf"
)
import
gradio
as
gr
import
gradio
as
gr
from
gradio_pdf
import
PDF
from
gradio_pdf
import
PDF
...
@@ -25,13 +23,16 @@ def read_fn(path):
...
@@ -25,13 +23,16 @@ def read_fn(path):
return
disk_rw
.
read
(
os
.
path
.
basename
(
path
),
AbsReaderWriter
.
MODE_BIN
)
return
disk_rw
.
read
(
os
.
path
.
basename
(
path
),
AbsReaderWriter
.
MODE_BIN
)
def
parse_pdf
(
doc_path
,
output_dir
,
end_page_id
):
def
parse_pdf
(
doc_path
,
output_dir
,
end_page_id
,
is_ocr
):
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
try
:
try
:
file_name
=
f
"
{
str
(
Path
(
doc_path
).
stem
)
}
_
{
time
.
time
()
}
"
file_name
=
f
"
{
str
(
Path
(
doc_path
).
stem
)
}
_
{
time
.
time
()
}
"
pdf_data
=
read_fn
(
doc_path
)
pdf_data
=
read_fn
(
doc_path
)
parse_method
=
"auto"
if
is_ocr
:
parse_method
=
"ocr"
else
:
parse_method
=
"auto"
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
file_name
,
parse_method
)
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
file_name
,
parse_method
)
do_parse
(
do_parse
(
output_dir
,
output_dir
,
...
@@ -92,9 +93,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
...
@@ -92,9 +93,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
return
re
.
sub
(
pattern
,
replace
,
markdown_text
)
return
re
.
sub
(
pattern
,
replace
,
markdown_text
)
def
to_markdown
(
file_path
,
end_pages
):
def
to_markdown
(
file_path
,
end_pages
,
is_ocr
):
# 获取识别的md文件以及压缩包文件路径
# 获取识别的md文件以及压缩包文件路径
local_md_dir
,
file_name
=
parse_pdf
(
file_path
,
'./output'
,
end_pages
-
1
)
local_md_dir
,
file_name
=
parse_pdf
(
file_path
,
'./output'
,
end_pages
-
1
,
is_ocr
)
archive_zip_path
=
os
.
path
.
join
(
"./output"
,
compute_sha256
(
local_md_dir
)
+
".zip"
)
archive_zip_path
=
os
.
path
.
join
(
"./output"
,
compute_sha256
(
local_md_dir
)
+
".zip"
)
zip_archive_success
=
compress_directory_to_zip
(
local_md_dir
,
archive_zip_path
)
zip_archive_success
=
compress_directory_to_zip
(
local_md_dir
,
archive_zip_path
)
if
zip_archive_success
==
0
:
if
zip_archive_success
==
0
:
...
@@ -111,14 +112,6 @@ def to_markdown(file_path, end_pages):
...
@@ -111,14 +112,6 @@ def to_markdown(file_path, end_pages):
return
md_content
,
txt_content
,
archive_zip_path
,
new_pdf_path
return
md_content
,
txt_content
,
archive_zip_path
,
new_pdf_path
# def show_pdf(file_path):
# with open(file_path, "rb") as f:
# base64_pdf = base64.b64encode(f.read()).decode('utf-8')
# pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
# f'width="100%" height="1000" type="application/pdf">'
# return pdf_display
latex_delimiters
=
[{
"left"
:
"$$"
,
"right"
:
"$$"
,
"display"
:
True
},
latex_delimiters
=
[{
"left"
:
"$$"
,
"right"
:
"$$"
,
"display"
:
True
},
{
"left"
:
'$'
,
"right"
:
'$'
,
"display"
:
False
}]
{
"left"
:
'$'
,
"right"
:
'$'
,
"display"
:
False
}]
...
@@ -141,16 +134,29 @@ model_init = init_model()
...
@@ -141,16 +134,29 @@ model_init = init_model()
logger
.
info
(
f
"model_init:
{
model_init
}
"
)
logger
.
info
(
f
"model_init:
{
model_init
}
"
)
with
open
(
"header.html"
,
"r"
)
as
file
:
header
=
file
.
read
()
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
with
gr
.
Blocks
()
as
demo
:
with
gr
.
Blocks
()
as
demo
:
gr
.
HTML
(
header
)
with
gr
.
Row
():
with
gr
.
Row
():
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
pdf_show
=
gr
.
Markdown
()
pdf_show
=
gr
.
Markdown
()
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
with
gr
.
Row
()
as
bu_flow
:
with
gr
.
Row
()
as
bu_flow
:
is_ocr
=
gr
.
Checkbox
(
label
=
"Force enable OCR"
)
change_bu
=
gr
.
Button
(
"Convert"
)
change_bu
=
gr
.
Button
(
"Convert"
)
clear_bu
=
gr
.
ClearButton
([
pdf_show
],
value
=
"Clear"
)
clear_bu
=
gr
.
ClearButton
([
pdf_show
],
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"Please upload pdf"
,
interactive
=
True
,
height
=
800
)
pdf_show
=
PDF
(
label
=
"Please upload pdf"
,
interactive
=
True
,
height
=
800
)
with
gr
.
Accordion
(
"Examples:"
):
example_root
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"examples"
)
gr
.
Examples
(
examples
=
[
os
.
path
.
join
(
example_root
,
_
)
for
_
in
os
.
listdir
(
example_root
)
if
_
.
endswith
(
"pdf"
)],
inputs
=
pdf_show
,
)
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
output_file
=
gr
.
File
(
label
=
"convert result"
,
interactive
=
False
)
output_file
=
gr
.
File
(
label
=
"convert result"
,
interactive
=
False
)
...
@@ -160,8 +166,7 @@ if __name__ == "__main__":
...
@@ -160,8 +166,7 @@ if __name__ == "__main__":
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
with
gr
.
Tab
(
"Markdown text"
):
with
gr
.
Tab
(
"Markdown text"
):
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
],
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
,
is_ocr
],
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
clear_bu
.
add
([
md
,
pdf_show
,
md_text
,
output_file
])
clear_bu
.
add
([
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
])
demo
.
launch
()
demo
.
launch
()
\ No newline at end of file
projects/gradio_app/examples/academic_paper_formula.pdf
0 → 100755
View file @
9d689790
File added
projects/gradio_app/examples/academic_paper_img_formula.pdf
0 → 100755
View file @
9d689790
File added
projects/gradio_app/examples/garbled_formula.pdf
0 → 100755
View file @
9d689790
File added
projects/gradio_app/examples/garbled_formula2.pdf
0 → 100755
View file @
9d689790
File added
projects/gradio_app/examples/garbled_img_formula.pdf
0 → 100755
View file @
9d689790
File added
projects/gradio_app/examples/scanned.pdf
0 → 100755
View file @
9d689790
File added
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment