Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
c300c92b
Unverified
Commit
c300c92b
authored
Apr 22, 2024
by
myhloli
Committed by
GitHub
Apr 22, 2024
Browse files
Merge pull request #47 from myhloli/master
更新mm markdown拼装函数
parents
c23883b6
52777b22
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
92 additions
and
14 deletions
+92
-14
magic_pdf/cli/magicpdf.py
magic_pdf/cli/magicpdf.py
+9
-7
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+81
-4
magic_pdf/pdf_parse_by_ocr_v2.py
magic_pdf/pdf_parse_by_ocr_v2.py
+0
-1
magic_pdf/pre_proc/construct_page_dict.py
magic_pdf/pre_proc/construct_page_dict.py
+2
-2
No files found.
magic_pdf/cli/magicpdf.py
View file @
c300c92b
...
@@ -25,6 +25,8 @@ import os
...
@@ -25,6 +25,8 @@ import os
import
json
as
json_parse
import
json
as
json_parse
from
datetime
import
datetime
from
datetime
import
datetime
import
click
import
click
from
loguru
import
logger
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
...
@@ -77,13 +79,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
...
@@ -77,13 +79,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
path
=
f
"
{
part_file_name
}
.json"
,
path
=
f
"
{
part_file_name
}
.json"
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
try
:
#
try:
content_list
=
pipe
.
pipe_mk_uni_format
()
#
content_list = pipe.pipe_mk_uni_format()
except
Exception
as
e
:
#
except Exception as e:
print
(
e
)
#
logger.exception
(e)
md_writer
.
write
(
#
md_writer.write(
str
(
content_list
),
f
"
{
part_file_name
}
.txt"
,
AbsReaderWriter
.
MODE_TXT
#
str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
)
#
)
@
click
.
group
()
@
click
.
group
()
...
...
magic_pdf/dict2md/ocr_mkcontent.py
View file @
c300c92b
...
@@ -3,7 +3,7 @@ from loguru import logger
...
@@ -3,7 +3,7 @@ from loguru import logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
import
wordninja
import
wordninja
import
re
import
re
...
@@ -23,7 +23,7 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
...
@@ -23,7 +23,7 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown
=
[]
markdown
=
[]
for
page_info
in
pdf_info_list
:
for
page_info
in
pdf_info_list
:
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
"mm"
,
img_buket_path
)
page_markdown
=
ocr_mk_markdown_with_para_core
_v2
(
paras_of_layout
,
"mm"
,
img_buket_path
)
markdown
.
extend
(
page_markdown
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
return
'
\n\n
'
.
join
(
markdown
)
...
@@ -32,10 +32,11 @@ def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
...
@@ -32,10 +32,11 @@ def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown
=
[]
markdown
=
[]
for
page_info
in
pdf_info_dict
:
for
page_info
in
pdf_info_dict
:
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
page_markdown
=
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
"nlp"
)
page_markdown
=
ocr_mk_markdown_with_para_core
_v2
(
paras_of_layout
,
"nlp"
)
markdown
.
extend
(
page_markdown
)
markdown
.
extend
(
page_markdown
)
return
'
\n\n
'
.
join
(
markdown
)
return
'
\n\n
'
.
join
(
markdown
)
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
list
,
img_buket_path
):
def
ocr_mk_mm_markdown_with_para_and_pagination
(
pdf_info_dict
:
list
,
img_buket_path
):
markdown_with_para_and_pagination
=
[]
markdown_with_para_and_pagination
=
[]
page_no
=
0
page_no
=
0
...
@@ -43,7 +44,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_p
...
@@ -43,7 +44,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_p
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
paras_of_layout
=
page_info
.
get
(
"para_blocks"
)
if
not
paras_of_layout
:
if
not
paras_of_layout
:
continue
continue
page_markdown
=
ocr_mk_markdown_with_para_core
(
paras_of_layout
,
"mm"
,
img_buket_path
)
page_markdown
=
ocr_mk_markdown_with_para_core
_v2
(
paras_of_layout
,
"mm"
,
img_buket_path
)
markdown_with_para_and_pagination
.
append
({
markdown_with_para_and_pagination
.
append
({
'page_no'
:
page_no
,
'page_no'
:
page_no
,
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
...
@@ -90,6 +91,81 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
...
@@ -90,6 +91,81 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
return
page_markdown
return
page_markdown
def _last_media_link(blocks, body_type, span_type, img_buket_path):
    """Return the markdown image link for the last matching span, or ''.

    Scans every block whose 'type' equals ``body_type`` and, inside it,
    every span whose 'type' equals ``span_type``. A later match overwrites
    an earlier one, so at most one link is produced.
    """
    link = ''
    for block in blocks:
        if block.get('type') != body_type:
            continue
        # A body block without 'lines' simply contributes nothing.
        for line in block.get('lines') or []:
            for span in line['spans']:
                if span.get('type') == span_type:
                    # NOTE(review): the expression inside the link was lost
                    # in the extracted source (only the closing "})"
                    # survived); reconstructed as
                    # join_path(img_buket_path, span['image_path']) --
                    # confirm the span key against the span producer.
                    link = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
    return link


def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Convert one page's paragraph blocks into a list of markdown strings.

    Args:
        paras_of_layout: iterable of block dicts; each is dispatched on its
            'type' value (compared against ``BlockType`` members).
        mode: 'nlp' skips Image and Table blocks entirely; 'mm' renders
            them as a markdown image link followed by caption/footnote text.
        img_buket_path: prefix handed to ``join_path`` when building image
            links (only used in 'mm' mode).

    Returns:
        A list with one entry per block that produced non-blank text. Each
        entry is the stripped text followed by a single space.
    """
    page_markdown = []
    for para_block in paras_of_layout:
        # Reset for every block. Without this, a block of an unrecognised
        # type (or an Image/Table block with no body span, or an unexpected
        # mode) would either raise UnboundLocalError on the first block or
        # silently re-emit the previous block's text.
        para_text = ''
        para_type = para_block.get('type')

        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                img_blocks = para_block.get('blocks') or []
                # Image body first, then every caption in block order.
                para_text = _last_media_link(
                    img_blocks, BlockType.ImageBody, ContentType.Image,
                    img_buket_path,
                )
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageCaption:
                        para_text += merge_para_with_text(img_block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                table_blocks = para_block.get('blocks') or []
                # Table body first, then captions and footnotes in block
                # order.
                para_text = _last_media_link(
                    table_blocks, BlockType.TableBody, ContentType.Table,
                    img_buket_path,
                )
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableCaption:
                        para_text += merge_para_with_text(table_block)
                    elif table_block.get('type') == BlockType.TableFootnote:
                        para_text += merge_para_with_text(table_block)

        stripped = para_text.strip()
        if stripped == '':
            continue
        page_markdown.append(stripped + ' ')

    return page_markdown
def merge_para_with_text(para):
    """Concatenate the spans of a paragraph block into one markdown string.

    Walks ``para['lines']`` and each line's ``'spans'`` in order. Text
    spans are escaped with ``ocr_escape_special_markdown_char`` (English
    text is first passed through ``split_long_words``); inline equations
    are wrapped in ``$...$`` and interline equations in a ``$$`` block.
    Spans of any other type, or spans yielding an empty string, are
    skipped.

    Returns:
        The joined text; '' when the paragraph has no usable spans.
    """
    pieces = []
    for line in para['lines']:
        for span in line['spans']:
            kind = span.get('type')
            lang = ''
            text = ''

            if kind == ContentType.Text:
                raw = span['content']
                lang = detect_lang(raw)
                if lang == 'en':
                    # Only split long words for English text; word-splitting
                    # Chinese text would lose characters.
                    raw = split_long_words(raw)
                text = ocr_escape_special_markdown_char(raw)
            elif kind == ContentType.InlineEquation:
                text = f"${span['content']}$"
            elif kind == ContentType.InterlineEquation:
                text = f"\n$$\n{span['content']}\n$$\n"

            if text == '':
                continue

            if lang == 'en':
                # In an English context, pieces are separated by a space.
                pieces.append(text + ' ')
            else:
                # In a Chinese context, no separator is needed.
                pieces.append(text)

    return ''.join(pieces)
def
para_to_standard_format
(
para
,
img_buket_path
):
def
para_to_standard_format
(
para
,
img_buket_path
):
para_content
=
{}
para_content
=
{}
if
len
(
para
)
==
1
:
if
len
(
para
)
==
1
:
...
@@ -124,6 +200,7 @@ def para_to_standard_format(para, img_buket_path):
...
@@ -124,6 +200,7 @@ def para_to_standard_format(para, img_buket_path):
}
}
return
para_content
return
para_content
def
make_standard_format_with_para
(
pdf_info_dict
:
list
,
img_buket_path
:
str
):
def
make_standard_format_with_para
(
pdf_info_dict
:
list
,
img_buket_path
:
str
):
content_list
=
[]
content_list
=
[]
for
page_info
in
pdf_info_dict
:
for
page_info
in
pdf_info_dict
:
...
...
magic_pdf/pdf_parse_by_ocr_v2.py
View file @
c300c92b
...
@@ -92,7 +92,6 @@ def parse_pdf_by_ocr(pdf_bytes,
...
@@ -92,7 +92,6 @@ def parse_pdf_by_ocr(pdf_bytes,
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
"""分段"""
"""分段"""
# if debug_mode:
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
"""dict转list"""
"""dict转list"""
...
...
magic_pdf/pre_proc/construct_page_dict.py
View file @
c300c92b
...
@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
...
@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
def
ocr_construct_page_component_v2
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
def
ocr_construct_page_component_v2
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
d
rop
ed_blocks
):
images
,
tables
,
interline_equations
,
d
iscard
ed_blocks
):
return_dict
=
{
return_dict
=
{
'preproc_blocks'
:
blocks
,
'preproc_blocks'
:
blocks
,
'layout_bboxes'
:
layout_bboxes
,
'layout_bboxes'
:
layout_bboxes
,
...
@@ -65,6 +65,6 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
...
@@ -65,6 +65,6 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
'images'
:
images
,
'images'
:
images
,
'tables'
:
tables
,
'tables'
:
tables
,
'interline_equations'
:
interline_equations
,
'interline_equations'
:
interline_equations
,
'd
rop
ed_blocks'
:
d
rop
ed_blocks
,
'd
iscard
ed_blocks'
:
d
iscard
ed_blocks
,
}
}
return
return_dict
return
return_dict
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment