Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d5dbed73
Commit
d5dbed73
authored
Mar 01, 2024
by
赵小蒙
Browse files
目录重构
parent
7c7910e4
Changes
85
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
55 additions
and
55 deletions
+55
-55
magic_pdf/para/title_processor.py
magic_pdf/para/title_processor.py
+2
-2
magic_pdf/pdf_parse_by_model.py
magic_pdf/pdf_parse_by_model.py
+33
-33
magic_pdf/post_proc/__init__.py
magic_pdf/post_proc/__init__.py
+0
-0
magic_pdf/post_proc/detect_para.py
magic_pdf/post_proc/detect_para.py
+2
-2
magic_pdf/post_proc/pdf_post_filter.py
magic_pdf/post_proc/pdf_post_filter.py
+2
-2
magic_pdf/post_proc/remove_footnote.py
magic_pdf/post_proc/remove_footnote.py
+1
-1
magic_pdf/pre_proc/__init__.py
magic_pdf/pre_proc/__init__.py
+0
-0
magic_pdf/pre_proc/citationmarker_remove.py
magic_pdf/pre_proc/citationmarker_remove.py
+1
-1
magic_pdf/pre_proc/construct_paras.py
magic_pdf/pre_proc/construct_paras.py
+0
-0
magic_pdf/pre_proc/detect_equation.py
magic_pdf/pre_proc/detect_equation.py
+2
-2
magic_pdf/pre_proc/detect_footer_by_model.py
magic_pdf/pre_proc/detect_footer_by_model.py
+1
-1
magic_pdf/pre_proc/detect_footer_header_by_statistics.py
magic_pdf/pre_proc/detect_footer_header_by_statistics.py
+1
-1
magic_pdf/pre_proc/detect_footnote.py
magic_pdf/pre_proc/detect_footnote.py
+1
-1
magic_pdf/pre_proc/detect_header.py
magic_pdf/pre_proc/detect_header.py
+1
-1
magic_pdf/pre_proc/detect_images.py
magic_pdf/pre_proc/detect_images.py
+1
-1
magic_pdf/pre_proc/detect_page_number.py
magic_pdf/pre_proc/detect_page_number.py
+1
-1
magic_pdf/pre_proc/detect_tables.py
magic_pdf/pre_proc/detect_tables.py
+1
-1
magic_pdf/pre_proc/equations_replace.py
magic_pdf/pre_proc/equations_replace.py
+1
-1
magic_pdf/pre_proc/fix_image.py
magic_pdf/pre_proc/fix_image.py
+2
-2
magic_pdf/pre_proc/fix_table.py
magic_pdf/pre_proc/fix_table.py
+2
-2
No files found.
pdf_tools
/para/title_processor.py
→
magic_pdf
/para/title_processor.py
View file @
d5dbed73
...
...
@@ -2,9 +2,9 @@ import os
import
re
import
numpy
as
np
from
pdf_tools
.libs.nlp_utils
import
NLPModels
from
magic_pdf
.libs.nlp_utils
import
NLPModels
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
sys
.
stdout
.
reconfigure
(
encoding
=
"utf-8"
)
# type: ignore
...
...
pdf_tools/pipeline
/pdf_parse_by_model.py
→
magic_pdf
/pdf_parse_by_model.py
View file @
d5dbed73
...
...
@@ -2,28 +2,28 @@ import time
# from anyio import Path
from
pdf_tools
.libs.commons
import
fitz
,
get_delta_time
,
get_img_s3_client
from
magic_pdf
.libs.commons
import
fitz
,
get_delta_time
,
get_img_s3_client
import
json
import
os
import
math
from
loguru
import
logger
from
pdf_tools
.layout.bbox_sort
import
(
from
magic_pdf
.layout.bbox_sort
import
(
prepare_bboxes_for_layout_split
,
)
from
pdf_tools
.layout.layout_sort
import
LAYOUT_UNPROC
,
get_bboxes_layout
,
get_columns_cnt_of_layout
,
sort_text_block
from
pdf_tools
.libs.drop_reason
import
DropReason
from
pdf_tools
.libs.markdown_utils
import
escape_special_markdown_char
from
pdf_tools
.libs.safe_filename
import
sanitize_filename
from
pdf_tools
.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
pdf_tools
.pre_proc.detect_images
import
parse_images
from
pdf_tools
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
pdf_tools
.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
pdf_tools
.pre_proc.detect_header
import
parse_headers
# 获取headers的bbox
from
pdf_tools
.pre_proc.detect_page_number
import
parse_pageNos
# 获取pageNos的bbox
from
pdf_tools
.pre_proc.detect_footnote
import
parse_footnotes_by_model
,
parse_footnotes_by_rule
# 获取footnotes的bbox
from
pdf_tools
.pre_proc.detect_footer_by_model
import
parse_footers
# 获取footers的bbox
from
pdf_tools
.post_proc.detect_para
import
(
from
magic_pdf
.layout.layout_sort
import
LAYOUT_UNPROC
,
get_bboxes_layout
,
get_columns_cnt_of_layout
,
sort_text_block
from
magic_pdf
.libs.drop_reason
import
DropReason
from
magic_pdf
.libs.markdown_utils
import
escape_special_markdown_char
from
magic_pdf
.libs.safe_filename
import
sanitize_filename
from
magic_pdf
.libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
magic_pdf
.pre_proc.detect_images
import
parse_images
from
magic_pdf
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf
.pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
magic_pdf
.pre_proc.detect_header
import
parse_headers
# 获取headers的bbox
from
magic_pdf
.pre_proc.detect_page_number
import
parse_pageNos
# 获取pageNos的bbox
from
magic_pdf
.pre_proc.detect_footnote
import
parse_footnotes_by_model
,
parse_footnotes_by_rule
# 获取footnotes的bbox
from
magic_pdf
.pre_proc.detect_footer_by_model
import
parse_footers
# 获取footers的bbox
from
magic_pdf
.post_proc.detect_para
import
(
ParaProcessPipeline
,
TitleDetectionException
,
TitleLevelException
,
...
...
@@ -31,9 +31,9 @@ from pdf_tools.post_proc.detect_para import (
ParaMergeException
,
DenseSingleLineBlockException
,
)
from
pdf_tools
.pre_proc.main_text_font
import
get_main_text_font
from
pdf_tools
.pre_proc.remove_colored_strip_bbox
import
remove_colored_strip_textblock
from
pdf_tools
.pre_proc.remove_footer_header
import
remove_headder_footer_one_page
from
magic_pdf
.pre_proc.main_text_font
import
get_main_text_font
from
magic_pdf
.pre_proc.remove_colored_strip_bbox
import
remove_colored_strip_textblock
from
magic_pdf
.pre_proc.remove_footer_header
import
remove_headder_footer_one_page
'''
from para.para_pipeline import ParaProcessPipeline
...
...
@@ -46,19 +46,19 @@ from para.exceptions import (
)
'''
from
pdf_tools
.libs.commons
import
read_file
,
join_path
from
pdf_tools
.libs.pdf_image_tools
import
save_images_by_bboxes
from
pdf_tools
.post_proc.remove_footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
pdf_tools
.pre_proc.citationmarker_remove
import
remove_citation_marker
from
pdf_tools
.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
pdf_tools
.pre_proc.pdf_pre_filter
import
pdf_filter
from
pdf_tools
.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
pdf_tools
.pre_proc.construct_paras
import
construct_page_component
from
pdf_tools
.pre_proc.fix_image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
pdf_tools
.post_proc.pdf_post_filter
import
pdf_post_filter
from
pdf_tools
.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
pdf_tools
.pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
pdf_tools
.pre_proc.fix_table
import
fix_table_text_block
,
fix_tables
,
include_table_title
from
magic_pdf
.libs.commons
import
read_file
,
join_path
from
magic_pdf
.libs.pdf_image_tools
import
save_images_by_bboxes
from
magic_pdf
.post_proc.remove_footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
magic_pdf
.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf
.pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
magic_pdf
.pre_proc.pdf_pre_filter
import
pdf_filter
from
magic_pdf
.pre_proc.detect_footer_header_by_statistics
import
drop_footer_header
from
magic_pdf
.pre_proc.construct_paras
import
construct_page_component
from
magic_pdf
.pre_proc.fix_image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
magic_pdf
.post_proc.pdf_post_filter
import
pdf_post_filter
from
magic_pdf
.pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
magic_pdf
.pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
magic_pdf
.pre_proc.fix_table
import
fix_table_text_block
,
fix_tables
,
include_table_title
denseSingleLineBlockException_msg
=
DenseSingleLineBlockException
().
message
titleDetectionException_msg
=
TitleDetectionException
().
message
...
...
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode
=
False
,
):
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../..
/..
"
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../.."
,
"tmp"
,
"unittest"
)
md_bookname_save_path
=
""
book_name
=
sanitize_filename
(
book_name
)
if
debug_mode
:
...
...
pdf_tools/pipeline
/__init__.py
→
magic_pdf/post_proc
/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/post_proc/detect_para.py
→
magic_pdf
/post_proc/detect_para.py
View file @
d5dbed73
...
...
@@ -11,8 +11,8 @@ import numpy as np
from
termcolor
import
cprint
from
pdf_tools
.libs.commons
import
fitz
from
pdf_tools
.libs.nlp_utils
import
NLPModels
from
magic_pdf
.libs.commons
import
fitz
from
magic_pdf
.libs.nlp_utils
import
NLPModels
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/post_proc/pdf_post_filter.py
→
magic_pdf
/post_proc/pdf_post_filter.py
View file @
d5dbed73
from
loguru
import
logger
from
pdf_tools
.layout.layout_sort
import
get_columns_cnt_of_layout
from
pdf_tools
.libs.drop_reason
import
DropReason
from
magic_pdf
.layout.layout_sort
import
get_columns_cnt_of_layout
from
magic_pdf
.libs.drop_reason
import
DropReason
def
__is_pseudo_single_column
(
page_info
)
->
bool
:
...
...
pdf_tools
/post_proc/remove_footnote.py
→
magic_pdf
/post_proc/remove_footnote.py
View file @
d5dbed73
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
import
collections
# 统计库
...
...
pdf_tools/post
_proc/__init__.py
→
magic_pdf/pre
_proc/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/citationmarker_remove.py
→
magic_pdf
/pre_proc/citationmarker_remove.py
View file @
d5dbed73
...
...
@@ -3,7 +3,7 @@
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import
re
from
pdf_tools
.libs.nlp_utils
import
NLPModels
from
magic_pdf
.libs.nlp_utils
import
NLPModels
__NLP_MODEL
=
NLPModels
()
...
...
pdf_tools
/pre_proc/construct_paras.py
→
magic_pdf
/pre_proc/construct_paras.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/detect_equation.py
→
magic_pdf
/pre_proc/detect_equation.py
View file @
d5dbed73
from
pdf_tools
.libs.boxbase
import
_is_in
# 正则
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.boxbase
import
_is_in
# 正则
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
__solve_contain_bboxs
(
all_bbox_list
:
list
):
...
...
pdf_tools
/pre_proc/detect_footer_by_model.py
→
magic_pdf
/pre_proc/detect_footer_by_model.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_footers
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/detect_footer_header_by_statistics.py
→
magic_pdf
/pre_proc/detect_footer_header_by_statistics.py
View file @
d5dbed73
from
collections
import
defaultdict
from
pdf_tools
.libs.boxbase
import
calculate_iou
from
magic_pdf
.libs.boxbase
import
calculate_iou
def
compare_bbox_with_list
(
bbox
,
bbox_list
,
tolerance
=
1
):
...
...
pdf_tools
/pre_proc/detect_footnote.py
→
magic_pdf
/pre_proc/detect_footnote.py
View file @
d5dbed73
from
collections
import
Counter
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
,
debug_mode
=
False
):
...
...
pdf_tools
/pre_proc/detect_header.py
→
magic_pdf
/pre_proc/detect_header.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_headers
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/detect_images.py
→
magic_pdf
/pre_proc/detect_images.py
View file @
d5dbed73
import
collections
# 统计库
import
re
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------#
...
...
pdf_tools
/pre_proc/detect_page_number.py
→
magic_pdf
/pre_proc/detect_page_number.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_pageNos
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/detect_tables.py
→
magic_pdf
/pre_proc/detect_tables.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_tables
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pdf_tools
/pre_proc/equations_replace.py
→
magic_pdf
/pre_proc/equations_replace.py
View file @
d5dbed73
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
import
json
import
os
from
pathlib
import
Path
...
...
pdf_tools
/pre_proc/fix_image.py
→
magic_pdf
/pre_proc/fix_image.py
View file @
d5dbed73
...
...
@@ -2,9 +2,9 @@
import
re
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
from
pdf_tools
.libs.textbase
import
get_text_block_base_info
from
magic_pdf
.libs.textbase
import
get_text_block_base_info
def
fix_image_vertical
(
image_bboxes
:
list
,
text_blocks
:
list
):
"""
...
...
pdf_tools
/pre_proc/fix_table.py
→
magic_pdf
/pre_proc/fix_table.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
import
re
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
# json
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
# json
## version 2
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment