Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f99149b8
Commit
f99149b8
authored
Mar 01, 2024
by
赵小蒙
Browse files
重构目录结构
parent
59bc15e0
Changes
133
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
21 additions
and
52 deletions
+21
-52
pdf_tools/pre_proc/detect_footer_by_model.py
pdf_tools/pre_proc/detect_footer_by_model.py
+1
-5
pdf_tools/pre_proc/detect_footer_header_by_statistics.py
pdf_tools/pre_proc/detect_footer_header_by_statistics.py
+1
-3
pdf_tools/pre_proc/detect_footnote.py
pdf_tools/pre_proc/detect_footnote.py
+1
-4
pdf_tools/pre_proc/detect_header.py
pdf_tools/pre_proc/detect_header.py
+1
-5
pdf_tools/pre_proc/detect_images.py
pdf_tools/pre_proc/detect_images.py
+1
-4
pdf_tools/pre_proc/detect_page_number.py
pdf_tools/pre_proc/detect_page_number.py
+1
-5
pdf_tools/pre_proc/detect_tables.py
pdf_tools/pre_proc/detect_tables.py
+1
-5
pdf_tools/pre_proc/equations_replace.py
pdf_tools/pre_proc/equations_replace.py
+1
-1
pdf_tools/pre_proc/fix_image.py
pdf_tools/pre_proc/fix_image.py
+2
-3
pdf_tools/pre_proc/fix_table.py
pdf_tools/pre_proc/fix_table.py
+2
-6
pdf_tools/pre_proc/main_text_font.py
pdf_tools/pre_proc/main_text_font.py
+0
-0
pdf_tools/pre_proc/pdf_pre_filter.py
pdf_tools/pre_proc/pdf_pre_filter.py
+3
-3
pdf_tools/pre_proc/post_layout_split.py
pdf_tools/pre_proc/post_layout_split.py
+0
-0
pdf_tools/pre_proc/remove_colored_strip_bbox.py
pdf_tools/pre_proc/remove_colored_strip_bbox.py
+2
-2
pdf_tools/pre_proc/remove_footer_header.py
pdf_tools/pre_proc/remove_footer_header.py
+1
-1
pdf_tools/pre_proc/remove_rotate_bbox.py
pdf_tools/pre_proc/remove_rotate_bbox.py
+2
-4
pdf_tools/pre_proc/resolve_bbox_conflict.py
pdf_tools/pre_proc/resolve_bbox_conflict.py
+1
-1
pdf_tools/pre_proc/statistics.py
pdf_tools/pre_proc/statistics.py
+0
-0
tests/assets/more_para_test_samples/gift_files.txt
tests/assets/more_para_test_samples/gift_files.txt
+0
-0
tests/assets/more_para_test_samples/scihub_files.txt
tests/assets/more_para_test_samples/scihub_files.txt
+0
-0
No files found.
pre_proc/detect_footer_by_model.py
→
pdf_tools/
pre_proc/detect_footer_by_model.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_footers
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pre_proc/detect_footer_header_by_statistics.py
→
pdf_tools/
pre_proc/detect_footer_header_by_statistics.py
View file @
f99149b8
from
collections
import
defaultdict
from
loguru
import
logger
from
libs.boxbase
import
_is_in
,
calculate_iou
from
pdf_tools.libs.boxbase
import
calculate_iou
def
compare_bbox_with_list
(
bbox
,
bbox_list
,
tolerance
=
1
):
...
...
pre_proc/detect_footnote.py
→
pdf_tools/
pre_proc/detect_footnote.py
View file @
f99149b8
import
os
from
collections
import
Counter
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_footnotes_by_model
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
md_bookname_save_path
,
debug_mode
=
False
):
...
...
pre_proc/detect_header.py
→
pdf_tools/
pre_proc/detect_header.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_headers
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pre_proc/detect_images.py
→
pdf_tools/
pre_proc/detect_images.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
from
libs.boxbase
import
_is_in_or_part_overlap
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------#
...
...
pre_proc/detect_page_number.py
→
pdf_tools/
pre_proc/detect_page_number.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_pageNos
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pre_proc/detect_tables.py
→
pdf_tools/
pre_proc/detect_tables.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_tables
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
):
...
...
pre_proc/equations_replace.py
→
pdf_tools/
pre_proc/equations_replace.py
View file @
f99149b8
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
from
libs.commons
import
fitz
from
pdf_tools.
libs.commons
import
fitz
import
json
import
os
from
pathlib
import
Path
...
...
pre_proc/fix_image.py
→
pdf_tools/
pre_proc/fix_image.py
View file @
f99149b8
...
...
@@ -2,10 +2,9 @@
import
re
from
libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
_is_in
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
from
loguru
import
logger
from
pdf_tools.libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
from
libs.textbase
import
get_text_block_base_info
from
pdf_tools.
libs.textbase
import
get_text_block_base_info
def
fix_image_vertical
(
image_bboxes
:
list
,
text_blocks
:
list
):
"""
...
...
pre_proc/fix_table.py
→
pdf_tools/
pre_proc/fix_table.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
import
re
from
libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
# json
from
pdf_tools.
libs.boxbase
import
_is_in_or_part_overlap
,
_is_part_overlap
,
find_bottom_nearest_text_bbox
,
find_left_nearest_text_bbox
,
find_right_nearest_text_bbox
,
find_top_nearest_text_bbox
# json
## version 2
...
...
pre_proc/main_text_font.py
→
pdf_tools/
pre_proc/main_text_font.py
View file @
f99149b8
File moved
pre_proc/pdf_pre_filter.py
→
pdf_tools/
pre_proc/pdf_pre_filter.py
View file @
f99149b8
from
libs.commons
import
fitz
from
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
libs.drop_reason
import
DropReason
from
pdf_tools.
libs.commons
import
fitz
from
pdf_tools.
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
pdf_tools.
libs.drop_reason
import
DropReason
def
__area
(
box
):
...
...
test/assets/more_para_test_samples/gift_files.txt
→
pdf_tools/pre_proc/post_layout_split.py
View file @
f99149b8
File moved
pre_proc/remove_colored_strip_bbox.py
→
pdf_tools/
pre_proc/remove_colored_strip_bbox.py
View file @
f99149b8
from
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
pdf_tools.
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
loguru
import
logger
from
libs.drop_tag
import
COLOR_BG_HEADER_TXT_BLOCK
from
pdf_tools.
libs.drop_tag
import
COLOR_BG_HEADER_TXT_BLOCK
def
__area
(
box
):
...
...
pre_proc/remove_footer_header.py
→
pdf_tools/
pre_proc/remove_footer_header.py
View file @
f99149b8
import
re
from
libs.boxbase
import
_is_in_or_part_overlap
from
pdf_tools.
libs.boxbase
import
_is_in_or_part_overlap
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
...
...
pre_proc/remove_rotate_bbox.py
→
pdf_tools/
pre_proc/remove_rotate_bbox.py
View file @
f99149b8
import
json
import
math
from
libs.boxbase
import
is_vbox_on_side
from
pdf_tools.
libs.boxbase
import
is_vbox_on_side
def
detect_non_horizontal_texts
(
result_dict
):
...
...
@@ -84,7 +82,7 @@ def detect_non_horizontal_texts(result_dict):
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import
string
,
re
import
re
def
__is_a_word
(
sentence
):
# 如果输入是中文并且长度为1,则返回True
...
...
pre_proc/resolve_bbox_conflict.py
→
pdf_tools/
pre_proc/resolve_bbox_conflict.py
View file @
f99149b8
...
...
@@ -5,7 +5,7 @@
2. 然后去掉出现在文字blcok上的图片bbox
"""
from
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
,
calculate_iou
,
calculate_overlap_area_2_minbox_area_ratio
from
pdf_tools.
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
...
...
pre_proc/statistics.py
→
pdf_tools/
pre_proc/statistics.py
View file @
f99149b8
File moved
test/assets/more_para_test_samples/
zlib
_files.txt
→
test
s
/assets/more_para_test_samples/
gift
_files.txt
View file @
f99149b8
File moved
test/assets/more_para_test_samples/scihub_files.txt
→
test
s
/assets/more_para_test_samples/scihub_files.txt
View file @
f99149b8
File moved
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment