Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f99149b8
Commit
f99149b8
authored
Mar 01, 2024
by
赵小蒙
Browse files
重构目录结构
parent
59bc15e0
Changes
133
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
13 additions
and
18 deletions
+13
-18
pdf_tools/layout/layout_det_utils.py
pdf_tools/layout/layout_det_utils.py
+2
-2
pdf_tools/layout/layout_sort.py
pdf_tools/layout/layout_sort.py
+3
-4
pdf_tools/layout/layout_spiler_recog.py
pdf_tools/layout/layout_spiler_recog.py
+2
-2
pdf_tools/layout/mcol_sort.py
pdf_tools/layout/mcol_sort.py
+1
-2
pdf_tools/libs/__init__.py
pdf_tools/libs/__init__.py
+0
-0
pdf_tools/libs/boxbase.py
pdf_tools/libs/boxbase.py
+0
-0
pdf_tools/libs/calc_span_stats.py
pdf_tools/libs/calc_span_stats.py
+0
-0
pdf_tools/libs/commons.py
pdf_tools/libs/commons.py
+0
-0
pdf_tools/libs/drop_reason.py
pdf_tools/libs/drop_reason.py
+0
-0
pdf_tools/libs/drop_tag.py
pdf_tools/libs/drop_tag.py
+0
-0
pdf_tools/libs/json_compressor.py
pdf_tools/libs/json_compressor.py
+0
-0
pdf_tools/libs/language.py
pdf_tools/libs/language.py
+0
-0
pdf_tools/libs/markdown_utils.py
pdf_tools/libs/markdown_utils.py
+0
-0
pdf_tools/libs/nlp_utils.py
pdf_tools/libs/nlp_utils.py
+1
-1
pdf_tools/libs/pdf_image_tools.py
pdf_tools/libs/pdf_image_tools.py
+2
-2
pdf_tools/libs/safe_filename.py
pdf_tools/libs/safe_filename.py
+0
-0
pdf_tools/libs/textbase.py
pdf_tools/libs/textbase.py
+0
-0
pdf_tools/libs/vis_utils.py
pdf_tools/libs/vis_utils.py
+1
-3
pdf_tools/para/__init__.py
pdf_tools/para/__init__.py
+0
-0
pdf_tools/para/block_continuation_processor.py
pdf_tools/para/block_continuation_processor.py
+1
-2
No files found.
layout/layout_det_utils.py
→
pdf_tools/
layout/layout_det_utils.py
View file @
f99149b8
from
layout.bbox_sort
import
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
from
pdf_tools.
layout.bbox_sort
import
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
from
libs.boxbase
import
_is_bottom_full_overlap
,
_left_intersect
,
_right_intersect
from
pdf_tools.
libs.boxbase
import
_is_bottom_full_overlap
,
_left_intersect
,
_right_intersect
def
find_all_left_bbox_direct
(
this_bbox
,
all_bboxes
)
->
list
:
def
find_all_left_bbox_direct
(
this_bbox
,
all_bboxes
)
->
list
:
...
...
layout/layout_sort.py
→
pdf_tools/
layout/layout_sort.py
View file @
f99149b8
...
@@ -2,11 +2,10 @@
...
@@ -2,11 +2,10 @@
对pdf上的box进行layout识别,并对内部组成的box进行排序
对pdf上的box进行layout识别,并对内部组成的box进行排序
"""
"""
import
json
from
loguru
import
logger
from
loguru
import
logger
from
layout.bbox_sort
import
CONTENT_IDX
,
CONTENT_TYPE_IDX
,
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
,
paper_bbox_sort
from
pdf_tools.
layout.bbox_sort
import
CONTENT_IDX
,
CONTENT_TYPE_IDX
,
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
,
paper_bbox_sort
from
layout.layout_det_utils
import
find_all_left_bbox_direct
,
find_all_right_bbox_direct
,
find_bottom_bbox_direct_from_left_edge
,
find_bottom_bbox_direct_from_right_edge
,
find_top_bbox_direct_from_left_edge
,
find_top_bbox_direct_from_right_edge
,
find_all_top_bbox_direct
,
find_all_bottom_bbox_direct
,
get_left_edge_bboxes
,
get_right_edge_bboxes
from
pdf_tools.
layout.layout_det_utils
import
find_all_left_bbox_direct
,
find_all_right_bbox_direct
,
find_bottom_bbox_direct_from_left_edge
,
find_bottom_bbox_direct_from_right_edge
,
find_top_bbox_direct_from_left_edge
,
find_top_bbox_direct_from_right_edge
,
find_all_top_bbox_direct
,
find_all_bottom_bbox_direct
,
get_left_edge_bboxes
,
get_right_edge_bboxes
from
libs.boxbase
import
get_bbox_in_boundry
from
pdf_tools.
libs.boxbase
import
get_bbox_in_boundry
LAYOUT_V
=
"V"
LAYOUT_V
=
"V"
...
...
layout/layout_spiler_recog.py
→
pdf_tools/
layout/layout_spiler_recog.py
View file @
f99149b8
...
@@ -3,8 +3,8 @@
...
@@ -3,8 +3,8 @@
"""
"""
import
os
import
os
from
libs.commons
import
fitz
from
pdf_tools.
libs.commons
import
fitz
from
libs.boxbase
import
_is_in_or_part_overlap
from
pdf_tools.
libs.boxbase
import
_is_in_or_part_overlap
def
__rect_filter_by_width
(
rect
,
page_w
,
page_h
):
def
__rect_filter_by_width
(
rect
,
page_w
,
page_h
):
...
...
layout/mcol_sort.py
→
pdf_tools/
layout/mcol_sort.py
View file @
f99149b8
...
@@ -49,9 +49,8 @@ Usage
...
@@ -49,9 +49,8 @@ Usage
print(page.get_text(clip=rect, sort=True))
print(page.get_text(clip=rect, sort=True))
----------------------------------------------------------------------------------
----------------------------------------------------------------------------------
"""
"""
import
os
import
sys
import
sys
from
libs.commons
import
fitz
from
pdf_tools.
libs.commons
import
fitz
def
column_boxes
(
page
,
footer_margin
=
50
,
header_margin
=
50
,
no_image_text
=
True
):
def
column_boxes
(
page
,
footer_margin
=
50
,
header_margin
=
50
,
no_image_text
=
True
):
...
...
p
ara
/__init__.py
→
p
df_tools/libs
/__init__.py
View file @
f99149b8
File moved
libs/boxbase.py
→
pdf_tools/
libs/boxbase.py
View file @
f99149b8
File moved
libs/calc_span_stats.py
→
pdf_tools/
libs/calc_span_stats.py
View file @
f99149b8
File moved
libs/commons.py
→
pdf_tools/
libs/commons.py
View file @
f99149b8
File moved
libs/drop_reason.py
→
pdf_tools/
libs/drop_reason.py
View file @
f99149b8
File moved
libs/drop_tag.py
→
pdf_tools/
libs/drop_tag.py
View file @
f99149b8
File moved
libs/json_compressor.py
→
pdf_tools/
libs/json_compressor.py
View file @
f99149b8
File moved
libs/language.py
→
pdf_tools/
libs/language.py
View file @
f99149b8
File moved
libs/markdown_utils.py
→
pdf_tools/
libs/markdown_utils.py
View file @
f99149b8
File moved
libs/nlp_utils.py
→
pdf_tools/
libs/nlp_utils.py
View file @
f99149b8
...
@@ -10,7 +10,7 @@ import spacy
...
@@ -10,7 +10,7 @@ import spacy
import
en_core_web_sm
import
en_core_web_sm
import
zh_core_web_sm
import
zh_core_web_sm
from
libs.language
import
detect_lang
from
pdf_tools.
libs.language
import
detect_lang
class
NLPModels
:
class
NLPModels
:
...
...
libs/pdf_image_tools.py
→
pdf_tools/
libs/pdf_image_tools.py
View file @
f99149b8
...
@@ -4,9 +4,9 @@ from typing import Tuple
...
@@ -4,9 +4,9 @@ from typing import Tuple
import
io
import
io
# from app.common.s3 import get_s3_client
# from app.common.s3 import get_s3_client
from
libs.commons
import
fitz
from
pdf_tools.
libs.commons
import
fitz
from
loguru
import
logger
from
loguru
import
logger
from
libs.commons
import
parse_bucket_key
,
join_path
from
pdf_tools.
libs.commons
import
parse_bucket_key
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
...
...
libs/safe_filename.py
→
pdf_tools/
libs/safe_filename.py
View file @
f99149b8
File moved
libs/textbase.py
→
pdf_tools/
libs/textbase.py
View file @
f99149b8
File moved
libs/vis_utils.py
→
pdf_tools/
libs/vis_utils.py
View file @
f99149b8
from
libs.commons
import
fitz
from
pdf_tools.
libs.commons
import
fitz
import
os
import
os
from
loguru
import
logger
from
layout.bbox_sort
import
CONTENT_TYPE_IDX
def
draw_bbox_on_page
(
raw_pdf_doc
:
fitz
.
Document
,
paras_dict
:
dict
,
save_path
:
str
):
def
draw_bbox_on_page
(
raw_pdf_doc
:
fitz
.
Document
,
paras_dict
:
dict
,
save_path
:
str
):
...
...
p
ipeline
/__init__.py
→
p
df_tools/para
/__init__.py
View file @
f99149b8
File moved
para/block_continuation_processor.py
→
pdf_tools/
para/block_continuation_processor.py
View file @
f99149b8
import
os
import
os
import
sys
import
unicodedata
import
unicodedata
from
para.commons
import
*
from
pdf_tools.
para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
if
sys
.
version_info
[
0
]
>=
3
:
...
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment