Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d5dbed73
Commit
d5dbed73
authored
Mar 01, 2024
by
赵小蒙
Browse files
目录重构
parent
7c7910e4
Changes
85
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
23 additions
and
23 deletions
+23
-23
demo/download.py
demo/download.py
+1
-1
demo/draw_bbox.py
demo/draw_bbox.py
+1
-1
demo/pdf2md.py
demo/pdf2md.py
+3
-3
magic_pdf/__init__.py
magic_pdf/__init__.py
+0
-0
magic_pdf/dict2md/__init__.py
magic_pdf/dict2md/__init__.py
+0
-0
magic_pdf/dict2md/mkcontent.py
magic_pdf/dict2md/mkcontent.py
+1
-1
magic_pdf/filter/__init__.py
magic_pdf/filter/__init__.py
+0
-0
magic_pdf/filter/pdf_classify_by_type.py
magic_pdf/filter/pdf_classify_by_type.py
+2
-2
magic_pdf/filter/pdf_meta_scan.py
magic_pdf/filter/pdf_meta_scan.py
+4
-4
magic_pdf/layout/__init__.py
magic_pdf/layout/__init__.py
+0
-0
magic_pdf/layout/bbox_sort.py
magic_pdf/layout/bbox_sort.py
+3
-3
magic_pdf/layout/layout_det_utils.py
magic_pdf/layout/layout_det_utils.py
+2
-2
magic_pdf/layout/layout_sort.py
magic_pdf/layout/layout_sort.py
+3
-3
magic_pdf/layout/layout_spiler_recog.py
magic_pdf/layout/layout_spiler_recog.py
+2
-2
magic_pdf/layout/mcol_sort.py
magic_pdf/layout/mcol_sort.py
+1
-1
magic_pdf/libs/__init__.py
magic_pdf/libs/__init__.py
+0
-0
magic_pdf/libs/boxbase.py
magic_pdf/libs/boxbase.py
+0
-0
magic_pdf/libs/calc_span_stats.py
magic_pdf/libs/calc_span_stats.py
+0
-0
magic_pdf/libs/commons.py
magic_pdf/libs/commons.py
+0
-0
magic_pdf/libs/drop_reason.py
magic_pdf/libs/drop_reason.py
+0
-0
No files found.
demo/download.py
View file @
d5dbed73
...
@@ -2,7 +2,7 @@ import json
...
@@ -2,7 +2,7 @@ import json
import
os
import
os
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
pdf_tools
.libs
import
join_path
from
magic_pdf
.libs
import
join_path
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
samples
=
json
.
load
(
f
)
...
...
demo/draw_bbox.py
View file @
d5dbed73
from
pdf_tools
.libs
import
fitz
# PyMuPDF
from
magic_pdf
.libs
import
fitz
# PyMuPDF
# PDF文件路径
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
...
...
demo/pdf2md.py
View file @
d5dbed73
...
@@ -5,9 +5,9 @@ from pathlib import Path
...
@@ -5,9 +5,9 @@ from pathlib import Path
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
from
pdf_tools
.libs
import
join_path
from
magic_pdf
.libs
import
join_path
from
pdf_tools
.dict2md.mkcontent
import
mk_mm_markdown
from
magic_pdf
.dict2md.mkcontent
import
mk_mm_markdown
from
pdf_tools
.pipeline
import
parse_pdf_by_model
from
magic_pdf
.pipeline
import
parse_pdf_by_model
...
...
pdf_tools
/__init__.py
→
magic_pdf
/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/dict2md/__init__.py
→
magic_pdf
/dict2md/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/dict2md/mkcontent.py
→
magic_pdf
/dict2md/mkcontent.py
View file @
d5dbed73
import
math
import
math
from
loguru
import
logger
from
loguru
import
logger
from
pdf_tools
.libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
from
magic_pdf
.libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
def
mk_nlp_markdown
(
para_dict
:
dict
):
def
mk_nlp_markdown
(
para_dict
:
dict
):
...
...
pdf_tools
/filter/__init__.py
→
magic_pdf
/filter/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/filter/pdf_classify_by_type.py
→
magic_pdf
/filter/pdf_classify_by_type.py
View file @
d5dbed73
...
@@ -16,8 +16,8 @@ from collections import Counter
...
@@ -16,8 +16,8 @@ from collections import Counter
import
click
import
click
import
numpy
as
np
import
numpy
as
np
from
pdf_tools
.libs.commons
import
mymax
,
get_top_percent_list
from
magic_pdf
.libs.commons
import
mymax
,
get_top_percent_list
from
pdf_tools
.filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
from
magic_pdf
.filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
TEXT_LEN_THRESHOLD
=
100
TEXT_LEN_THRESHOLD
=
100
AVG_TEXT_LEN_THRESHOLD
=
200
AVG_TEXT_LEN_THRESHOLD
=
200
...
...
pdf_tools
/filter/pdf_meta_scan.py
→
magic_pdf
/filter/pdf_meta_scan.py
View file @
d5dbed73
...
@@ -5,13 +5,13 @@
...
@@ -5,13 +5,13 @@
import
sys
import
sys
import
click
import
click
from
pdf_tools
.libs.commons
import
read_file
,
mymax
,
get_top_percent_list
from
magic_pdf
.libs.commons
import
read_file
,
mymax
,
get_top_percent_list
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
loguru
import
logger
from
loguru
import
logger
from
collections
import
Counter
from
collections
import
Counter
from
pdf_tools
.libs.drop_reason
import
DropReason
from
magic_pdf
.libs.drop_reason
import
DropReason
from
pdf_tools
.libs.language
import
detect_lang
from
magic_pdf
.libs.language
import
detect_lang
scan_max_page
=
50
scan_max_page
=
50
junk_limit_min
=
10
junk_limit_min
=
10
...
...
pdf_tools
/layout/__init__.py
→
magic_pdf
/layout/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/layout/bbox_sort.py
→
magic_pdf
/layout/bbox_sort.py
View file @
d5dbed73
...
@@ -3,9 +3,9 @@
...
@@ -3,9 +3,9 @@
from
pdf_tools
.layout.layout_spiler_recog
import
get_spilter_of_page
from
magic_pdf
.layout.layout_spiler_recog
import
get_spilter_of_page
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
pdf_tools
.libs.commons
import
mymax
from
magic_pdf
.libs.commons
import
mymax
X0_IDX
=
0
X0_IDX
=
0
Y0_IDX
=
1
Y0_IDX
=
1
...
...
pdf_tools
/layout/layout_det_utils.py
→
magic_pdf
/layout/layout_det_utils.py
View file @
d5dbed73
from
pdf_tools
.layout.bbox_sort
import
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
from
magic_pdf
.layout.bbox_sort
import
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
from
pdf_tools
.libs.boxbase
import
_is_bottom_full_overlap
,
_left_intersect
,
_right_intersect
from
magic_pdf
.libs.boxbase
import
_is_bottom_full_overlap
,
_left_intersect
,
_right_intersect
def
find_all_left_bbox_direct
(
this_bbox
,
all_bboxes
)
->
list
:
def
find_all_left_bbox_direct
(
this_bbox
,
all_bboxes
)
->
list
:
...
...
pdf_tools
/layout/layout_sort.py
→
magic_pdf
/layout/layout_sort.py
View file @
d5dbed73
...
@@ -3,9 +3,9 @@
...
@@ -3,9 +3,9 @@
"""
"""
from
loguru
import
logger
from
loguru
import
logger
from
pdf_tools
.layout.bbox_sort
import
CONTENT_IDX
,
CONTENT_TYPE_IDX
,
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
,
paper_bbox_sort
from
magic_pdf
.layout.bbox_sort
import
CONTENT_IDX
,
CONTENT_TYPE_IDX
,
X0_EXT_IDX
,
X0_IDX
,
X1_EXT_IDX
,
X1_IDX
,
Y0_EXT_IDX
,
Y0_IDX
,
Y1_EXT_IDX
,
Y1_IDX
,
paper_bbox_sort
from
pdf_tools
.layout.layout_det_utils
import
find_all_left_bbox_direct
,
find_all_right_bbox_direct
,
find_bottom_bbox_direct_from_left_edge
,
find_bottom_bbox_direct_from_right_edge
,
find_top_bbox_direct_from_left_edge
,
find_top_bbox_direct_from_right_edge
,
find_all_top_bbox_direct
,
find_all_bottom_bbox_direct
,
get_left_edge_bboxes
,
get_right_edge_bboxes
from
magic_pdf
.layout.layout_det_utils
import
find_all_left_bbox_direct
,
find_all_right_bbox_direct
,
find_bottom_bbox_direct_from_left_edge
,
find_bottom_bbox_direct_from_right_edge
,
find_top_bbox_direct_from_left_edge
,
find_top_bbox_direct_from_right_edge
,
find_all_top_bbox_direct
,
find_all_bottom_bbox_direct
,
get_left_edge_bboxes
,
get_right_edge_bboxes
from
pdf_tools
.libs.boxbase
import
get_bbox_in_boundry
from
magic_pdf
.libs.boxbase
import
get_bbox_in_boundry
LAYOUT_V
=
"V"
LAYOUT_V
=
"V"
...
...
pdf_tools
/layout/layout_spiler_recog.py
→
magic_pdf
/layout/layout_spiler_recog.py
View file @
d5dbed73
...
@@ -3,8 +3,8 @@
...
@@ -3,8 +3,8 @@
"""
"""
import
os
import
os
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
def
__rect_filter_by_width
(
rect
,
page_w
,
page_h
):
def
__rect_filter_by_width
(
rect
,
page_w
,
page_h
):
...
...
pdf_tools
/layout/mcol_sort.py
→
magic_pdf
/layout/mcol_sort.py
View file @
d5dbed73
...
@@ -50,7 +50,7 @@ Usage
...
@@ -50,7 +50,7 @@ Usage
----------------------------------------------------------------------------------
----------------------------------------------------------------------------------
"""
"""
import
sys
import
sys
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
def
column_boxes
(
page
,
footer_margin
=
50
,
header_margin
=
50
,
no_image_text
=
True
):
def
column_boxes
(
page
,
footer_margin
=
50
,
header_margin
=
50
,
no_image_text
=
True
):
...
...
pdf_tools
/libs/__init__.py
→
magic_pdf
/libs/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/libs/boxbase.py
→
magic_pdf
/libs/boxbase.py
View file @
d5dbed73
File moved
pdf_tools
/libs/calc_span_stats.py
→
magic_pdf
/libs/calc_span_stats.py
View file @
d5dbed73
File moved
pdf_tools
/libs/commons.py
→
magic_pdf
/libs/commons.py
View file @
d5dbed73
File moved
pdf_tools
/libs/drop_reason.py
→
magic_pdf
/libs/drop_reason.py
View file @
d5dbed73
File moved
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment