Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f99149b8
Commit
f99149b8
authored
Mar 01, 2024
by
赵小蒙
Browse files
重构目录结构
parent
59bc15e0
Changes
133
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
34 additions
and
52 deletions
+34
-52
.gitignore
.gitignore
+1
-1
demo/download.py
demo/download.py
+1
-1
demo/draw_bbox.py
demo/draw_bbox.py
+1
-1
demo/pdf2md.py
demo/pdf2md.py
+3
-3
othoers/check_inline_formula.py
othoers/check_inline_formula.py
+1
-1
othoers/pdf2json_infer.py
othoers/pdf2json_infer.py
+7
-7
othoers/pdf2text_evaluatePdfLayout.py
othoers/pdf2text_evaluatePdfLayout.py
+1
-5
othoers/pdf2text_getNumberOfColumn.py
othoers/pdf2text_getNumberOfColumn.py
+1
-1
othoers/pdf2text_recogFootnoteLine.py
othoers/pdf2text_recogFootnoteLine.py
+2
-9
othoers/pdf2text_recogPara_v2.py
othoers/pdf2text_recogPara_v2.py
+3
-3
othoers/pdf2text_recogTitle.py
othoers/pdf2text_recogTitle.py
+1
-5
othoers/vali_bbox_sort.py
othoers/vali_bbox_sort.py
+2
-2
pdf_tools/__init__.py
pdf_tools/__init__.py
+0
-0
pdf_tools/dict2md/__init__.py
pdf_tools/dict2md/__init__.py
+0
-0
pdf_tools/dict2md/mkcontent.py
pdf_tools/dict2md/mkcontent.py
+1
-2
pdf_tools/filter/__init__.py
pdf_tools/filter/__init__.py
+0
-0
pdf_tools/filter/pdf_classify_by_type.py
pdf_tools/filter/pdf_classify_by_type.py
+2
-2
pdf_tools/filter/pdf_meta_scan.py
pdf_tools/filter/pdf_meta_scan.py
+4
-6
pdf_tools/layout/__init__.py
pdf_tools/layout/__init__.py
+0
-0
pdf_tools/layout/bbox_sort.py
pdf_tools/layout/bbox_sort.py
+3
-3
No files found.
.gitignore
View file @
f99149b8
...
...
@@ -31,6 +31,6 @@ tmp/
tmp
.vscode
.vscode/
/test/
/test
s
/
/app/pdf_toolbox/test/test_bookname.txt
demo/download.py
View file @
f99149b8
...
...
@@ -2,7 +2,7 @@ import json
import
os
from
tqdm
import
tqdm
from
libs.common
s
import
join_path
from
pdf_tools.lib
s
import
join_path
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
demo/draw_bbox.py
View file @
f99149b8
from
libs.common
s
import
fitz
# PyMuPDF
from
pdf_tools.lib
s
import
fitz
# PyMuPDF
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
...
...
demo/pdf2md.py
View file @
f99149b8
...
...
@@ -5,9 +5,9 @@ from pathlib import Path
import
click
from
loguru
import
logger
from
libs.common
s
import
join_path
from
dict2md.mkcontent
import
mk_mm_markdown
from
p
ipeline.pdf_parse_by_model
import
parse_pdf_by_model
from
pdf_tools.lib
s
import
join_path
from
pdf_tools.
dict2md.mkcontent
import
mk_mm_markdown
from
p
df_tools.pipeline
import
parse_pdf_by_model
...
...
othoers/check_inline_formula.py
View file @
f99149b8
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from
libs.common
s
import
fitz
from
pdf_tools.lib
s
import
fitz
def
check_inline_formula
(
page
,
inline_formula_boxes
):
...
...
othoers/pdf2json_infer.py
View file @
f99149b8
...
...
@@ -3,7 +3,7 @@ from typing import Tuple
import
os
import
boto3
,
json
from
botocore.config
import
Config
from
libs.common
s
import
fitz
from
pdf_tools.lib
s
import
fitz
from
loguru
import
logger
from
pathlib
import
Path
from
tqdm
import
tqdm
...
...
@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from
layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
p
re_proc.detect_images
import
parse_images
# 获取figures的bbox
from
pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
p
re_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
pdf_tools.
layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
p
df_tools.pre_proc
import
parse_images
# 获取figures的bbox
from
pdf_tools.
pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
p
df_tools.pre_proc
import
parse_equations
# 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from
post_proc.detect_para
import
process_blocks_per_page
from
libs.common
s
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
from
pdf_tools.
post_proc.detect_para
import
process_blocks_per_page
from
pdf_tools.lib
s
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_profile
:
str
):
...
...
othoers/pdf2text_evaluatePdfLayout.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs
import
fitz
# pyMuPDF库
def
calculate_overlapRatio_between_rect1_and_rect2
(
L1
:
float
,
U1
:
float
,
R1
:
float
,
D1
:
float
,
L2
:
float
,
U2
:
float
,
R2
:
float
,
D2
:
float
)
->
(
float
,
float
):
...
...
othoers/pdf2text_getNumberOfColumn.py
View file @
f99149b8
from
libs.common
s
import
fitz
from
pdf_tools.lib
s
import
fitz
from
typing
import
List
...
...
othoers/pdf2text_recogFootnoteLine.py
View file @
f99149b8
import
io
import
re
import
os
import
json
from
libs.boxbase
import
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
libs.commons
import
fitz
from
fitz
import
Point
from
pprint
import
pprint
import
pickle
from
pdf_tools.libs
import
_is_in_or_part_overlap
from
pdf_tools.libs
import
fitz
import
collections
from
typing
import
List
def
calculate_overlapRatio_between_rect1_and_rect2
(
L1
:
float
,
U1
:
float
,
R1
:
float
,
D1
:
float
,
L2
:
float
,
U2
:
float
,
R2
:
float
,
D2
:
float
)
->
(
float
,
float
):
...
...
othoers/pdf2text_recogPara_v2.py
View file @
f99149b8
...
...
@@ -11,8 +11,8 @@ import numpy as np
from
termcolor
import
cprint
from
libs.common
s
import
fitz
from
libs.nlp_util
s
import
NLPModels
from
pdf_tools.lib
s
import
fitz
from
pdf_tools.lib
s
import
NLPModels
if
sys
.
version_info
[
0
]
>=
3
:
...
...
@@ -3478,7 +3478,7 @@ Params:
if
__name__
==
"__main__"
:
DEFAULT_PDF_PATH
=
(
"app/pdf_toolbox/test/assets/paper/paper.pdf"
if
os
.
name
!=
"nt"
else
"app
\\
pdf_toolbox
\\
test
\\
assets
\\
paper
\\
paper.pdf"
"app/pdf_toolbox/test
s
/assets/paper/paper.pdf"
if
os
.
name
!=
"nt"
else
"app
\\
pdf_toolbox
\\
test
s
\\
assets
\\
paper
\\
paper.pdf"
)
input_pdf_path
=
sys
.
argv
[
1
]
if
len
(
sys
.
argv
)
>
1
else
DEFAULT_PDF_PATH
output_pdf_path
=
sys
.
argv
[
2
]
if
len
(
sys
.
argv
)
>
2
else
input_pdf_path
.
split
(
"."
)[
0
]
+
"_recogPara.pdf"
...
...
othoers/pdf2text_recogTitle.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_titles
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
exclude_bboxes
):
...
...
othoers/vali_bbox_sort.py
View file @
f99149b8
import
numpy
as
np
import
tqdm
import
json
from
validation
import
cal_edit_distance
,
format_gt_bbox
,
label_match
,
detect_val
from
layout.layout_sort
import
sort_with_layout
from
validation
import
cal_edit_distance
,
format_gt_bbox
from
pdf_tools.
layout.layout_sort
import
sort_with_layout
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
dict2md
/__init__.py
→
pdf_tools
/__init__.py
View file @
f99149b8
File moved
filter
/__init__.py
→
pdf_tools/dict2md
/__init__.py
View file @
f99149b8
File moved
dict2md/mkcontent.py
→
pdf_tools/
dict2md/mkcontent.py
View file @
f99149b8
import
re
import
math
from
loguru
import
logger
from
libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
from
pdf_tools.
libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
def
mk_nlp_markdown
(
para_dict
:
dict
):
...
...
layout
/__init__.py
→
pdf_tools/filter
/__init__.py
View file @
f99149b8
File moved
filter/pdf_classify_by_type.py
→
pdf_tools/
filter/pdf_classify_by_type.py
View file @
f99149b8
...
...
@@ -16,8 +16,8 @@ from collections import Counter
import
click
import
numpy
as
np
from
libs.commons
import
mymax
,
get_top_percent_list
from
filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
from
pdf_tools.
libs.commons
import
mymax
,
get_top_percent_list
from
pdf_tools.
filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
TEXT_LEN_THRESHOLD
=
100
AVG_TEXT_LEN_THRESHOLD
=
200
...
...
filter/pdf_meta_scan.py
→
pdf_tools/
filter/pdf_meta_scan.py
View file @
f99149b8
...
...
@@ -2,18 +2,16 @@
输入: s3路径,每行一个
输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
"""
import
math
import
sys
import
click
from
libs.commons
import
read_file
,
mymax
,
get_top_percent_list
import
json
from
libs.commons
import
fitz
from
pdf_tools.libs.commons
import
read_file
,
mymax
,
get_top_percent_list
from
pdf_tools.libs.commons
import
fitz
from
loguru
import
logger
from
collections
import
Counter
from
libs.drop_reason
import
DropReason
from
libs.language
import
detect_lang
from
pdf_tools.
libs.drop_reason
import
DropReason
from
pdf_tools.
libs.language
import
detect_lang
scan_max_page
=
50
junk_limit_min
=
10
...
...
libs
/__init__.py
→
pdf_tools/layout
/__init__.py
View file @
f99149b8
File moved
layout/bbox_sort.py
→
pdf_tools/
layout/bbox_sort.py
View file @
f99149b8
...
...
@@ -3,9 +3,9 @@
from
layout.layout_spiler_recog
import
get_spilter_of_page
from
libs.boxbase
import
_is_bottom_full_overlap
,
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
libs.commons
import
mymax
from
pdf_tools.
layout.layout_spiler_recog
import
get_spilter_of_page
from
pdf_tools.
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
pdf_tools.
libs.commons
import
mymax
X0_IDX
=
0
Y0_IDX
=
1
...
...
Prev
1
2
3
4
5
…
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment