Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f99149b8
Commit
f99149b8
authored
Mar 01, 2024
by
赵小蒙
Browse files
重构目录结构
parent
59bc15e0
Changes
133
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
34 additions
and
52 deletions
+34
-52
.gitignore
.gitignore
+1
-1
demo/download.py
demo/download.py
+1
-1
demo/draw_bbox.py
demo/draw_bbox.py
+1
-1
demo/pdf2md.py
demo/pdf2md.py
+3
-3
othoers/check_inline_formula.py
othoers/check_inline_formula.py
+1
-1
othoers/pdf2json_infer.py
othoers/pdf2json_infer.py
+7
-7
othoers/pdf2text_evaluatePdfLayout.py
othoers/pdf2text_evaluatePdfLayout.py
+1
-5
othoers/pdf2text_getNumberOfColumn.py
othoers/pdf2text_getNumberOfColumn.py
+1
-1
othoers/pdf2text_recogFootnoteLine.py
othoers/pdf2text_recogFootnoteLine.py
+2
-9
othoers/pdf2text_recogPara_v2.py
othoers/pdf2text_recogPara_v2.py
+3
-3
othoers/pdf2text_recogTitle.py
othoers/pdf2text_recogTitle.py
+1
-5
othoers/vali_bbox_sort.py
othoers/vali_bbox_sort.py
+2
-2
pdf_tools/__init__.py
pdf_tools/__init__.py
+0
-0
pdf_tools/dict2md/__init__.py
pdf_tools/dict2md/__init__.py
+0
-0
pdf_tools/dict2md/mkcontent.py
pdf_tools/dict2md/mkcontent.py
+1
-2
pdf_tools/filter/__init__.py
pdf_tools/filter/__init__.py
+0
-0
pdf_tools/filter/pdf_classify_by_type.py
pdf_tools/filter/pdf_classify_by_type.py
+2
-2
pdf_tools/filter/pdf_meta_scan.py
pdf_tools/filter/pdf_meta_scan.py
+4
-6
pdf_tools/layout/__init__.py
pdf_tools/layout/__init__.py
+0
-0
pdf_tools/layout/bbox_sort.py
pdf_tools/layout/bbox_sort.py
+3
-3
No files found.
.gitignore
View file @
f99149b8
...
...
@@ -31,6 +31,6 @@ tmp/
tmp
.vscode
.vscode/
/test/
/test
s
/
/app/pdf_toolbox/test/test_bookname.txt
demo/download.py
View file @
f99149b8
...
...
@@ -2,7 +2,7 @@ import json
import
os
from
tqdm
import
tqdm
from
libs.common
s
import
join_path
from
pdf_tools.lib
s
import
join_path
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
demo/draw_bbox.py
View file @
f99149b8
from
libs.common
s
import
fitz
# PyMuPDF
from
pdf_tools.lib
s
import
fitz
# PyMuPDF
# PDF文件路径
pdf_path
=
"D:
\\
project
\\
20231108code-clean
\\
code-clean
\\
tmp
\\
unittest
\\
download-pdfs
\\
scihub
\\
scihub_53700000
\\
libgen.scimag53724000-53724999.zip_10.1097
\\
00129191-200509000-00018.pdf"
...
...
demo/pdf2md.py
View file @
f99149b8
...
...
@@ -5,9 +5,9 @@ from pathlib import Path
import
click
from
loguru
import
logger
from
libs.common
s
import
join_path
from
dict2md.mkcontent
import
mk_mm_markdown
from
p
ipeline.pdf_parse_by_model
import
parse_pdf_by_model
from
pdf_tools.lib
s
import
join_path
from
pdf_tools.
dict2md.mkcontent
import
mk_mm_markdown
from
p
df_tools.pipeline
import
parse_pdf_by_model
...
...
othoers/check_inline_formula.py
View file @
f99149b8
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from
libs.common
s
import
fitz
from
pdf_tools.lib
s
import
fitz
def
check_inline_formula
(
page
,
inline_formula_boxes
):
...
...
othoers/pdf2json_infer.py
View file @
f99149b8
...
...
@@ -3,7 +3,7 @@ from typing import Tuple
import
os
import
boto3
,
json
from
botocore.config
import
Config
from
libs.common
s
import
fitz
from
pdf_tools.lib
s
import
fitz
from
loguru
import
logger
from
pathlib
import
Path
from
tqdm
import
tqdm
...
...
@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from
layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
p
re_proc.detect_images
import
parse_images
# 获取figures的bbox
from
pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
p
re_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
pdf_tools.
layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
p
df_tools.pre_proc
import
parse_images
# 获取figures的bbox
from
pdf_tools.
pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
p
df_tools.pre_proc
import
parse_equations
# 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from
post_proc.detect_para
import
process_blocks_per_page
from
libs.common
s
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
from
pdf_tools.
post_proc.detect_para
import
process_blocks_per_page
from
pdf_tools.lib
s
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_profile
:
str
):
...
...
othoers/pdf2text_evaluatePdfLayout.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs
import
fitz
# pyMuPDF库
def
calculate_overlapRatio_between_rect1_and_rect2
(
L1
:
float
,
U1
:
float
,
R1
:
float
,
D1
:
float
,
L2
:
float
,
U2
:
float
,
R2
:
float
,
D2
:
float
)
->
(
float
,
float
):
...
...
othoers/pdf2text_getNumberOfColumn.py
View file @
f99149b8
from
libs.common
s
import
fitz
from
pdf_tools.lib
s
import
fitz
from
typing
import
List
...
...
othoers/pdf2text_recogFootnoteLine.py
View file @
f99149b8
import
io
import
re
import
os
import
json
from
libs.boxbase
import
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
libs.commons
import
fitz
from
fitz
import
Point
from
pprint
import
pprint
import
pickle
from
pdf_tools.libs
import
_is_in_or_part_overlap
from
pdf_tools.libs
import
fitz
import
collections
from
typing
import
List
def
calculate_overlapRatio_between_rect1_and_rect2
(
L1
:
float
,
U1
:
float
,
R1
:
float
,
D1
:
float
,
L2
:
float
,
U2
:
float
,
R2
:
float
,
D2
:
float
)
->
(
float
,
float
):
...
...
othoers/pdf2text_recogPara_v2.py
View file @
f99149b8
...
...
@@ -11,8 +11,8 @@ import numpy as np
from
termcolor
import
cprint
from
libs.common
s
import
fitz
from
libs.nlp_util
s
import
NLPModels
from
pdf_tools.lib
s
import
fitz
from
pdf_tools.lib
s
import
NLPModels
if
sys
.
version_info
[
0
]
>=
3
:
...
...
@@ -3478,7 +3478,7 @@ Params:
if
__name__
==
"__main__"
:
DEFAULT_PDF_PATH
=
(
"app/pdf_toolbox/test/assets/paper/paper.pdf"
if
os
.
name
!=
"nt"
else
"app
\\
pdf_toolbox
\\
test
\\
assets
\\
paper
\\
paper.pdf"
"app/pdf_toolbox/test
s
/assets/paper/paper.pdf"
if
os
.
name
!=
"nt"
else
"app
\\
pdf_toolbox
\\
test
s
\\
assets
\\
paper
\\
paper.pdf"
)
input_pdf_path
=
sys
.
argv
[
1
]
if
len
(
sys
.
argv
)
>
1
else
DEFAULT_PDF_PATH
output_pdf_path
=
sys
.
argv
[
2
]
if
len
(
sys
.
argv
)
>
2
else
input_pdf_path
.
split
(
"."
)[
0
]
+
"_recogPara.pdf"
...
...
othoers/pdf2text_recogTitle.py
View file @
f99149b8
import
os
import
collections
# 统计库
import
re
# 正则
from
libs.commons
import
fitz
# pyMuPDF库
import
json
# json
from
pdf_tools.libs.commons
import
fitz
# pyMuPDF库
def
parse_titles
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
exclude_bboxes
):
...
...
othoers/vali_bbox_sort.py
View file @
f99149b8
import
numpy
as
np
import
tqdm
import
json
from
validation
import
cal_edit_distance
,
format_gt_bbox
,
label_match
,
detect_val
from
layout.layout_sort
import
sort_with_layout
from
validation
import
cal_edit_distance
,
format_gt_bbox
from
pdf_tools.
layout.layout_sort
import
sort_with_layout
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
dict2md
/__init__.py
→
pdf_tools
/__init__.py
View file @
f99149b8
File moved
filter
/__init__.py
→
pdf_tools/dict2md
/__init__.py
View file @
f99149b8
File moved
dict2md/mkcontent.py
→
pdf_tools/
dict2md/mkcontent.py
View file @
f99149b8
import
re
import
math
from
loguru
import
logger
from
libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
from
pdf_tools.
libs.boxbase
import
find_bottom_nearest_text_bbox
,
find_top_nearest_text_bbox
def
mk_nlp_markdown
(
para_dict
:
dict
):
...
...
layout
/__init__.py
→
pdf_tools/filter
/__init__.py
View file @
f99149b8
File moved
filter/pdf_classify_by_type.py
→
pdf_tools/
filter/pdf_classify_by_type.py
View file @
f99149b8
...
...
@@ -16,8 +16,8 @@ from collections import Counter
import
click
import
numpy
as
np
from
libs.commons
import
mymax
,
get_top_percent_list
from
filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
from
pdf_tools.
libs.commons
import
mymax
,
get_top_percent_list
from
pdf_tools.
filter.pdf_meta_scan
import
scan_max_page
,
junk_limit_min
TEXT_LEN_THRESHOLD
=
100
AVG_TEXT_LEN_THRESHOLD
=
200
...
...
filter/pdf_meta_scan.py
→
pdf_tools/
filter/pdf_meta_scan.py
View file @
f99149b8
...
...
@@ -2,18 +2,16 @@
输入: s3路径,每行一个
输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
"""
import
math
import
sys
import
click
from
libs.commons
import
read_file
,
mymax
,
get_top_percent_list
import
json
from
libs.commons
import
fitz
from
pdf_tools.libs.commons
import
read_file
,
mymax
,
get_top_percent_list
from
pdf_tools.libs.commons
import
fitz
from
loguru
import
logger
from
collections
import
Counter
from
libs.drop_reason
import
DropReason
from
libs.language
import
detect_lang
from
pdf_tools.
libs.drop_reason
import
DropReason
from
pdf_tools.
libs.language
import
detect_lang
scan_max_page
=
50
junk_limit_min
=
10
...
...
libs
/__init__.py
→
pdf_tools/layout
/__init__.py
View file @
f99149b8
File moved
layout/bbox_sort.py
→
pdf_tools/
layout/bbox_sort.py
View file @
f99149b8
...
...
@@ -3,9 +3,9 @@
from
layout.layout_spiler_recog
import
get_spilter_of_page
from
libs.boxbase
import
_is_bottom_full_overlap
,
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
libs.commons
import
mymax
from
pdf_tools.
layout.layout_spiler_recog
import
get_spilter_of_page
from
pdf_tools.
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_vertical_full_overlap
from
pdf_tools.
libs.commons
import
mymax
X0_IDX
=
0
Y0_IDX
=
1
...
...
Prev
1
2
3
4
5
…
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment