Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d5dbed73
Commit
d5dbed73
authored
Mar 01, 2024
by
赵小蒙
Browse files
目录重构
parent
7c7910e4
Changes
85
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
30 additions
and
30 deletions
+30
-30
magic_pdf/pre_proc/main_text_font.py
magic_pdf/pre_proc/main_text_font.py
+0
-0
magic_pdf/pre_proc/pdf_pre_filter.py
magic_pdf/pre_proc/pdf_pre_filter.py
+3
-3
magic_pdf/pre_proc/post_layout_split.py
magic_pdf/pre_proc/post_layout_split.py
+0
-0
magic_pdf/pre_proc/remove_colored_strip_bbox.py
magic_pdf/pre_proc/remove_colored_strip_bbox.py
+2
-2
magic_pdf/pre_proc/remove_footer_header.py
magic_pdf/pre_proc/remove_footer_header.py
+1
-1
magic_pdf/pre_proc/remove_rotate_bbox.py
magic_pdf/pre_proc/remove_rotate_bbox.py
+1
-1
magic_pdf/pre_proc/resolve_bbox_conflict.py
magic_pdf/pre_proc/resolve_bbox_conflict.py
+1
-1
magic_pdf/pre_proc/statistics.py
magic_pdf/pre_proc/statistics.py
+0
-0
othoers/check_inline_formula.py
othoers/check_inline_formula.py
+1
-1
othoers/pdf2json_infer.py
othoers/pdf2json_infer.py
+7
-7
othoers/pdf2text_evaluatePdfLayout.py
othoers/pdf2text_evaluatePdfLayout.py
+1
-1
othoers/pdf2text_getNumberOfColumn.py
othoers/pdf2text_getNumberOfColumn.py
+1
-1
othoers/pdf2text_recogFootnoteLine.py
othoers/pdf2text_recogFootnoteLine.py
+2
-2
othoers/pdf2text_recogPara_v2.py
othoers/pdf2text_recogPara_v2.py
+2
-2
othoers/pdf2text_recogTitle.py
othoers/pdf2text_recogTitle.py
+1
-1
othoers/vali_bbox_sort.py
othoers/vali_bbox_sort.py
+1
-1
pdf_tools/pre_proc/__init__.py
pdf_tools/pre_proc/__init__.py
+0
-0
setup.py
setup.py
+2
-2
tests/test_commons.py
tests/test_commons.py
+2
-2
tests/test_metascan_classify/test_classify.py
tests/test_metascan_classify/test_classify.py
+2
-2
No files found.
pdf_tools
/pre_proc/main_text_font.py
→
magic_pdf
/pre_proc/main_text_font.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/pdf_pre_filter.py
→
magic_pdf
/pre_proc/pdf_pre_filter.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
pdf_tools
.libs.drop_reason
import
DropReason
from
magic_pdf
.libs.commons
import
fitz
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
magic_pdf
.libs.drop_reason
import
DropReason
def
__area
(
box
):
...
...
pdf_tools
/pre_proc/post_layout_split.py
→
magic_pdf
/pre_proc/post_layout_split.py
View file @
d5dbed73
File moved
pdf_tools
/pre_proc/remove_colored_strip_bbox.py
→
magic_pdf
/pre_proc/remove_colored_strip_bbox.py
View file @
d5dbed73
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
calculate_overlap_area_2_minbox_area_ratio
from
loguru
import
logger
from
pdf_tools
.libs.drop_tag
import
COLOR_BG_HEADER_TXT_BLOCK
from
magic_pdf
.libs.drop_tag
import
COLOR_BG_HEADER_TXT_BLOCK
def
__area
(
box
):
...
...
pdf_tools
/pre_proc/remove_footer_header.py
→
magic_pdf
/pre_proc/remove_footer_header.py
View file @
d5dbed73
import
re
from
pdf_tools
.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf
.libs.boxbase
import
_is_in_or_part_overlap
def
remove_headder_footer_one_page
(
text_raw_blocks
,
image_bboxes
,
table_bboxes
,
header_bboxs
,
footer_bboxs
,
...
...
pdf_tools
/pre_proc/remove_rotate_bbox.py
→
magic_pdf
/pre_proc/remove_rotate_bbox.py
View file @
d5dbed73
import
math
from
pdf_tools
.libs.boxbase
import
is_vbox_on_side
from
magic_pdf
.libs.boxbase
import
is_vbox_on_side
def
detect_non_horizontal_texts
(
result_dict
):
...
...
pdf_tools
/pre_proc/resolve_bbox_conflict.py
→
magic_pdf
/pre_proc/resolve_bbox_conflict.py
View file @
d5dbed73
...
...
@@ -5,7 +5,7 @@
2. 然后去掉出现在文字blcok上的图片bbox
"""
from
pdf_tools
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
from
magic_pdf
.libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
,
_is_left_overlap
def
resolve_bbox_overlap_conflict
(
images
:
list
,
tables
:
list
,
interline_equations
:
list
,
inline_equations
:
list
,
text_raw_blocks
:
list
):
...
...
pdf_tools
/pre_proc/statistics.py
→
magic_pdf
/pre_proc/statistics.py
View file @
d5dbed73
File moved
othoers/check_inline_formula.py
View file @
d5dbed73
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
def
check_inline_formula
(
page
,
inline_formula_boxes
):
...
...
othoers/pdf2json_infer.py
View file @
d5dbed73
...
...
@@ -3,7 +3,7 @@ from typing import Tuple
import
os
import
boto3
,
json
from
botocore.config
import
Config
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
from
loguru
import
logger
from
pathlib
import
Path
from
tqdm
import
tqdm
...
...
@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from
pdf_tools
.layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
pdf_tools
.pre_proc
import
parse_images
# 获取figures的bbox
from
pdf_tools
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
pdf_tools
.pre_proc
import
parse_equations
# 获取equations的bbox
from
magic_pdf
.layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
magic_pdf
.pre_proc
import
parse_images
# 获取figures的bbox
from
magic_pdf
.pre_proc.detect_tables
import
parse_tables
# 获取tables的bbox
from
magic_pdf
.pre_proc
import
parse_equations
# 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from
pdf_tools
.post_proc.detect_para
import
process_blocks_per_page
from
pdf_tools
.libs
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
from
magic_pdf
.post_proc.detect_para
import
process_blocks_per_page
from
magic_pdf
.libs
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_profile
:
str
):
...
...
othoers/pdf2text_evaluatePdfLayout.py
View file @
d5dbed73
from
pdf_tools
.libs
import
fitz
# pyMuPDF库
from
magic_pdf
.libs
import
fitz
# pyMuPDF库
def
calculate_overlapRatio_between_rect1_and_rect2
(
L1
:
float
,
U1
:
float
,
R1
:
float
,
D1
:
float
,
L2
:
float
,
U2
:
float
,
R2
:
float
,
D2
:
float
)
->
(
float
,
float
):
...
...
othoers/pdf2text_getNumberOfColumn.py
View file @
d5dbed73
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
from
typing
import
List
...
...
othoers/pdf2text_recogFootnoteLine.py
View file @
d5dbed73
import
re
from
pdf_tools
.libs
import
_is_in_or_part_overlap
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
_is_in_or_part_overlap
from
magic_pdf
.libs
import
fitz
import
collections
...
...
othoers/pdf2text_recogPara_v2.py
View file @
d5dbed73
...
...
@@ -11,8 +11,8 @@ import numpy as np
from
termcolor
import
cprint
from
pdf_tools
.libs
import
fitz
from
pdf_tools
.libs
import
NLPModels
from
magic_pdf
.libs
import
fitz
from
magic_pdf
.libs
import
NLPModels
if
sys
.
version_info
[
0
]
>=
3
:
...
...
othoers/pdf2text_recogTitle.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
# pyMuPDF库
from
magic_pdf
.libs.commons
import
fitz
# pyMuPDF库
def
parse_titles
(
page_ID
:
int
,
page
:
fitz
.
Page
,
json_from_DocXchain_obj
:
dict
,
exclude_bboxes
):
...
...
othoers/vali_bbox_sort.py
View file @
d5dbed73
...
...
@@ -2,7 +2,7 @@ import numpy as np
import
tqdm
import
json
from
validation
import
cal_edit_distance
,
format_gt_bbox
from
pdf_tools
.layout.layout_sort
import
sort_with_layout
from
magic_pdf
.layout.layout_sort
import
sort_with_layout
with
open
(
'/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json'
,
'r'
)
as
f
:
samples
=
json
.
load
(
f
)
...
...
pdf_tools/pre_proc/__init__.py
deleted
100644 → 0
View file @
7c7910e4
setup.py
View file @
d5dbed73
from
setuptools
import
setup
,
find_packages
setup
(
name
=
"Magic-PDF"
,
# 项目名
version
=
"0.1.0"
,
# 版本号
name
=
"Magic-PDF"
,
# 项目名
version
=
"0.1.0"
,
# 版本号
packages
=
find_packages
(),
# 包含所有的包
install_requires
=
[
'PyMuPDF>=1.23.25'
,
'boto3>=1.34.52'
,
...
...
tests/test_commons.py
View file @
d5dbed73
import
io
import
json
import
os
from
pdf_tools
.libs
import
fitz
from
magic_pdf
.libs
import
fitz
from
app.common.s3
import
get_s3_config
,
get_s3_client
from
pdf_tools
.libs
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
magic_pdf
.libs
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
loguru
import
logger
test_pdf_dir_path
=
"s3://llm-pdf-text/unittest/pdf/"
...
...
tests/test_metascan_classify/test_classify.py
View file @
d5dbed73
...
...
@@ -2,9 +2,9 @@ import os
import
pytest
from
pdf_tools
.filter
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
from
magic_pdf
.filter
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
from
pdf_tools
.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
magic_pdf
.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
test.test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment