wangsen / MinerU · Commit a65d6b53 (unverified)

Merge pull request #1112 from myhloli/dev

    refactor(libs): remove unused imports and functions

Authored Nov 27, 2024 by Xiaomeng Zhao; committed by GitHub on Nov 27, 2024.
Parents: b6931171 2db3c263

Showing 8 changed files with 11 additions and 216 deletions (+11 -216):
    magic_pdf/filter/pdf_meta_scan.py                            +3   -17
    magic_pdf/libs/commons.py                                    +0  -161
    magic_pdf/libs/draw_bbox.py                                  +2    -3
    magic_pdf/libs/pdf_image_tools.py                            +2    -1
    magic_pdf/model/magic_model.py                               +0   -30
    magic_pdf/pdf_parse_union_core_v2.py                         +2    -2
    magic_pdf/rw/S3ReaderWriter.py                               +1    -1
    tests/unittest/test_metascan_classify/test_commons.py.bak    +1    -1
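All eight files follow the same theme: dead helper functions and unused imports are dropped, and modules that still need PyMuPDF now `import fitz` directly instead of re-importing it through magic_pdf.libs.commons. As a rough illustration of how such candidates can be spotted, below is a minimal sketch, not part of this commit, that lists top-level imports never referenced by name in a module, using only the standard-library ast module.

# Sketch only (not part of this commit): list imports that are never
# referenced by name in a module, which is roughly what this refactor removes.
import ast
import sys


def unused_imports(source: str) -> list[str]:
    tree = ast.parse(source)
    imported, used = set(), set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                # 'import a.b.c' binds the name 'a'
                imported.add(alias.asname or alias.name.split('.')[0])
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                imported.add(alias.asname or alias.name)
        elif isinstance(node, ast.Name):
            used.add(node.id)
    return sorted(imported - used)


if __name__ == '__main__':
    for path in sys.argv[1:]:
        with open(path, encoding='utf-8') as f:
            print(path, unused_imports(f.read()))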
magic_pdf/filter/pdf_meta_scan.py  (+3 -17)

 """Input: s3 paths, one per line. Output: PDF metadata, including the width/height and bbox of every image on each page."""
-import sys
 from collections import Counter

-import click
+import fitz
 from loguru import logger

 from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
+from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.pdf_check import detect_invalid_chars
 ...
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
     return res


-@click.command()
-@click.option('--s3-pdf-path', help='path of the pdf file on s3')
-@click.option('--s3-profile', help='s3 profile to use')
-def main(s3_pdf_path: str, s3_profile: str):
-    """"""
-    try:
-        file_content = read_file(s3_pdf_path, s3_profile)
-        pdf_meta_scan(file_content)
-    except Exception as e:
-        print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
-        logger.exception(e)
-
-
 if __name__ == '__main__':
-    main()
+    pass
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
 ...
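With the click command line and the commons read_file helper removed, pdf_meta_scan.py is now import-only. A minimal usage sketch, assuming a local PDF on disk ('example.pdf' is a placeholder) and that pdf_meta_scan returns the metadata dict described in the module docstring:

# Usage sketch with a local file; 'example.pdf' is a placeholder path.
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan

with open('example.pdf', 'rb') as f:
    pdf_bytes = f.read()

meta = pdf_meta_scan(pdf_bytes)  # per-page image sizes, bboxes, etc.
print(meta)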
magic_pdf/libs/commons.py  (+0 -161)

-import datetime
-import json
-import os, re, configparser
-import subprocess
-import time
-
-import boto3
-from loguru import logger
-from boto3.s3.transfer import TransferConfig
-from botocore.config import Config
-
-import fitz
-# switched to the rebased implementation as of 1.23.9
-# import fitz_old as fitz  # use the pymupdf library from before 1.23.9
-
-
-def get_delta_time(input_time):
-    return round(time.time() - input_time, 2)
-
-
 def join_path(*args):
     return '/'.join(str(s).rstrip('/') for s in args)


-# global errlog_path so the demos can reference it consistently
-error_log_path = "s3://llm-pdf-text/err_logs/"
-# json_dump_path = "s3://pdf_books_temp/json_dump/"  # only for temporary local testing, must not be committed to main
-json_dump_path = "s3://llm-pdf-text/json_dump/"
-# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # the base library should not hard-code such paths; define them in business code
-
-
 def get_top_percent_list(num_list, percent):
     """
     Get the top given percentage of elements from the list
 ...
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
     return top_percent_list


-def formatted_time(time_stamp):
-    dt_object = datetime.datetime.fromtimestamp(time_stamp)
-    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
-    return output_time
-
-
 def mymax(alist: list):
     if len(alist) == 0:
         return 0  # empty is 0; a 0*0 box is also size 0
     else:
         return max(alist)


-def parse_aws_param(profile):
-    if isinstance(profile, str):
-        # parse the config files
-        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
-        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
-        config = configparser.ConfigParser()
-        config.read(credentials_file)
-        config.read(config_file)
-        # read the AWS account info
-        ak = config.get(profile, "aws_access_key_id")
-        sk = config.get(profile, "aws_secret_access_key")
-        if profile == "default":
-            s3_str = config.get(f"{profile}", "s3")
-        else:
-            s3_str = config.get(f"profile {profile}", "s3")
-        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if end_match:
-            endpoint = end_match.group(1)
-        else:
-            raise ValueError(f"endpoint_url not found in the aws config file")
-        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if style_match:
-            addressing_style = style_match.group(1)
-        else:
-            addressing_style = "path"
-    elif isinstance(profile, dict):
-        ak = profile["ak"]
-        sk = profile["sk"]
-        endpoint = profile["endpoint"]
-        addressing_style = "auto"
-    return ak, sk, endpoint, addressing_style
-
-
 def parse_bucket_key(s3_full_path: str):
     """
 ...
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
         s3_full_path = s3_full_path[1:]
     bucket, key = s3_full_path.split("/", 1)
     return bucket, key
-
-
-def read_file(pdf_path: str, s3_profile):
-    if pdf_path.startswith("s3://"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
-        bucket_name, bucket_key = parse_bucket_key(pdf_path)
-        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
-        file_content = res["Body"].read()
-        return file_content
-    else:
-        with open(pdf_path, "rb") as f:
-            return f.read()
-
-
-def get_docx_model_output(pdf_model_output, page_id):
-    model_output_json = pdf_model_output[page_id]
-    return model_output_json
-
-
-def list_dir(dir_path: str, s3_profile: str):
-    """
-    List all files under dir_path
-    """
-    ret = []
-    if dir_path.startswith("s3"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
-        bucket, path = s3info[0][0], s3info[0][1]
-        try:
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                               config=Config(s3={'addressing_style': addressing_style}))
-
-            def list_obj_scluster():
-                marker = None
-                while True:
-                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
-                    if marker:
-                        list_kwargs['Marker'] = marker
-                    response = cli.list_objects(**list_kwargs)
-                    contents = response.get("Contents", [])
-                    yield from contents
-                    if not response.get("IsTruncated") or len(contents) == 0:
-                        break
-                    marker = contents[-1]['Key']
-
-            for info in list_obj_scluster():
-                file_path = info['Key']
-                # size = info['Size']
-                if path != "":
-                    afile = file_path[len(path):]
-                    if afile.endswith(".json"):
-                        ret.append(f"s3://{bucket}/{file_path}")
-
-            return ret
-
-        except Exception as e:
-            logger.exception(e)
-            exit(-1)
-    else:
-        # local directory: walk it and collect every .json file inside
-        for root, dirs, files in os.walk(dir_path):
-            for file in files:
-                if file.endswith(".json"):
-                    ret.append(join_path(root, file))
-        ret.sort()
-        return ret
-
-
-def get_img_s3_client(save_path: str, image_s3_config: str):
-    """
-    """
-    if save_path.startswith("s3://"):
-        # created here so that at most one s3 client is built
-        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
-        img_s3_client = boto3.client(
-            service_name="s3",
-            aws_access_key_id=ak,
-            aws_secret_access_key=sk,
-            endpoint_url=end_point,
-            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
-        )
-    else:
-        img_s3_client = None
-    return img_s3_client
-
-
-if __name__ == "__main__":
-    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
-    s3_profile = "langchao"
-    ret = list_dir(s3_path, s3_profile)
-    print(ret)
\ No newline at end of file
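The deleted helpers (get_delta_time, formatted_time, parse_aws_param, read_file, get_docx_model_output, list_dir, get_img_s3_client) and the hard-coded S3 paths were the last S3-specific pieces in commons.py; only join_path, get_top_percent_list, mymax and parse_bucket_key remain. Code that still needs a raw S3 read can call boto3 directly; a minimal sketch modeled on the removed read_file, with explicit placeholder credentials instead of the old ~/.aws profile parsing:

# Sketch of a stand-in for the removed read_file; credentials and endpoint are placeholders.
import boto3
from botocore.config import Config


def read_s3_object(s3_path: str, ak: str, sk: str, endpoint: str) -> bytes:
    cli = boto3.client(
        's3',
        aws_access_key_id=ak,
        aws_secret_access_key=sk,
        endpoint_url=endpoint,
        config=Config(s3={'addressing_style': 'path'},
                      retries={'max_attempts': 10, 'mode': 'standard'}),
    )
    bucket, key = s3_path[len('s3://'):].split('/', 1)  # same split as parse_bucket_key
    return cli.get_object(Bucket=bucket, Key=key)['Body'].read()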
magic_pdf/libs/draw_bbox.py  (+2 -3)

+import fitz
 from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
-                                                ContentType)
+from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
 from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.model.magic_model import MagicModel
 ...
magic_pdf/libs/pdf_image_tools.py  (+2 -1)

 from io import BytesIO

 import cv2
+import fitz
 import numpy as np
 from PIL import Image

 from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.libs.commons import fitz, join_path
+from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256
 ...
magic_pdf/model/magic_model.py  (+0 -30)

 import enum
-import json

 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                     bbox_relative_pos, box_area, calculate_iou,
                                     calculate_overlap_area_in_bbox1_area_ratio,
                                     get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
 ...
@@ -1048,29 +1044,3 @@ class MagicModel:
     def get_model_list(self, page_no):
         return self.__model_list[page_no]
-
-
-if __name__ == '__main__':
-    drw = FileBasedDataReader(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path)
-        model_json_txt = drw.read(model_file_path).decode()
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-    if 1:
-        from magic_pdf.data.dataset import PymuDocDataset
-        model_list = json.loads(drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json'))
-        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
-        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
-        for i in range(7):
-            print(magic_model.get_imgs(i))
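The deleted `__main__` block was a local debugging harness; the same construction still works as ordinary library code. A short sketch mirroring it, where the file paths are placeholders and FileBasedDataReader('') is assumed to read paths as given:

# Sketch mirroring the removed debug block; 'sample.json' / 'sample.pdf' are placeholders.
import json

from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.magic_model import MagicModel

reader = FileBasedDataReader('')                     # assumption: '' means paths are used as given
model_list = json.loads(reader.read('sample.json'))  # per-page model output
pdf_bytes = reader.read('sample.pdf')
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
print(magic_model.get_model_list(0))                 # model output for page 0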
magic_pdf/pdf_parse_union_core_v2.py  (+2 -2)

@@ -5,6 +5,7 @@ import time
 from typing import List

 import torch
+import fitz
 from loguru import logger

 from magic_pdf.config.enums import SupportedPdfParseMethod
 ...
@@ -12,7 +13,6 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
-from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 ...
@@ -784,7 +784,7 @@ def pdf_parse_union(
         if debug_mode:
             time_now = time.time()
             logger.info(
-                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
+                f'page_id: {page_id}, last_page_cost_time: {time.time() - start_time}'
             )
             start_time = time_now
 ...
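The removed get_delta_time helper rounded the elapsed time to two decimal places, so this log line now prints the raw float difference. If the rounded form is wanted back, it can be inlined at the call site; a tiny self-contained sketch:

# Sketch: reproduce the removed helper's rounded output without the helper.
import time

start_time = time.time()
# ... per-page parsing work ...
print(f'last_page_cost_time: {round(time.time() - start_time, 2)}')  # what get_delta_time returned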
magic_pdf/rw/S3ReaderWriter.py  (+1 -1)

 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
+from magic_pdf.libs.commons import parse_bucket_key, join_path
 import boto3
 from loguru import logger
 from botocore.config import Config
 ...
tests/unittest/test_metascan_classify/test_commons.py.bak  (+1 -1)

@@ -2,10 +2,10 @@ import io
 import json
 import os

+import fitz
 import boto3
 from botocore.config import Config
-from magic_pdf.libs.commons import fitz
 from magic_pdf.libs.config_reader import get_s3_config_dict
 from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
 ...