Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
ded155ec
Unverified
Commit
ded155ec
authored
Nov 18, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 18, 2024
Browse files
Merge pull request #1001 from myhloli/dev
refactor(tests): extract common test utilities into test_commons.py
parents
7b197fe2
6906f72a
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
82 additions
and
2 deletions
+82
-2
tests/unittest/test_metascan_classify/test_classify.py
tests/unittest/test_metascan_classify/test_classify.py
+1
-1
tests/unittest/test_metascan_classify/test_commons.py
tests/unittest/test_metascan_classify/test_commons.py
+80
-0
tests/unittest/test_metascan_classify/test_meta_scan.py
tests/unittest/test_metascan_classify/test_meta_scan.py
+1
-1
No files found.
tests/unittest/test_metascan_classify/test_classify.py
View file @
ded155ec
...
...
@@ -5,7 +5,7 @@ import pytest
from
magic_pdf.filter.pdf_classify_by_type
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
tests.
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
from
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# Absolute path of the directory that contains this test module.
current_directory = os.path.dirname(os.path.abspath(__file__))
...
...
tests/unittest/test_metascan_classify/test_commons.py
0 → 100644
View file @
ded155ec
import
io
import
json
import
os
import
boto3
from
botocore.config
import
Config
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.config_reader
import
get_s3_config_dict
from
magic_pdf.libs.commons
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
loguru
import
logger
# S3 prefix under which read_test_file() looks for "<book_name>.pdf" and to
# which it uploads a copy when the PDF is missing there.
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
def get_test_pdf_json(book_name):
    """Load and parse the JSON document stored for *book_name*.

    The path is ``json_dump_path`` joined with ``book_name + ".json"``. The
    file is read through ``read_file`` using the S3 configuration that
    ``get_s3_config_dict`` returns for that path, decoded as UTF-8 and parsed
    with ``json.loads``.

    Args:
        book_name: Base name of the document, without the ``.json`` suffix.

    Returns:
        The parsed JSON value.
    """
    metadata_path = join_path(json_dump_path, book_name + ".json")
    raw_bytes = read_file(metadata_path, get_s3_config_dict(metadata_path))
    return json.loads(raw_bytes.decode('utf-8'))
def read_test_file(book_name):
    """Return the raw bytes of the test PDF named *book_name*.

    The PDF is first read from ``test_pdf_dir_path``. If that read fails with
    an error whose text contains ``"NoSuchKey"``, the original location is
    looked up in the book's JSON document (key ``'file_location'``), the PDF
    is read from there, uploaded to the test location, and returned.

    Args:
        book_name: Base name of the PDF, without the ``.pdf`` suffix.

    Returns:
        The file content on success. Every failure is logged rather than
        raised, and ``None`` is returned in that case.
    """
    pdf_path = join_path(test_pdf_dir_path, book_name + ".pdf")
    pdf_s3_config = get_s3_config_dict(pdf_path)
    try:
        return read_file(pdf_path, pdf_s3_config)
    except Exception as read_error:
        # Anything other than a missing key is only logged; no fallback.
        if "NoSuchKey" not in str(read_error):
            logger.exception(read_error)
            return None

        logger.warning("File not found in test_pdf_path. Downloading from orig_s3_pdf_path.")
        try:
            # Locate the original PDF through the book's JSON document.
            source_path = get_test_pdf_json(book_name).get('file_location')
            source_s3_config = get_s3_config_dict(source_path)
            pdf_bytes = read_file(source_path, source_s3_config)

            # Upload a copy to the test location before returning the bytes.
            uploader = get_s3_client(pdf_path)
            bucket_name, bucket_key = parse_bucket_key(pdf_path)
            uploader.upload_fileobj(io.BytesIO(pdf_bytes), bucket_name, bucket_key)
            return pdf_bytes
        except Exception as fallback_error:
            logger.exception(fallback_error)
    return None
def get_docs_from_test_pdf(book_name):
    """Open the test PDF named *book_name* as a ``fitz`` document.

    The bytes come from ``read_test_file`` and are passed to
    ``fitz.open("pdf", ...)``.

    NOTE(review): ``read_test_file`` logs failures instead of raising, so the
    value passed to ``fitz.open`` may be ``None`` -- confirm how callers
    expect that case to surface.
    """
    return fitz.open("pdf", read_test_file(book_name))
def get_test_json_data(directory_path, json_file_name):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        directory_path: Directory that contains the JSON file.
        json_file_name: File name of the JSON file inside *directory_path*.

    Returns:
        The value produced by ``json.load`` for that file.
    """
    json_file_path = os.path.join(directory_path, json_file_name)
    with open(json_file_path, "r", encoding='utf-8') as handle:
        return json.load(handle)
def get_s3_client(path):
    """Create a boto3 S3 client configured for the bucket that *path* refers to.

    Credentials and endpoint are taken from the ``"ak"``, ``"sk"`` and
    ``"endpoint"`` entries of ``get_s3_config_dict(path)``. The client uses
    path-style addressing and up to 8 retry attempts.

    Args:
        path: S3 path used to look up the matching S3 configuration.

    Returns:
        The object returned by ``boto3.client("s3", ...)``.
    """
    s3_config = get_s3_config_dict(path)

    def _build_client(retries):
        # Single place that assembles the client so both attempts stay in sync.
        return boto3.client(
            "s3",
            aws_access_key_id=s3_config["ak"],
            aws_secret_access_key=s3_config["sk"],
            endpoint_url=s3_config["endpoint"],
            config=Config(s3={"addressing_style": "path"}, retries=retries),
        )

    try:
        return _build_client({"max_attempts": 8, "mode": "standard"})
    except Exception:
        # Older boto3 releases do not support the retries.mode parameter.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
        return _build_client({"max_attempts": 8})
tests/unittest/test_metascan_classify/test_meta_scan.py
View file @
ded155ec
...
...
@@ -2,7 +2,7 @@ import os
import
pytest
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
tests.
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
from
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# Absolute path of the directory that contains this test module.
current_directory = os.path.dirname(os.path.abspath(__file__))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment