Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
6906f72a
Commit
6906f72a
authored
Nov 18, 2024
by
myhloli
Browse files
refactor(tests): extract common test utilities into test_commons.py
parent
90cf1082
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
82 additions
and
2 deletions
+82
-2
tests/unittest/test_metascan_classify/test_classify.py
tests/unittest/test_metascan_classify/test_classify.py
+1
-1
tests/unittest/test_metascan_classify/test_commons.py
tests/unittest/test_metascan_classify/test_commons.py
+80
-0
tests/unittest/test_metascan_classify/test_meta_scan.py
tests/unittest/test_metascan_classify/test_meta_scan.py
+1
-1
No files found.
tests/unittest/test_metascan_classify/test_classify.py
View file @
6906f72a
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
from
magic_pdf.filter.pdf_classify_by_type
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
from
magic_pdf.filter.pdf_classify_by_type
import
classify_by_area
,
classify_by_text_len
,
classify_by_avg_words
,
\
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
classify_by_img_num
,
classify_by_text_layout
,
classify_by_img_narrow_strips
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_pdf_textlen_per_page
,
get_imgs_per_page
from
tests.
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
from
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
# 获取当前目录
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
tests/unittest/test_metascan_classify/test_commons.py
0 → 100644
View file @
6906f72a
import
io
import
json
import
os
import
boto3
from
botocore.config
import
Config
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.config_reader
import
get_s3_config_dict
from
magic_pdf.libs.commons
import
join_path
,
json_dump_path
,
read_file
,
parse_bucket_key
from
loguru
import
logger
# S3 directory prefix that read_test_file joins with "<book_name>.pdf" to locate a test PDF,
# and to which it uploads the PDF when the key is missing there.
test_pdf_dir_path
=
"s3://llm-pdf-text/unittest/pdf/"
def get_test_pdf_json(book_name):
    """Load and parse the JSON metadata stored for a test book.

    The file ``<book_name>.json`` is resolved under ``json_dump_path``,
    fetched with ``read_file`` using the S3 config looked up for that path,
    decoded as UTF-8 and parsed as JSON.

    Args:
        book_name: Base name of the book, without the ``.json`` extension.

    Returns:
        The object produced by ``json.loads`` on the decoded file contents.
    """
    metadata_path = join_path(json_dump_path, book_name + ".json")
    metadata_config = get_s3_config_dict(metadata_path)
    raw_bytes = read_file(metadata_path, metadata_config)
    return json.loads(raw_bytes.decode('utf-8'))
def read_test_file(book_name):
    """Return the raw bytes of the test PDF for ``book_name``.

    The PDF is first read from ``test_pdf_dir_path``. If that read fails with
    an error whose text contains ``"NoSuchKey"``, the original location is
    taken from the book's JSON metadata (``file_location``), the PDF is read
    from there, uploaded to the test directory, and returned.

    Args:
        book_name: Base name of the book, without the ``.pdf`` extension.

    Returns:
        The PDF content as returned by ``read_file``, or ``None`` when either
        the primary read fails for another reason or the fallback fails; in
        both cases the exception is logged rather than raised.
    """
    cached_pdf_path = join_path(test_pdf_dir_path, book_name + ".pdf")
    cached_config = get_s3_config_dict(cached_pdf_path)
    try:
        return read_file(cached_pdf_path, cached_config)
    except Exception as read_error:
        # Anything other than a missing key is only logged; no fallback is attempted.
        if "NoSuchKey" not in str(read_error):
            logger.exception(read_error)
            return None

        logger.warning("File not found in test_pdf_path. Downloading from orig_s3_pdf_path.")
        try:
            source_pdf_path = get_test_pdf_json(book_name).get('file_location')
            source_config = get_s3_config_dict(source_pdf_path)
            pdf_bytes = read_file(source_pdf_path, source_config)

            # Store a copy under the test directory so the next read finds it there.
            uploader = get_s3_client(cached_pdf_path)
            bucket_name, bucket_key = parse_bucket_key(cached_pdf_path)
            uploader.upload_fileobj(io.BytesIO(pdf_bytes), bucket_name, bucket_key)
            return pdf_bytes
        except Exception as fallback_error:
            logger.exception(fallback_error)
            return None
def get_docs_from_test_pdf(book_name):
    """Open the test PDF for ``book_name`` with ``fitz``.

    Args:
        book_name: Base name of the book, passed through to ``read_test_file``.

    Returns:
        The result of ``fitz.open("pdf", <bytes>)`` on the content returned by
        ``read_test_file``.
    """
    pdf_bytes = read_test_file(book_name)
    document = fitz.open("pdf", pdf_bytes)
    return document
def get_test_json_data(directory_path, json_file_name):
    """Read a UTF-8 JSON file from a directory and return the parsed data.

    Args:
        directory_path: Directory containing the JSON file.
        json_file_name: File name of the JSON file inside ``directory_path``.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    json_path = os.path.join(directory_path, json_file_name)
    with open(json_path, "r", encoding='utf-8') as handle:
        return json.load(handle)
def get_s3_client(path):
    """Create a boto3 S3 client using the S3 config looked up for ``path``.

    Credentials and endpoint are read from the ``"ak"``, ``"sk"`` and
    ``"endpoint"`` entries of ``get_s3_config_dict(path)``. The client uses
    path-style addressing and up to 8 retry attempts. The ``"standard"``
    retry mode is requested first; if that fails, the client is created
    again without the ``mode`` option.

    Args:
        path: S3 path passed to ``get_s3_config_dict`` to select the config.

    Returns:
        The client object returned by ``boto3.client("s3", ...)``.

    Raises:
        KeyError: If the config lacks ``"ak"``, ``"sk"`` or ``"endpoint"``.
    """
    s3_config = get_s3_config_dict(path)
    # Arguments shared by both attempts are built once to keep them in sync.
    client_kwargs = {
        "aws_access_key_id": s3_config["ak"],
        "aws_secret_access_key": s3_config["sk"],
        "endpoint_url": s3_config["endpoint"],
    }
    addressing = {"addressing_style": "path"}
    try:
        return boto3.client(
            "s3",
            config=Config(s3=addressing, retries={"max_attempts": 8, "mode": "standard"}),
            **client_kwargs,
        )
    except Exception:
        # Older boto3 releases do not support the retries "mode" option, so
        # fall back to a config without it. Catching Exception (not a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        return boto3.client(
            "s3",
            config=Config(s3=addressing, retries={"max_attempts": 8}),
            **client_kwargs,
        )
tests/unittest/test_metascan_classify/test_meta_scan.py
View file @
6906f72a
...
@@ -2,7 +2,7 @@ import os
...
@@ -2,7 +2,7 @@ import os
import
pytest
import
pytest
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
magic_pdf.filter.pdf_meta_scan
import
get_pdf_page_size_pts
,
get_image_info
,
get_pdf_text_layout_per_page
,
get_language
from
tests.
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
from
test_commons
import
get_docs_from_test_pdf
,
get_test_json_data
# 获取当前目录
# 获取当前目录
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment