Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
7b197fe2
Unverified
Commit
7b197fe2
authored
Nov 18, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 18, 2024
Browse files
Merge pull request #998 from myhloli/dev
test(unitest): Restore unit test cases
parents
8e981b3a
90cf1082
Changes
42
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
177 additions
and
0 deletions
+177
-0
tests/unittest/test_tools/test_cli_dev.py
tests/unittest/test_tools/test_cli_dev.py
+120
-0
tests/unittest/test_tools/test_common.py
tests/unittest/test_tools/test_common.py
+57
-0
No files found.
tests/unittest/test_tools/test_cli_dev.py
0 → 100644
View file @
7b197fe2
import
tempfile
import
os
import
shutil
from
click.testing
import
CliRunner
from
magic_pdf.tools
import
cli_dev
def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev.cli`` and check its output files.

    The command is invoked through click's ``CliRunner`` with a PDF, a model
    JSON and a freshly created temporary output directory.  The test then
    asserts that the command exited with code 0 and that each expected
    artifact under ``<output>/cli_test_01/auto`` exceeds a minimum size, and
    that an ``images`` directory was created.

    The temporary directory is removed in a ``finally`` clause so that a
    failing assertion does not leave files behind under ``/tmp``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        # NOTE(review): the asset paths are relative, so they are resolved
        # against the current working directory -- confirm where the suite
        # is expected to be launched from.
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "pdf",
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")

        # (file name, size in bytes that the file must exceed)
        expected_min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in expected_min_sizes:
            # os.stat raises if the file is missing, which also fails the test.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir)
        assert os.path.isdir(images_dir)
    finally:
        # teardown: always executed, even when a check above fails
        shutil.rmtree(temp_output_dir)
def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev.cli`` and check its output files.

    ``cli_dev.read_s3_path`` is temporarily replaced by a function that opens
    its argument as a local file and returns the bytes (presumably so that the
    paths listed in the jsonl fixture are read from disk rather than from S3
    -- confirm against ``cli_dev``).  The command is then invoked through
    click's ``CliRunner`` and the test asserts a zero exit code, a minimum
    size for each expected artifact under ``<output>/cli_test_01/auto`` and
    the presence of an ``images`` directory.

    Both the patched attribute and the temporary directory are restored or
    removed in a ``finally`` clause, so a failing assertion neither leaks the
    mock into other tests nor leaves files behind under ``/tmp``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    def mock_read_s3_path(s3path):
        """Return the raw bytes of the local file at ``s3path``."""
        with open(s3path, "rb") as f:
            return f.read()

    # Keep the real implementation so it can be put back after the test.
    original_read_s3_path = cli_dev.read_s3_path
    cli_dev.read_s3_path = mock_read_s3_path  # mock

    try:
        # run
        # NOTE(review): the fixture path is relative, so it is resolved
        # against the current working directory -- confirm where the suite
        # is expected to be launched from.
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "jsonl",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")

        # (file name, size in bytes that the file must exceed)
        expected_min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in expected_min_sizes:
            # os.stat raises if the file is missing, which also fails the test.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir)
        assert os.path.isdir(images_dir)
    finally:
        # teardown: undo the module-level patch and remove the output dir
        cli_dev.read_s3_path = original_read_s3_path
        shutil.rmtree(temp_output_dir)
tests/unittest/test_tools/test_common.py
0 → 100644
View file @
7b197fe2
import
tempfile
import
os
import
shutil
import
pytest
from
magic_pdf.tools.common
import
do_parse
@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
def test_common_do_parse(method):
    """Call ``do_parse`` directly for each parse method and check its output.

    The PDF fixture is read into memory and passed to ``do_parse`` together
    with an empty model list, the parametrized ``method``, ``False`` as the
    sixth positional argument and ``f_dump_content_list=True``.  The test
    then asserts a minimum size for each expected artifact under
    ``<output>/fake/<method>`` and that an ``images`` directory exists.

    Args:
        method: Parse method name passed to ``do_parse``; one of ``"auto"``,
            ``"txt"`` or ``"ocr"``.

    The temporary directory is removed in a ``finally`` clause so that a
    failing assertion does not leave files behind under ``/tmp``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "fake"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        # NOTE(review): the asset path is relative, so it is resolved against
        # the current working directory -- confirm where the suite is
        # expected to be launched from.
        with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
            bits = f.read()
        do_parse(
            temp_output_dir,
            filename,
            bits,
            [],
            method,
            False,
            f_dump_content_list=True,
        )

        # check
        base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")

        # (file name, size in bytes that the file must exceed)
        expected_min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in expected_min_sizes:
            # os.stat raises if the file is missing, which also fails the test.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        # These two checks were previously bare expressions whose results
        # were discarded; assert them so a missing directory fails the test.
        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir)
        assert os.path.isdir(images_dir)
    finally:
        # teardown: always executed, even when a check above fails
        shutil.rmtree(temp_output_dir)
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment