Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b8adb630
Commit
b8adb630
authored
Aug 05, 2024
by
liukaiwen
Browse files
Merge branch 'master' of github.com:papayalove/Magic-PDF
# Conflicts: # docs/how_to_download_models_zh_cn.md
parents
6de68f06
52069612
Changes
42
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
172 additions
and
0 deletions
+172
-0
tests/test_tools/test_cli_dev.py
tests/test_tools/test_cli_dev.py
+120
-0
tests/test_tools/test_common.py
tests/test_tools/test_common.py
+52
-0
No files found.
tests/test_tools/test_cli_dev.py
0 → 100644
View file @
b8adb630
import
tempfile
import
os
import
shutil
from
click.testing
import
CliRunner
from
magic_pdf.tools
import
cli_dev
def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev.cli`` and check its outputs.

    The command is invoked through click's ``CliRunner`` with a sample PDF and
    its model JSON, writing into a fresh temporary directory.  The test then
    checks that every expected artifact exists and exceeds a minimum size, and
    that an ``images`` directory was created.

    The temporary directory is removed even when an assertion fails, so a
    failing run does not leave files behind under ``/tmp``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "pdf",
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        # Attach the captured CLI output so a non-zero exit is diagnosable.
        assert result.exit_code == 0, result.output
        base_output_dir = os.path.join(temp_output_dir, f"{filename}/auto")

        # Expected artifact -> size (in bytes) it must strictly exceed.
        min_sizes = {
            "content_list.json": 5000,
            f"{filename}.md": 7000,
            "middle.json": 200000,
            "model.json": 15000,
            "origin.pdf": 500000,
            "layout.pdf": 500000,
            "spans.pdf": 500000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the artifact is missing.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir) is True
        assert os.path.isdir(images_dir) is True
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)
def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev.cli`` and check its outputs.

    ``cli_dev.read_s3_path`` is temporarily replaced by a stub that reads the
    given path from the local filesystem, so the test needs no S3 access.  The
    command is invoked through click's ``CliRunner`` and the produced
    artifacts are checked for existence and minimum size.

    Both the stubbed attribute and the temporary output directory are restored
    or removed in a ``finally`` block, so a failing assertion neither leaks the
    stub into other tests nor leaves files behind under ``/tmp``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    def mock_read_s3_path(s3path):
        # Treat the "S3 path" as a local file path and return its raw bytes.
        with open(s3path, "rb") as f:
            return f.read()

    # mock
    # Remember the previous attribute (or its absence) so it can be restored.
    missing = object()
    original_read_s3_path = getattr(cli_dev, "read_s3_path", missing)
    cli_dev.read_s3_path = mock_read_s3_path

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "jsonl",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        # Attach the captured CLI output so a non-zero exit is diagnosable.
        assert result.exit_code == 0, result.output
        base_output_dir = os.path.join(temp_output_dir, f"{filename}/auto")

        # Expected artifact -> size (in bytes) it must strictly exceed.
        min_sizes = {
            "content_list.json": 5000,
            f"{filename}.md": 7000,
            "middle.json": 200000,
            "model.json": 15000,
            "origin.pdf": 500000,
            "layout.pdf": 500000,
            "spans.pdf": 500000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the artifact is missing.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir) is True
        assert os.path.isdir(images_dir) is True
    finally:
        # teardown
        # Undo the stub first so later tests see the real implementation.
        if original_read_s3_path is missing:
            del cli_dev.read_s3_path
        else:
            cli_dev.read_s3_path = original_read_s3_path
        shutil.rmtree(temp_output_dir)
tests/test_tools/test_common.py
0 → 100644
View file @
b8adb630
import
tempfile
import
os
import
shutil
import
pytest
from
magic_pdf.tools.common
import
do_parse
@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
def test_common_do_parse(method):
    """Call ``do_parse`` directly for each parse method and check its outputs.

    The sample PDF is read into memory and passed to ``do_parse`` together
    with an empty model list, the parametrized ``method`` and
    ``f_dump_content_list=True``.  The test then checks that every expected
    artifact under ``<output>/fake/<method>`` exists and exceeds a minimum
    size, and that an ``images`` directory was created.

    Args:
        method: Parse method passed through to ``do_parse``; also the name of
            the output sub-directory that is inspected.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "fake"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
            bits = f.read()
        do_parse(
            temp_output_dir,
            filename,
            bits,
            [],
            method,
            f_dump_content_list=True,
        )

        # check
        base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")

        # Expected artifact -> size (in bytes) it must strictly exceed.
        min_sizes = {
            "content_list.json": 5000,
            f"{filename}.md": 7000,
            "middle.json": 200000,
            "model.json": 15000,
            "origin.pdf": 500000,
            "layout.pdf": 500000,
            "spans.pdf": 500000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the artifact is missing.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        # These checks were previously bare expressions whose results were
        # discarded; assert them so a missing images directory fails the test.
        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir)
        assert os.path.isdir(images_dir)
    finally:
        # teardown: runs even when an assertion above fails.
        shutil.rmtree(temp_output_dir)
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment