Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
8442ed39
Unverified
Commit
8442ed39
authored
Nov 19, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 19, 2024
Browse files
Merge pull request #1006 from icecraft/fix/data_compatiable
fix: using new data api replace old rw api
parents
bf7d2c4f
6a481320
Changes
22
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
67 additions
and
65 deletions
+67
-65
tests/unittest/test_tools/test_cli_dev.py
tests/unittest/test_tools/test_cli_dev.py
+46
-46
tests/unittest/test_tools/test_common.py
tests/unittest/test_tools/test_common.py
+21
-19
No files found.
tests/unittest/test_tools/test_cli_dev.py
View file @
8442ed39
import
tempfile
import
os
import
shutil
import
tempfile
from
click.testing
import
CliRunner
from
magic_pdf.tools
import
cli_dev
...
...
@@ -8,22 +9,22 @@ from magic_pdf.tools import cli_dev
def
test_cli_pdf
():
# setup
unitest_dir
=
"
/tmp/magic_pdf/unittest/tools
"
filename
=
"
cli_test_01
"
unitest_dir
=
'
/tmp/magic_pdf/unittest/tools
'
filename
=
'
cli_test_01
'
os
.
makedirs
(
unitest_dir
,
exist_ok
=
True
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
"
/tmp/magic_pdf/unittest/tools
"
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
'
/tmp/magic_pdf/unittest/tools
'
)
# run
runner
=
CliRunner
()
result
=
runner
.
invoke
(
cli_dev
.
cli
,
[
"
pdf
"
,
"
-p
"
,
"
tests/test_tools/assets/cli/pdf/cli_test_01.pdf
"
,
"
-j
"
,
"
tests/test_tools/assets/cli_dev/cli_test_01.model.json
"
,
"
-o
"
,
'
pdf
'
,
'
-p
'
,
'
tests/
unittest/
test_tools/assets/cli/pdf/cli_test_01.pdf
'
,
'
-j
'
,
'
tests/
unittest/
test_tools/assets/cli_dev/cli_test_01.model.json
'
,
'
-o
'
,
temp_output_dir
,
],
)
...
...
@@ -31,31 +32,30 @@ def test_cli_pdf():
# check
assert
result
.
exit_code
==
0
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
"
cli_test_01/auto
"
)
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
'
cli_test_01/auto
'
)
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
content_list.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
content_list.json
'
))
assert
r
.
st_size
>
5000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
"
{
filename
}
.md"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
.md'
))
assert
r
.
st_size
>
7000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
middle.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
middle.json
'
))
assert
r
.
st_size
>
200000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
model.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
model.json
'
))
assert
r
.
st_size
>
15000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
origin.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
origin.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
layout.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
layout.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
spans.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
spans.pdf
'
))
assert
r
.
st_size
>
4
00000
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
# teardown
shutil
.
rmtree
(
temp_output_dir
)
...
...
@@ -63,26 +63,26 @@ def test_cli_pdf():
def
test_cli_jsonl
():
# setup
unitest_dir
=
"
/tmp/magic_pdf/unittest/tools
"
filename
=
"
cli_test_01
"
unitest_dir
=
'
/tmp/magic_pdf/unittest/tools
'
filename
=
'
cli_test_01
'
os
.
makedirs
(
unitest_dir
,
exist_ok
=
True
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
"
/tmp/magic_pdf/unittest/tools
"
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
'
/tmp/magic_pdf/unittest/tools
'
)
def
mock_read_s3_path
(
s3path
):
with
open
(
s3path
,
"
rb
"
)
as
f
:
with
open
(
s3path
,
'
rb
'
)
as
f
:
return
f
.
read
()
cli_dev
.
read_s3_path
=
mock_read_s3_path
# mock
cli_dev
.
read_s3_path
=
mock_read_s3_path
# mock
# run
runner
=
CliRunner
()
result
=
runner
.
invoke
(
cli_dev
.
cli
,
[
"
jsonl
"
,
"
-j
"
,
"
tests/test_tools/assets/cli_dev/cli_test_01.jsonl
"
,
"
-o
"
,
'
jsonl
'
,
'
-j
'
,
'
tests/
unittest/
test_tools/assets/cli_dev/cli_test_01.jsonl
'
,
'
-o
'
,
temp_output_dir
,
],
)
...
...
@@ -90,31 +90,31 @@ def test_cli_jsonl():
# check
assert
result
.
exit_code
==
0
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
"
cli_test_01/auto
"
)
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
'
cli_test_01/auto
'
)
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
content_list.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
content_list.json
'
))
assert
r
.
st_size
>
5000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
"
{
filename
}
.md
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
.md
'
))
assert
r
.
st_size
>
7000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
middle.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
middle.json
'
))
assert
r
.
st_size
>
200000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
model.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
model.json
'
))
assert
r
.
st_size
>
15000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
origin.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
origin.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
layout.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
layout.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
spans.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
spans.pdf
'
))
assert
r
.
st_size
>
4
00000
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
# teardown
shutil
.
rmtree
(
temp_output_dir
)
tests/unittest/test_tools/test_common.py
View file @
8442ed39
import
tempfile
import
os
import
shutil
import
tempfile
import
pytest
from
magic_pdf.tools.common
import
do_parse
@
pytest
.
mark
.
parametrize
(
"
method
"
,
[
"
auto
"
,
"
txt
"
,
"
ocr
"
])
@
pytest
.
mark
.
parametrize
(
'
method
'
,
[
'
auto
'
,
'
txt
'
,
'
ocr
'
])
def
test_common_do_parse
(
method
):
import
magic_pdf.model
as
model_config
model_config
.
__use_inside_model__
=
True
# setup
unitest_dir
=
"
/tmp/magic_pdf/unittest/tools
"
filename
=
"
fake
"
unitest_dir
=
'
/tmp/magic_pdf/unittest/tools
'
filename
=
'
fake
'
os
.
makedirs
(
unitest_dir
,
exist_ok
=
True
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
"
/tmp/magic_pdf/unittest/tools
"
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
'
/tmp/magic_pdf/unittest/tools
'
)
# run
with
open
(
"
tests/test_tools/assets/common/cli_test_01.pdf
"
,
"
rb
"
)
as
f
:
with
open
(
'
tests/
unittest/
test_tools/assets/common/cli_test_01.pdf
'
,
'
rb
'
)
as
f
:
bits
=
f
.
read
()
do_parse
(
temp_output_dir
,
filename
,
...
...
@@ -27,31 +29,31 @@ def test_common_do_parse(method):
f_dump_content_list
=
True
)
# check
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
f
"
fake/
{
method
}
"
)
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
f
'
fake/
{
method
}
'
)
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
content_list.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
content_list.json
'
))
assert
r
.
st_size
>
5000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
"
{
filename
}
.md
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
.md
'
))
assert
r
.
st_size
>
7000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
middle.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
middle.json
'
))
assert
r
.
st_size
>
200000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
model.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
model.json
'
))
assert
r
.
st_size
>
15000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
origin.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
origin.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
layout.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
layout.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
spans.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
spans.pdf
'
))
assert
r
.
st_size
>
4
00000
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
# teardown
shutil
.
rmtree
(
temp_output_dir
)
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment