Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
6a481320
Commit
6a481320
authored
Nov 18, 2024
by
icecraft
Browse files
fix: using new data api replace old rw api
parent
7b197fe2
Changes
22
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
67 additions
and
65 deletions
+67
-65
tests/unittest/test_tools/test_cli_dev.py
tests/unittest/test_tools/test_cli_dev.py
+46
-46
tests/unittest/test_tools/test_common.py
tests/unittest/test_tools/test_common.py
+21
-19
No files found.
tests/unittest/test_tools/test_cli_dev.py
View file @
6a481320
import
tempfile
import
os
import
shutil
import
tempfile
from
click.testing
import
CliRunner
from
magic_pdf.tools
import
cli_dev
...
...
@@ -8,22 +9,22 @@ from magic_pdf.tools import cli_dev
def
test_cli_pdf
():
# setup
unitest_dir
=
"
/tmp/magic_pdf/unittest/tools
"
filename
=
"
cli_test_01
"
unitest_dir
=
'
/tmp/magic_pdf/unittest/tools
'
filename
=
'
cli_test_01
'
os
.
makedirs
(
unitest_dir
,
exist_ok
=
True
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
"
/tmp/magic_pdf/unittest/tools
"
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
'
/tmp/magic_pdf/unittest/tools
'
)
# run
runner
=
CliRunner
()
result
=
runner
.
invoke
(
cli_dev
.
cli
,
[
"
pdf
"
,
"
-p
"
,
"
tests/test_tools/assets/cli/pdf/cli_test_01.pdf
"
,
"
-j
"
,
"
tests/test_tools/assets/cli_dev/cli_test_01.model.json
"
,
"
-o
"
,
'
pdf
'
,
'
-p
'
,
'
tests/
unittest/
test_tools/assets/cli/pdf/cli_test_01.pdf
'
,
'
-j
'
,
'
tests/
unittest/
test_tools/assets/cli_dev/cli_test_01.model.json
'
,
'
-o
'
,
temp_output_dir
,
],
)
...
...
@@ -31,31 +32,30 @@ def test_cli_pdf():
# check
assert
result
.
exit_code
==
0
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
"
cli_test_01/auto
"
)
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
'
cli_test_01/auto
'
)
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
content_list.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
content_list.json
'
))
assert
r
.
st_size
>
5000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
"
{
filename
}
.md"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
.md'
))
assert
r
.
st_size
>
7000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
middle.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
middle.json
'
))
assert
r
.
st_size
>
200000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
model.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
model.json
'
))
assert
r
.
st_size
>
15000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
origin.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
origin.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
layout.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
layout.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
spans.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
spans.pdf
'
))
assert
r
.
st_size
>
4
00000
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
# teardown
shutil
.
rmtree
(
temp_output_dir
)
...
...
@@ -63,26 +63,26 @@ def test_cli_pdf():
def
test_cli_jsonl
():
# setup
unitest_dir
=
"
/tmp/magic_pdf/unittest/tools
"
filename
=
"
cli_test_01
"
unitest_dir
=
'
/tmp/magic_pdf/unittest/tools
'
filename
=
'
cli_test_01
'
os
.
makedirs
(
unitest_dir
,
exist_ok
=
True
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
"
/tmp/magic_pdf/unittest/tools
"
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
'
/tmp/magic_pdf/unittest/tools
'
)
def
mock_read_s3_path
(
s3path
):
with
open
(
s3path
,
"
rb
"
)
as
f
:
with
open
(
s3path
,
'
rb
'
)
as
f
:
return
f
.
read
()
cli_dev
.
read_s3_path
=
mock_read_s3_path
# mock
cli_dev
.
read_s3_path
=
mock_read_s3_path
# mock
# run
runner
=
CliRunner
()
result
=
runner
.
invoke
(
cli_dev
.
cli
,
[
"
jsonl
"
,
"
-j
"
,
"
tests/test_tools/assets/cli_dev/cli_test_01.jsonl
"
,
"
-o
"
,
'
jsonl
'
,
'
-j
'
,
'
tests/
unittest/
test_tools/assets/cli_dev/cli_test_01.jsonl
'
,
'
-o
'
,
temp_output_dir
,
],
)
...
...
@@ -90,31 +90,31 @@ def test_cli_jsonl():
# check
assert
result
.
exit_code
==
0
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
"
cli_test_01/auto
"
)
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
'
cli_test_01/auto
'
)
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
content_list.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
content_list.json
'
))
assert
r
.
st_size
>
5000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
"
{
filename
}
.md
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
.md
'
))
assert
r
.
st_size
>
7000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
middle.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
middle.json
'
))
assert
r
.
st_size
>
200000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
model.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
model.json
'
))
assert
r
.
st_size
>
15000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
origin.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
origin.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
layout.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
layout.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
spans.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
spans.pdf
'
))
assert
r
.
st_size
>
4
00000
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
is
True
assert
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
assert
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
is
True
# teardown
shutil
.
rmtree
(
temp_output_dir
)
tests/unittest/test_tools/test_common.py
View file @
6a481320
import
tempfile
import
os
import
shutil
import
tempfile
import
pytest
from
magic_pdf.tools.common
import
do_parse
@
pytest
.
mark
.
parametrize
(
"
method
"
,
[
"
auto
"
,
"
txt
"
,
"
ocr
"
])
@
pytest
.
mark
.
parametrize
(
'
method
'
,
[
'
auto
'
,
'
txt
'
,
'
ocr
'
])
def
test_common_do_parse
(
method
):
import
magic_pdf.model
as
model_config
model_config
.
__use_inside_model__
=
True
# setup
unitest_dir
=
"
/tmp/magic_pdf/unittest/tools
"
filename
=
"
fake
"
unitest_dir
=
'
/tmp/magic_pdf/unittest/tools
'
filename
=
'
fake
'
os
.
makedirs
(
unitest_dir
,
exist_ok
=
True
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
"
/tmp/magic_pdf/unittest/tools
"
)
temp_output_dir
=
tempfile
.
mkdtemp
(
dir
=
'
/tmp/magic_pdf/unittest/tools
'
)
# run
with
open
(
"
tests/test_tools/assets/common/cli_test_01.pdf
"
,
"
rb
"
)
as
f
:
with
open
(
'
tests/
unittest/
test_tools/assets/common/cli_test_01.pdf
'
,
'
rb
'
)
as
f
:
bits
=
f
.
read
()
do_parse
(
temp_output_dir
,
filename
,
...
...
@@ -27,31 +29,31 @@ def test_common_do_parse(method):
f_dump_content_list
=
True
)
# check
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
f
"
fake/
{
method
}
"
)
base_output_dir
=
os
.
path
.
join
(
temp_output_dir
,
f
'
fake/
{
method
}
'
)
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
content_list.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
content_list.json
'
))
assert
r
.
st_size
>
5000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
"
{
filename
}
.md
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
.md
'
))
assert
r
.
st_size
>
7000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
middle.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
middle.json
'
))
assert
r
.
st_size
>
200000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
model.json
"
))
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
model.json
'
))
assert
r
.
st_size
>
15000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
origin.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
origin.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
layout.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
layout.pdf
'
))
assert
r
.
st_size
>
4
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
"
spans.pdf
"
))
assert
r
.
st_size
>
5
00000
r
=
os
.
stat
(
os
.
path
.
join
(
base_output_dir
,
f
'
{
filename
}
_
spans.pdf
'
))
assert
r
.
st_size
>
4
00000
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
"
images
"
))
os
.
path
.
exists
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
os
.
path
.
isdir
(
os
.
path
.
join
(
base_output_dir
,
'
images
'
))
# teardown
shutil
.
rmtree
(
temp_output_dir
)
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment