wangsen / MinerU · Commits

Commit 6be1453c
authored Jul 16, 2025 by Sidney233
parent da048cbf

test: Update test_e2e.py

Showing 4 changed files with 276 additions and 312 deletions.
Changed files:

- mkdocs.yml (+3, -0)
- pyproject.toml (+7, -3)
- tests/unittest/pdfs/test.pdf (+0, -0)
- tests/unittest/test_e2e.py (+266, -309)
mkdocs.yml (view file @ 6be1453c)

@@ -78,6 +78,9 @@ plugins:

```yaml
plugins:
  - search
  - i18n:
      docs_structure: folder
      fallback_to_default: true
      reconfigure_material: true
      reconfigure_search: true
      languages:
        - locale: en
          default: true
```
pyproject.toml (view file @ 6be1453c)

```diff
@@ -43,7 +43,8 @@ test = [
     "pytest",
     "pytest-cov",
     "coverage",
-    "beautifulsoup4"
+    "beautifulsoup4",
+    "fuzzywuzzy"
 ]
 vlm = [
     "transformers>=4.51.1",
```

```diff
@@ -150,7 +151,11 @@ omit = [
     "*/cli_parser.py",
     "*/run_async.py"
 ]
+
+[tool.coverage.html]
+directory = "htmlcov"
+
 [tool.coverage.report]
 exclude_also = [
     'def __repr__',
     'if self.debug:',
```

```diff
@@ -162,5 +167,4 @@ exclude_also = [
     'if TYPE_CHECKING:',
     'class .*\bProtocol\):',
     '@(abc\.)?abstractmethod',
-]
-directory = "htmlcov"
\ No newline at end of file
+]
\ No newline at end of file
```
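The coverage change moves `directory = "htmlcov"` out of `[tool.coverage.report]`, where it is not a report option, into its own `[tool.coverage.html]` table. A minimal sketch of how that setting is consumed, assuming coverage.py 5+ reading its configuration from pyproject.toml (not part of the commit):

```python
# Illustrative only: shows where the new [tool.coverage.html] setting
# takes effect. Assumes coverage.py 5+ with TOML config support.
import coverage

cov = coverage.Coverage()    # picks up [tool.coverage.*] from pyproject.toml
cov.start()
import json                  # stand-in for the code under test
json.dumps({"ping": "pong"})
cov.stop()
cov.save()
cov.html_report()            # writes into htmlcov/ per [tool.coverage.html]
```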
tests/unittest/pdfs/test.pdf (view file @ 6be1453c)

No preview for this file type (binary file changed).
tests/unittest/test_e2e.py (view file @ 6be1453c)

```diff
@@ -3,17 +3,15 @@ import copy
 import json
 import os
 from pathlib import Path
-
-from cryptography.hazmat.backends.openssl import backend
-
 from loguru import logger
+from bs4 import BeautifulSoup
+from fuzzywuzzy import fuzz
 from mineru.cli.common import (
     convert_pdf_bytes_to_bytes_by_pypdfium2,
     prepare_env,
     read_fn,
 )
 from mineru.data.data_reader_writer import FileBasedDataWriter
-from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
 from mineru.utils.enum_class import MakeMode
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
```

@@ -24,313 +22,272 @@ from mineru.backend.pipeline.model_json_to_middle_json import (

Unchanged leading context:

```python
    result_to_middle_json as pipeline_result_to_middle_json,
)
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
```
Removed: the old `TestE2E` class, which nested `do_parse`/`parse_doc` helpers inside each test:

```python
class TestE2E:
    def test_pipeline_with_two_config(self):
        def do_parse(
            output_dir,  # Output directory for storing parsing results
            pdf_file_names: list[str],  # List of PDF file names to be parsed
            pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
            p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
            parse_method="auto",  # The method for parsing PDF, default is 'auto'
            formula_enable=True,  # Enable formula parsing
            table_enable=True,  # Enable table parsing
            f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
            f_draw_span_bbox=True,  # Whether to draw span bounding boxes
            f_dump_md=True,  # Whether to dump markdown files
            f_dump_middle_json=True,  # Whether to dump middle JSON files
            f_dump_model_output=True,  # Whether to dump model output files
            f_dump_orig_pdf=True,  # Whether to dump original PDF files
            f_dump_content_list=True,  # Whether to dump content list files
            f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
            start_page_id=0,  # Start page ID for parsing, default is 0
            end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
        ):
            for idx, pdf_bytes in enumerate(pdf_bytes_list):
                new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
                    pdf_bytes, start_page_id, end_page_id
                )
                pdf_bytes_list[idx] = new_pdf_bytes

            (
                infer_results,
                all_image_lists,
                all_pdf_docs,
                lang_list,
                ocr_enabled_list,
            ) = pipeline_doc_analyze(
                pdf_bytes_list,
                p_lang_list,
                parse_method=parse_method,
                formula_enable=formula_enable,
                table_enable=table_enable,
            )

            for idx, model_list in enumerate(infer_results):
                model_json = copy.deepcopy(model_list)
                pdf_file_name = pdf_file_names[idx]
                local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
                image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

                images_list = all_image_lists[idx]
                pdf_doc = all_pdf_docs[idx]
                _lang = lang_list[idx]
                _ocr_enable = ocr_enabled_list[idx]
                middle_json = pipeline_result_to_middle_json(
                    model_list,
                    images_list,
                    pdf_doc,
                    image_writer,
                    _lang,
                    _ocr_enable,
                    formula_enable,
                )
                pdf_info = middle_json["pdf_info"]
                pdf_bytes = pdf_bytes_list[idx]

                if f_draw_layout_bbox:
                    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
                if f_draw_span_bbox:
                    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
                if f_dump_orig_pdf:
                    md_writer.write(f"{pdf_file_name}_origin.pdf", pdf_bytes)
                if f_dump_md:
                    image_dir = str(os.path.basename(local_image_dir))
                    md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
                    md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
                if f_dump_content_list:
                    image_dir = str(os.path.basename(local_image_dir))
                    content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                    md_writer.write_string(
                        f"{pdf_file_name}_content_list.json",
                        json.dumps(content_list, ensure_ascii=False, indent=4),
                    )
                if f_dump_middle_json:
                    md_writer.write_string(
                        f"{pdf_file_name}_middle.json",
                        json.dumps(middle_json, ensure_ascii=False, indent=4),
                    )
                if f_dump_model_output:
                    md_writer.write_string(
                        f"{pdf_file_name}_model.json",
                        json.dumps(model_json, ensure_ascii=False, indent=4),
                    )
                logger.info(f"local output dir is {local_md_dir}")

        def parse_doc(
            path_list: list[Path],
            output_dir,
            lang="ch",
            method="auto",
            start_page_id=0,
            end_page_id=None,
        ):
            file_name_list = []
            pdf_bytes_list = []
            lang_list = []
            for path in path_list:
                file_name = str(Path(path).stem)
                pdf_bytes = read_fn(path)
                file_name_list.append(file_name)
                pdf_bytes_list.append(pdf_bytes)
                lang_list.append(lang)
            # Run do_parse twice: once with formula and table parsing enabled, once with both disabled
            do_parse(
                output_dir=output_dir,
                pdf_file_names=file_name_list,
                pdf_bytes_list=pdf_bytes_list,
                p_lang_list=lang_list,
                parse_method=method,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )
            do_parse(
                output_dir=output_dir,
                pdf_file_names=file_name_list,
                pdf_bytes_list=pdf_bytes_list,
                p_lang_list=lang_list,
                parse_method=method,
                table_enable=False,
                formula_enable=False,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )

        __dir__ = os.path.dirname(os.path.abspath(__file__))
        pdf_files_dir = os.path.join(__dir__, "pdfs")
        output_dir = os.path.join(__dir__, "output")
        pdf_suffixes = [".pdf"]
        image_suffixes = [".png", ".jpeg", ".jpg"]
        doc_path_list = []
        for doc_path in Path(pdf_files_dir).glob("*"):
            if doc_path.suffix in pdf_suffixes + image_suffixes:
                doc_path_list.append(doc_path)
        # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
        parse_doc(doc_path_list, output_dir)

    def test_vlm_transformers_with_default_config(self):
        def do_parse(
            output_dir,  # Output directory for storing parsing results
            pdf_file_names: list[str],  # List of PDF file names to be parsed
            pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
            server_url=None,  # Server URL for vlm-sglang-client backend
            f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
            f_dump_md=True,  # Whether to dump markdown files
            f_dump_middle_json=True,  # Whether to dump middle JSON files
            f_dump_model_output=True,  # Whether to dump model output files
            f_dump_orig_pdf=True,  # Whether to dump original PDF files
            f_dump_content_list=True,  # Whether to dump content list files
            f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
            start_page_id=0,  # Start page ID for parsing, default is 0
            end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
        ):
            backend = "transformers"
            f_draw_span_bbox = False
            parse_method = "vlm"
            for idx, pdf_bytes in enumerate(pdf_bytes_list):
                pdf_file_name = pdf_file_names[idx]
                pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
                local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
                image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
                middle_json, infer_result = vlm_doc_analyze(
                    pdf_bytes,
                    image_writer=image_writer,
                    backend=backend,
                    server_url=server_url,
                )
                pdf_info = middle_json["pdf_info"]
                if f_draw_layout_bbox:
                    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
                if f_draw_span_bbox:
                    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
                if f_dump_orig_pdf:
                    md_writer.write(f"{pdf_file_name}_origin.pdf", pdf_bytes)
                if f_dump_md:
                    image_dir = str(os.path.basename(local_image_dir))
                    md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                    md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
                if f_dump_content_list:
                    image_dir = str(os.path.basename(local_image_dir))
                    content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                    md_writer.write_string(
                        f"{pdf_file_name}_content_list.json",
                        json.dumps(content_list, ensure_ascii=False, indent=4),
                    )
                if f_dump_middle_json:
                    md_writer.write_string(
                        f"{pdf_file_name}_middle.json",
                        json.dumps(middle_json, ensure_ascii=False, indent=4),
                    )
                if f_dump_model_output:
                    model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                    md_writer.write_string(f"{pdf_file_name}_model_output.txt", model_output)
                logger.info(f"local output dir is {local_md_dir}")

        def parse_doc(
            path_list: list[Path],
            output_dir,
            lang="ch",
            server_url=None,
            start_page_id=0,
            end_page_id=None,
        ):
            file_name_list = []
            pdf_bytes_list = []
            lang_list = []
            for path in path_list:
                file_name = str(Path(path).stem)
                pdf_bytes = read_fn(path)
                file_name_list.append(file_name)
                pdf_bytes_list.append(pdf_bytes)
                lang_list.append(lang)
            do_parse(
                output_dir=output_dir,
                pdf_file_names=file_name_list,
                pdf_bytes_list=pdf_bytes_list,
                server_url=server_url,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )

        __dir__ = os.path.dirname(os.path.abspath(__file__))
        pdf_files_dir = os.path.join(__dir__, "pdfs")
        output_dir = os.path.join(__dir__, "output")
        pdf_suffixes = [".pdf"]
        image_suffixes = [".png", ".jpeg", ".jpg"]
        doc_path_list = []
        for doc_path in Path(pdf_files_dir).glob("*"):
            if doc_path.suffix in pdf_suffixes + image_suffixes:
                doc_path_list.append(doc_path)
        # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
        parse_doc(doc_path_list, output_dir)
```
Added: the rewritten tests are module-level functions, with the dump logic and content assertions factored into the `write_infer_result`, `validate_html`, and `assert_content` helpers below:

```python
def test_pipeline_with_two_config():
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]
    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob("*"):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)
    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"

    pdf_file_names = []
    pdf_bytes_list = []
    p_lang_list = []
    for path in doc_path_list:
        file_name = str(Path(path).stem)
        pdf_bytes = read_fn(path)
        pdf_file_names.append(file_name)
        pdf_bytes_list.append(pdf_bytes)
        p_lang_list.append("en")
    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
        pdf_bytes_list[idx] = new_pdf_bytes

    # Get the pipeline analysis results and check both the "txt" and "ocr" parse methods
    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
        pipeline_doc_analyze(
            pdf_bytes_list,
            p_lang_list,
            parse_method="txt",
        )
    )
    write_infer_result(
        infer_results,
        all_image_lists,
        all_pdf_docs,
        lang_list,
        ocr_enabled_list,
        pdf_file_names,
        output_dir,
        parse_method="txt",
    )
    assert_content("tests/unittest/output/test/txt/test_content_list.json")

    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
        pipeline_doc_analyze(
            pdf_bytes_list,
            p_lang_list,
            parse_method="ocr",
        )
    )
    write_infer_result(
        infer_results,
        all_image_lists,
        all_pdf_docs,
        lang_list,
        ocr_enabled_list,
        pdf_file_names,
        output_dir,
        parse_method="ocr",
    )
    assert_content("tests/unittest/output/test/ocr/test_content_list.json")


def test_vlm_transformers_with_default_config():
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]
    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob("*"):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)
    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"

    pdf_file_names = []
    pdf_bytes_list = []
    p_lang_list = []
    for path in doc_path_list:
        file_name = str(Path(path).stem)
        pdf_bytes = read_fn(path)
        pdf_file_names.append(file_name)
        pdf_bytes_list.append(pdf_bytes)
        p_lang_list.append("en")

    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method="vlm")
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend="transformers")
        pdf_info = middle_json["pdf_info"]
        image_dir = str(os.path.basename(local_image_dir))
        md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
        md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
        content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )
        md_writer.write_string(
            f"{pdf_file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4),
        )
        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
        md_writer.write_string(f"{pdf_file_name}_model_output.txt", model_output)
        logger.info(f"local output dir is {local_md_dir}")

    assert_content("tests/unittest/output/test/vlm/test_content_list.json")
```
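With the `TestE2E` class gone, pytest collects these as plain `test_*` functions. An illustrative way to run just this module (my sketch, not part of the commit; assumes pytest plus the `test` extras from pyproject.toml are installed):

```python
# Illustrative runner: invokes pytest programmatically on this one module.
import pytest

if __name__ == "__main__":
    raise SystemExit(pytest.main(["tests/unittest/test_e2e.py", "-v"]))
```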
The shared dump helper:

```python
def write_infer_result(
    infer_results,
    all_image_lists,
    all_pdf_docs,
    lang_list,
    ocr_enabled_list,
    pdf_file_names,
    output_dir,
    parse_method,
):
    for idx, model_list in enumerate(infer_results):
        model_json = copy.deepcopy(model_list)
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

        images_list = all_image_lists[idx]
        pdf_doc = all_pdf_docs[idx]
        _lang = lang_list[idx]
        _ocr_enable = ocr_enabled_list[idx]
        middle_json = pipeline_result_to_middle_json(
            model_list,
            images_list,
            pdf_doc,
            image_writer,
            _lang,
            _ocr_enable,
            True,
        )
        pdf_info = middle_json["pdf_info"]
        image_dir = str(os.path.basename(local_image_dir))

        # Write the markdown file
        md_content_str = pipeline_union_make(pdf_info, MakeMode.MM_MD, image_dir)
        md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
        content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )
        md_writer.write_string(
            f"{pdf_file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4),
        )
        md_writer.write_string(
            f"{pdf_file_name}_model.json",
            json.dumps(model_json, ensure_ascii=False, indent=4),
        )
        logger.info(f"local output dir is {local_md_dir}")
```
The HTML sanity check:

```python
def validate_html(html_content):
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        return True
    except Exception as e:
        return False
```
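One note on what `validate_html` actually guards: BeautifulSoup's built-in `html.parser` is lenient and parses most malformed markup without raising, so the `try/except` rarely trips; the substantive table verification comes from the substring checks in `assert_content` below. A quick illustration (my example, not from the commit):

```python
from bs4 import BeautifulSoup

# html.parser tolerates unclosed tags, so this parses instead of raising;
# validate_html would return True for this fragment.
soup = BeautifulSoup("<table><tr><td>unclosed", "html.parser")
print(soup.find("td").get_text())  # -> "unclosed"
```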
And the content assertions:

```python
def assert_content(content_path):
    content_list = []
    with open(content_path, "r", encoding="utf-8") as file:
        content_list = json.load(file)
    type_set = set()
    for content_dict in content_list:
        match content_dict["type"]:
            # Image check: only validate the caption
            case "image":
                type_set.add("image")
                assert (
                    content_dict["image_caption"][0].strip().lower()
                    == "Figure 1: Figure Caption".lower()
                )
            # Table check: validate the caption, the table format, and the table content
            case "table":
                type_set.add("table")
                assert (
                    content_dict["table_caption"][0].strip().lower()
                    == "Table 1: Table Caption".lower()
                )
                assert validate_html(content_dict["table_body"])
                target_str_list = [
                    "Linear Regression",
                    "0.98740",
                    "1321.2",
                    "2-order Polynomial",
                    "0.99906",
                    "26.4",
                    "3-order Polynomial",
                    "0.99913",
                    "101.2",
                    "4-order Polynomial",
                    "0.99914",
                    "94.1",
                    "Gray Prediction",
                    "0.00617",
                    "687",
                ]
                correct_count = 0
                for target_str in target_str_list:
                    if target_str in content_dict["table_body"]:
                        correct_count += 1
                assert correct_count > 0.9 * len(target_str_list)
            # Formula check: verify that formula elements are present
            case "equation":
                type_set.add("equation")
                target_str_list = ["$$", "lambda", "frac", "bar"]
                for target_str in target_str_list:
                    assert target_str in content_dict["text"]
            # Text check: text similarity must exceed 90
            case "text":
                type_set.add("text")
                assert (
                    fuzz.ratio(
                        content_dict["text"],
                        "Trump graduated from the Wharton School of the University of Pennsylvania with a bachelor's degree in 1968. He became president of his father's real estate business in 1971 and renamed it The Trump Organization.",
                    )
                    > 90
                )
    assert len(type_set) >= 4
```
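`fuzz.ratio` returns an integer similarity score from 0 to 100, so the text check demands a near-verbatim match. For orientation, the shape of `test_content_list.json` that these assertions imply (inferred purely from the checks above, not from MinerU's documentation) is roughly:

```python
# Hypothetical entries that would satisfy assert_content; field names are
# inferred from the assertions, and values are abbreviated.
example_content_list = [
    {"type": "image", "image_caption": ["Figure 1: Figure Caption"]},
    {
        "type": "table",
        "table_caption": ["Table 1: Table Caption"],
        "table_body": "<table><tr><td>Linear Regression</td><td>0.98740</td><td>1321.2</td></tr></table>",
    },
    {"type": "equation", "text": r"$$ \bar{x} = \frac{\lambda}{2} $$"},
    # The real text must be near-verbatim to clear fuzz.ratio(...) > 90.
    {"type": "text", "text": "Trump graduated from the Wharton School ..."},
]
```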