wangsen / MinerU

Commit 6be1453c
Authored Jul 16, 2025 by Sidney233

test: Update test_e2e.py

Parent: da048cbf
Showing 4 changed files with 276 additions and 312 deletions (+276, -312).
mkdocs.yml                    +3    -0
pyproject.toml                +7    -3
tests/unittest/pdfs/test.pdf  +0    -0
tests/unittest/test_e2e.py    +266  -309
mkdocs.yml

@@ -78,6 +78,9 @@ plugins:
   - search
   - i18n:
       docs_structure: folder
+      fallback_to_default: true
+      reconfigure_material: true
+      reconfigure_search: true
       languages:
         - locale: en
           default: true
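
The three added keys are options of the mkdocs-static-i18n plugin: fallback_to_default serves the default-language page when a translation is missing, and the reconfigure_material / reconfigure_search flags let the plugin re-wire the Material theme and the search plugin per locale. A minimal sketch of sanity-checking the new options (not part of this commit; assumes PyYAML is available and that mkdocs.yml parses as plain YAML without custom tags):

import yaml

with open("mkdocs.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# "plugins" is a list mixing plain strings ("search") and one-key mappings ({"i18n": {...}})
i18n = next(p["i18n"] for p in config["plugins"] if isinstance(p, dict) and "i18n" in p)
assert i18n["fallback_to_default"] is True
assert i18n["reconfigure_material"] is True
assert i18n["reconfigure_search"] is True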
pyproject.toml

@@ -43,7 +43,8 @@ test = [
     "pytest",
     "pytest-cov",
     "coverage",
-    "beautifulsoup4"
+    "beautifulsoup4",
+    "fuzzywuzzy"
 ]
 vlm = [
     "transformers>=4.51.1",

@@ -150,7 +151,11 @@ omit = [
     "*/cli_parser.py",
     "*/run_async.py"
 ]
 [tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.report]
 exclude_also = [
     'def __repr__',
     'if self.debug:',

@@ -162,5 +167,4 @@ exclude_also = [
     'if TYPE_CHECKING:',
     'class .*\bProtocol\):',
     '@(abc\.)?abstractmethod',
 ]
-directory = "htmlcov"
\ No newline at end of file
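
fuzzywuzzy joins beautifulsoup4 in the test extra; together they back the new imports in tests/unittest/test_e2e.py below, where BeautifulSoup parses the extracted table HTML and fuzzywuzzy scores text similarity. A quick illustration of the two libraries (a sketch, not part of this commit):

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

# fuzz.ratio returns a 0-100 Levenshtein similarity score; assert_content
# below requires a score above 90.
print(fuzz.ratio("this is a test", "this is a test!"))  # 97

# BeautifulSoup parses the table_body HTML so its structure can be inspected.
soup = BeautifulSoup("<table><tr><td>26.4</td></tr></table>", "html.parser")
print(soup.find("td").text)  # 26.4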
tests/unittest/pdfs/test.pdf

Binary file; no preview for this file type.
tests/unittest/test_e2e.py

@@ -3,17 +3,15 @@ import copy
 import json
 import os
 from pathlib import Path
 
-from cryptography.hazmat.backends.openssl import backend
 from loguru import logger
+from bs4 import BeautifulSoup
+from fuzzywuzzy import fuzz
 
 from mineru.cli.common import (
     convert_pdf_bytes_to_bytes_by_pypdfium2,
     prepare_env,
     read_fn,
 )
 from mineru.data.data_reader_writer import FileBasedDataWriter
-from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
 from mineru.utils.enum_class import MakeMode
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze

@@ -24,313 +22,272 @@ from mineru.backend.pipeline.model_json_to_middle_json import (
     result_to_middle_json as pipeline_result_to_middle_json,
 )
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
-from mineru.utils.models_download_utils import auto_download_and_get_model_root_path

The remainder of the hunk swaps the old class TestE2E wrapper (the demo-style do_parse/parse_doc flow with its f_draw_*/f_dump_* flags and page-range arguments) for module-level tests and helpers. The updated file continues:

def test_pipeline_with_two_config():
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob("*"):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"

    pdf_file_names = []
    pdf_bytes_list = []
    p_lang_list = []
    for path in doc_path_list:
        file_name = str(Path(path).stem)
        pdf_bytes = read_fn(path)
        pdf_file_names.append(file_name)
        pdf_bytes_list.append(pdf_bytes)
        p_lang_list.append("en")

    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
        pdf_bytes_list[idx] = new_pdf_bytes

    # Get the pipeline analysis results; check both the txt and ocr parse methods
    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
        pipeline_doc_analyze(
            pdf_bytes_list,
            p_lang_list,
            parse_method="txt",
        )
    )
    write_infer_result(
        infer_results,
        all_image_lists,
        all_pdf_docs,
        lang_list,
        ocr_enabled_list,
        pdf_file_names,
        output_dir,
        parse_method="txt",
    )
    assert_content("tests/unittest/output/test/txt/test_content_list.json")

    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
        pipeline_doc_analyze(
            pdf_bytes_list,
            p_lang_list,
            parse_method="ocr",
        )
    )
    write_infer_result(
        infer_results,
        all_image_lists,
        all_pdf_docs,
        lang_list,
        ocr_enabled_list,
        pdf_file_names,
        output_dir,
        parse_method="ocr",
    )
    assert_content("tests/unittest/output/test/ocr/test_content_list.json")
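
# Aside (sketch only, not part of this commit): convert_pdf_bytes_to_bytes_by_pypdfium2
# above round-trips the document through pypdfium2 to normalize it; the page-range
# arguments the old code passed were dropped. A hedged equivalent of that
# normalization step, assuming the pypdfium2 package API:
import io
import pypdfium2 as pdfium

def normalize_pdf_bytes(pdf_bytes: bytes) -> bytes:
    pdf = pdfium.PdfDocument(pdf_bytes)  # load from in-memory bytes
    buf = io.BytesIO()
    pdf.save(buf)                        # re-serialize the document
    return buf.getvalue()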
def test_vlm_transformers_with_default_config():
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob("*"):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"

    pdf_file_names = []
    pdf_bytes_list = []
    p_lang_list = []
    for path in doc_path_list:
        file_name = str(Path(path).stem)
        pdf_bytes = read_fn(path)
        pdf_file_names.append(file_name)
        pdf_bytes_list.append(pdf_bytes)
        p_lang_list.append("en")

    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method="vlm")
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend="transformers")
        pdf_info = middle_json["pdf_info"]
        image_dir = str(os.path.basename(local_image_dir))
        md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
        md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
        content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )
        md_writer.write_string(
            f"{pdf_file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4),
        )
        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
        md_writer.write_string(f"{pdf_file_name}_model_output.txt", model_output)
        logger.info(f"local output dir is {local_md_dir}")

    assert_content("tests/unittest/output/test/vlm/test_content_list.json")
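
# Aside (sketch, not in this commit): the test pins backend="transformers". The
# removed code carried a server_url parameter for a vlm-sglang-client backend;
# the names below are taken from that removed path, so treat them as assumptions
# about the current API rather than a tested configuration.
def analyze_with_sglang_server(pdf_bytes: bytes, image_dir: str, server_url: str):
    image_writer = FileBasedDataWriter(image_dir)
    return vlm_doc_analyze(
        pdf_bytes,
        image_writer=image_writer,
        backend="sglang-client",  # assumed backend name from the removed code
        server_url=server_url,    # e.g. address of a running sglang server
    )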
def write_infer_result(
    infer_results,
    all_image_lists,
    all_pdf_docs,
    lang_list,
    ocr_enabled_list,
    pdf_file_names,
    output_dir,
    parse_method,
):
    for idx, model_list in enumerate(infer_results):
        model_json = copy.deepcopy(model_list)
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        images_list = all_image_lists[idx]
        pdf_doc = all_pdf_docs[idx]
        _lang = lang_list[idx]
        _ocr_enable = ocr_enabled_list[idx]
        middle_json = pipeline_result_to_middle_json(
            model_list,
            images_list,
            pdf_doc,
            image_writer,
            _lang,
            _ocr_enable,
            True,
        )
        pdf_info = middle_json["pdf_info"]
        image_dir = str(os.path.basename(local_image_dir))
        # Write the markdown file
        md_content_str = pipeline_union_make(pdf_info, MakeMode.MM_MD, image_dir)
        md_writer.write_string(f"{pdf_file_name}.md", md_content_str)
        content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )
        md_writer.write_string(
            f"{pdf_file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4),
        )
        md_writer.write_string(
            f"{pdf_file_name}_model.json",
            json.dumps(model_json, ensure_ascii=False, indent=4),
        )
        logger.info(f"local output dir is {local_md_dir}")
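
# Note: with the bundled tests/unittest/pdfs/test.pdf and parse_method="txt",
# prepare_env above resolves the writer directories so that outputs land where
# assert_content reads them back (layout inferred from the paths the tests
# assert on; treat as an assumption):
#
#   tests/unittest/output/test/txt/test.md
#   tests/unittest/output/test/txt/test_content_list.json
#   tests/unittest/output/test/txt/test_middle.json
#   tests/unittest/output/test/txt/test_model.json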
def validate_html(html_content):
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        return True
    except Exception as e:
        return False
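
# Aside (sketch, not part of this commit): html.parser is lenient and almost
# never raises, so validate_html returns True for nearly any input. A stricter
# hypothetical check could require an actual table structure:
def validate_table_html(html_content: str) -> bool:
    soup = BeautifulSoup(html_content, "html.parser")
    table = soup.find("table")
    return table is not None and len(table.find_all("tr")) > 0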
def assert_content(content_path):
    content_list = []
    with open(content_path, "r", encoding="utf-8") as file:
        content_list = json.load(file)
    type_set = set()
    for content_dict in content_list:
        match content_dict["type"]:
            # Image check: only the caption is verified
            case "image":
                type_set.add("image")
                assert (
                    content_dict["image_caption"][0].strip().lower()
                    == "Figure 1: Figure Caption".lower()
                )
            # Table check: verify the caption, the table format, and the table content
            case "table":
                type_set.add("table")
                assert (
                    content_dict["table_caption"][0].strip().lower()
                    == "Table 1: Table Caption".lower()
                )
                assert validate_html(content_dict["table_body"])
                target_str_list = [
                    "Linear Regression",
                    "0.98740",
                    "1321.2",
                    "2-order Polynomial",
                    "0.99906",
                    "26.4",
                    "3-order Polynomial",
                    "0.99913",
                    "101.2",
                    "4-order Polynomial",
                    "0.99914",
                    "94.1",
                    "Gray Prediction",
                    "0.00617",
                    "687",
                ]
                correct_count = 0
                for target_str in target_str_list:
                    if target_str in content_dict["table_body"]:
                        correct_count += 1
                assert correct_count > 0.9 * len(target_str_list)
            # Formula check: verify that formula elements are present
            case "equation":
                type_set.add("equation")
                target_str_list = ["$$", "lambda", "frac", "bar"]
                for target_str in target_str_list:
                    assert target_str in content_dict["text"]
            # Text check: text similarity must exceed 90
            case "text":
                type_set.add("text")
                assert (
                    fuzz.ratio(
                        content_dict["text"],
                        "Trump graduated from the Wharton School of the University of Pennsylvania with a bachelor's degree in 1968. He became president of his father's real estate business in 1971 and renamed it The Trump Organization.",
                    )
                    > 90
                )
    assert len(type_set) >= 4
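
Note that assert_content relies on match/case, so the updated suite requires Python 3.10 or newer. A minimal illustration of the dispatch pattern it uses (a sketch, not part of the commit):

import sys
assert sys.version_info >= (3, 10)  # match/case needs Python 3.10+

def kind(block: dict) -> str:
    match block["type"]:
        case "image" | "table" | "equation" | "text":
            return block["type"]
        case _:
            return "other"

print(kind({"type": "table"}))  # table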