Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1b35f044
"vscode:/vscode.git/clone" did not exist on "614acf2c28aaff37a22ac3ec9901c492ba325c8c"
Unverified
Commit
1b35f044
authored
Apr 16, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Apr 16, 2025
Browse files
Merge pull request #2252 from opendatalab/release-1.3.4
Release 1.3.4
parents
8f3c1780
0222293f
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
214 additions
and
29 deletions
+214
-29
README.md
README.md
+3
-0
README_zh-CN.md
README_zh-CN.md
+3
-0
docker/china/Dockerfile
docker/china/Dockerfile
+10
-0
docker/global/Dockerfile
docker/global/Dockerfile
+10
-0
magic_pdf/model/sub_modules/model_utils.py
magic_pdf/model/sub_modules/model_utils.py
+59
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+8
-4
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+1
-1
magic_pdf/utils/office_to_pdf.py
magic_pdf/utils/office_to_pdf.py
+100
-5
tests/unittest/test_table/test_rapidtable.py
tests/unittest/test_table/test_rapidtable.py
+20
-18
No files found.
README.md
View file @
1b35f044
...
...
@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div>
# Changelog
-
2025/04/16 1.3.4 released
-
Slightly improved the speed of OCR detection by removing some unused blocks.
-
Fixed page-level sorting errors caused by footnotes in certain cases.
-
2025/04/12 1.3.2 released
-
Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
-
Optimized memory usage during batch inference.
...
...
README_zh-CN.md
View file @
1b35f044
...
...
@@ -47,6 +47,9 @@
</div>
# 更新记录
-
2025/04/16 1.3.4 发布
-
通过移除一些无用的块,小幅提升了ocr-det的速度
-
修复部分情况下由footnote导致的页面内排序错误
-
2025/04/12 1.3.2 发布
-
修复了windows系统下,在python3.13环境安装时一些依赖包版本不兼容的问题
-
优化批量推理时的内存占用
...
...
docker/china/Dockerfile
View file @
1b35f044
...
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
wget
\
git
\
libgl1
\
libreoffice
\
fonts-noto-cjk
\
fonts-wqy-zenhei
\
fonts-wqy-microhei
\
ttf-mscorefonts-installer
\
fontconfig
\
libglib2.0-0
\
libxrender1
\
libsm6
\
libxext6
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
# Set Python 3.10 as the default python3
...
...
docker/global/Dockerfile
View file @
1b35f044
...
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
wget
\
git
\
libgl1
\
libreoffice
\
fonts-noto-cjk
\
fonts-wqy-zenhei
\
fonts-wqy-microhei
\
ttf-mscorefonts-installer
\
fontconfig
\
libglib2.0-0
\
libxrender1
\
libsm6
\
libxext6
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
# Set Python 3.10 as the default python3
...
...
magic_pdf/model/sub_modules/model_utils.py
View file @
1b35f044
...
...
@@ -2,6 +2,8 @@ import time
import
torch
from
loguru
import
logger
import
numpy
as
np
from
magic_pdf.libs.boxbase
import
get_minbox_if_overlap_by_ratio
from
magic_pdf.libs.clean_memory
import
clean_memory
...
...
@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
return
[
table
for
i
,
table
in
enumerate
(
table_res_list
)
if
i
not
in
big_tables_idx
]
def remove_overlaps_min_blocks(res_list):
    """Fold heavily overlapping blocks into their larger partner.

    A smaller overlapping block is not simply dropped: its area is first
    merged into the larger block, and only then is the smaller one removed.

    Every ordered pair of unequal entries is handed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    returns a box, the first entry of ``res_list`` whose ``'bbox'`` equals it
    is taken as the block to remove; the other member of the pair has its
    ``'bbox'`` replaced by the union of both boxes.

    Args:
        res_list: List of dicts, each carrying a four-value ``'bbox'``.
            The list and the surviving dicts are mutated in place.

    Returns:
        A ``(res_list, need_remove)`` tuple: the same list object with the
        absorbed blocks removed, and the list of blocks that were removed.
    """
    absorbed = []
    for first in res_list:
        for second in res_list:
            if first == second:
                continue
            min_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.8)
            if min_box is None:
                continue
            smaller = next(
                (candidate for candidate in res_list if candidate['bbox'] == min_box),
                None,
            )
            # Each block is absorbed at most once.
            if smaller is None or smaller in absorbed:
                continue
            larger = second if first == smaller else first
            left, top, right, bottom = larger['bbox']
            s_left, s_top, s_right, s_bottom = smaller['bbox']
            # Grow the surviving block to the union of the two boxes.
            larger['bbox'] = [
                min(left, s_left),
                min(top, s_top),
                max(right, s_right),
                max(bottom, s_bottom),
            ]
            absorbed.append(smaller)

    # Removal is deferred so the list is never modified while being iterated.
    for block in absorbed:
        res_list.remove(block)
    return res_list, absorbed
def
get_res_list_from_layout_res
(
layout_res
,
iou_threshold
=
0.7
,
overlap_threshold
=
0.8
,
area_threshold
=
0.8
):
"""Extract OCR, table and other regions from layout results."""
ocr_res_list
=
[]
text_res_list
=
[]
table_res_list
=
[]
table_indices
=
[]
single_page_mfdetrec_res
=
[]
...
...
@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
"bbox"
:
[
int
(
res
[
'poly'
][
0
]),
int
(
res
[
'poly'
][
1
]),
int
(
res
[
'poly'
][
4
]),
int
(
res
[
'poly'
][
5
])],
})
elif
category_id
in
[
0
,
1
,
2
,
4
,
6
,
7
]:
# OCR regions
elif
category_id
in
[
0
,
2
,
4
,
6
,
7
]:
# OCR regions
ocr_res_list
.
append
(
res
)
elif
category_id
==
5
:
# Table regions
table_res_list
.
append
(
res
)
table_indices
.
append
(
i
)
elif
category_id
in
[
1
]:
# Text regions
res
[
'bbox'
]
=
[
int
(
res
[
'poly'
][
0
]),
int
(
res
[
'poly'
][
1
]),
int
(
res
[
'poly'
][
4
]),
int
(
res
[
'poly'
][
5
])]
text_res_list
.
append
(
res
)
# Process tables: merge high IoU tables first, then filter nested tables
table_res_list
,
table_indices
=
merge_high_iou_tables
(
...
...
@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
for
idx
in
sorted
(
to_remove
,
reverse
=
True
):
del
layout_res
[
idx
]
# Remove overlaps in OCR and text regions
text_res_list
,
need_remove
=
remove_overlaps_min_blocks
(
text_res_list
)
for
res
in
text_res_list
:
# 将res的poly使用bbox重构
res
[
'poly'
]
=
[
res
[
'bbox'
][
0
],
res
[
'bbox'
][
1
],
res
[
'bbox'
][
2
],
res
[
'bbox'
][
1
],
res
[
'bbox'
][
2
],
res
[
'bbox'
][
3
],
res
[
'bbox'
][
0
],
res
[
'bbox'
][
3
]]
# 删除res的bbox
del
res
[
'bbox'
]
ocr_res_list
.
extend
(
text_res_list
)
if
len
(
need_remove
)
>
0
:
for
res
in
need_remove
:
del
res
[
'bbox'
]
layout_res
.
remove
(
res
)
return
ocr_res_list
,
filtered_table_res_list
,
single_page_mfdetrec_res
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
1b35f044
...
...
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
return
[[
x0
,
y0
,
x1
,
y1
]]
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
,
footnote_blocks
):
page_line_list
=
[]
def
add_lines_to_block
(
b
):
...
...
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
block
[
'real_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
add_lines_to_block
(
block
)
for
block
in
footnote_blocks
:
footnote_block
=
{
'bbox'
:
block
[:
4
]}
add_lines_to_block
(
footnote_block
)
if
len
(
page_line_list
)
>
200
:
# layoutreader最高支持512line
return
None
...
...
@@ -779,7 +783,7 @@ def parse_page_core(
# interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
...
...
@@ -790,7 +794,7 @@ def parse_page_core(
page_h
,
)
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
...
...
@@ -866,7 +870,7 @@ def parse_page_core(
line_height
=
get_line_height
(
fix_blocks
)
"""获取所有line并对line排序"""
sorted_bboxes
=
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
)
sorted_bboxes
=
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
,
footnote_blocks
)
"""根据line的中位数算block的序列关系"""
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
1b35f044
...
...
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
def
find_blocks_under_footnote
(
all_bboxes
,
footnote_blocks
):
...
...
magic_pdf/utils/office_to_pdf.py
View file @
1b35f044
import
os
import
subprocess
import
platform
from
pathlib
import
Path
import
shutil
class
ConvertToPdfError
(
Exception
):
...
...
@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
super
().
__init__
(
self
.
msg
)
# Chinese font list
REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']


def check_fonts_installed():
    """Check that at least one required Chinese font is installed.

    On Windows the ``.ttf`` file names in ``C:/Windows/Fonts`` are searched
    for any name in ``REQUIRED_CHS_FONTS``. On every other platform the
    output of ``fc-list :lang=zh`` is searched instead.

    Returns:
        True when one of the required fonts is found.

    Raises:
        EnvironmentError: If none of the required fonts is found
            ("Missing Chinese font ..."), or if ``fc-list`` cannot be run or
            its output cannot be decoded ("Font detection failed ...").
    """
    missing_font_msg = (
        f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
    )

    if platform.system() == 'Windows':
        # Windows: look at the file names in the system font folder.
        # NOTE(review): this is a case-sensitive substring match against
        # *.ttf file names only (other font file types are not considered);
        # confirm it matches how these fonts are actually named on disk.
        font_dir = Path("C:/Windows/Fonts")
        installed_fonts = [f.name for f in font_dir.glob("*.ttf")]
        if any(font in name for font in REQUIRED_CHS_FONTS for name in installed_fonts):
            return True
        raise EnvironmentError(missing_font_msg)

    # Linux/macOS: use fc-list.
    # Only the external call is guarded, so a genuine "missing font" result
    # below is not re-labelled as a detection failure.
    try:
        output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        raise EnvironmentError(
            f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}"
        ) from e

    if any(font in output for font in REQUIRED_CHS_FONTS):
        return True
    raise EnvironmentError(missing_font_msg)
def get_soffice_command():
    """Return the path to LibreOffice's soffice executable.

    A ``soffice`` found on ``PATH`` takes precedence. Otherwise a fixed list
    of install locations is probed: Windows locations when
    ``platform.system()`` reports ``'Windows'``, Linux/macOS locations on any
    other platform.

    Returns:
        str: Path of the first ``soffice`` executable found.

    Raises:
        ConvertToPdfError: If no executable is found; the message contains
            installation instructions.
    """
    # First check if soffice is in PATH
    soffice_path = shutil.which('soffice')
    if soffice_path:
        return soffice_path

    if platform.system() == 'Windows':
        # Check common installation paths
        possible_paths = [
            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe'),
        ]
        # Check other drives for windows
        possible_paths.extend(
            Path(f"{drive}/LibreOffice/program/soffice.exe")
            for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']
        )

        for path in possible_paths:
            if path.exists():
                return str(path)

        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
            "or ensure soffice.exe is in your PATH environment variable."
        )

    # Linux/macOS: try standard locations. The "not found" error is raised
    # directly rather than inside a catch-all try/except, so callers see the
    # installation instructions without an "Error locating LibreOffice" prefix.
    possible_paths = [
        '/usr/bin/soffice',
        '/usr/local/bin/soffice',
        '/opt/libreoffice/program/soffice',
        '/Applications/LibreOffice.app/Contents/MacOS/soffice',
    ]
    for path in possible_paths:
        if os.path.exists(path):
            return path

    raise ConvertToPdfError(
        "LibreOffice not found. Please install it:\n"
        " - Ubuntu/Debian: sudo apt-get install libreoffice\n"
        " - CentOS/RHEL: sudo yum install libreoffice\n"
        " - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
        " - Or ensure soffice is in your PATH environment variable."
    )
def convert_file_to_pdf(input_path, output_dir):
    """Convert a single document (ppt, doc, etc.) to PDF.

    Runs LibreOffice headless with ``--convert-to pdf`` and writes the result
    into ``output_dir``, which is created if it does not exist.

    Args:
        input_path: Path of the document to convert; must be an existing file.
        output_dir: Directory that receives the generated PDF.

    Raises:
        FileNotFoundError: If ``input_path`` is not an existing file. This is
            checked before ``output_dir`` is created.
        ConvertToPdfError: If the LibreOffice process exits with a non-zero
            return code.
        Exception: Whatever ``check_fonts_installed`` or
            ``get_soffice_command`` raise is propagated unchanged.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    check_fonts_installed()

    soffice_cmd = get_soffice_command()

    # The resolved executable is the only program entry in the argument list.
    cmd = [
        soffice_cmd,
        '--headless',
        '--norestore',
        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path),
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        # errors='replace' keeps the failure path from raising a decode error
        # when stderr is not valid UTF-8.
        stderr_text = process.stderr.decode(errors='replace')
        raise ConvertToPdfError(f"LibreOffice convert failed: {stderr_text}")
tests/unittest/test_table/test_table
master
.py
→
tests/unittest/test_table/test_
rapid
table.py
View file @
1b35f044
...
...
@@ -2,31 +2,34 @@ import unittest
from
PIL
import
Image
from
lxml
import
etree
from
magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle
import
TableMasterPaddleModel
from
magic_pdf.model.sub_modules.model_init
import
AtomModelSingleton
from
magic_pdf.model.sub_modules.table.rapidtable.rapid_table
import
RapidTableModel
class
TestppTableModel
(
unittest
.
TestCase
):
def
test_image2html
(
self
):
img
=
Image
.
open
(
"tests/unittest/test_table/assets/table.jpg"
)
# 修改table模型路径
config
=
{
"device"
:
"cuda"
,
"model_dir"
:
"/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"
}
table_model
=
TableMasterPaddleModel
(
config
)
res
=
table_model
.
img2html
(
img
)
img
=
Image
.
open
(
"assets/table.jpg"
)
atom_model_manager
=
AtomModelSingleton
()
ocr_engine
=
atom_model_manager
.
get_atom_model
(
atom_model_name
=
'ocr'
,
ocr_show_log
=
False
,
det_db_box_thresh
=
0.5
,
det_db_unclip_ratio
=
1.6
,
lang
=
'ch'
)
table_model
=
RapidTableModel
(
ocr_engine
,
'slanet_plus'
)
html_code
,
table_cell_bboxes
,
logic_points
,
elapse
=
table_model
.
predict
(
img
)
# 验证生成的 HTML 是否符合预期
parser
=
etree
.
HTMLParser
()
tree
=
etree
.
fromstring
(
res
,
parser
)
tree
=
etree
.
fromstring
(
html_code
,
parser
)
# 检查 HTML 结构
assert
tree
.
find
(
'.//table'
)
is
not
None
,
"HTML should contain a <table> element"
assert
tree
.
find
(
'.//thead'
)
is
not
None
,
"HTML should contain a <thead> element"
assert
tree
.
find
(
'.//tbody'
)
is
not
None
,
"HTML should contain a <tbody> element"
assert
tree
.
find
(
'.//tr'
)
is
not
None
,
"HTML should contain a <tr> element"
assert
tree
.
find
(
'.//td'
)
is
not
None
,
"HTML should contain a <td> element"
# 检查具体的表格内容
headers
=
tree
.
xpath
(
'//thead/tr/td/b'
)
print
(
headers
)
# Print headers for debugging
headers
=
tree
.
xpath
(
'//table/tr[1]/td'
)
assert
len
(
headers
)
==
5
,
"Thead should have 5 columns"
assert
headers
[
0
].
text
and
headers
[
0
].
text
.
strip
()
==
"Methods"
,
"First header should be 'Methods'"
assert
headers
[
1
].
text
and
headers
[
1
].
text
.
strip
()
==
"R"
,
"Second header should be 'R'"
...
...
@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase):
assert
headers
[
4
].
text
and
headers
[
4
].
text
.
strip
()
==
"FPS"
,
"Fifth header should be 'FPS'"
# 检查第一行数据
first_row
=
tree
.
xpath
(
'//t
body
/tr[
1
]/td'
)
first_row
=
tree
.
xpath
(
'//t
able
/tr[
2
]/td'
)
assert
len
(
first_row
)
==
5
,
"First row should have 5 cells"
assert
first_row
[
0
].
text
and
first_row
[
0
].
text
.
strip
()
==
"SegLink[26]"
,
"First cell should be 'SegLink[26]'"
assert
first_row
[
1
].
text
and
first_row
[
1
].
text
.
strip
()
==
"70.0"
,
"Second cell should be '70.0'"
...
...
@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase):
assert
first_row
[
4
].
text
and
first_row
[
4
].
text
.
strip
()
==
"8.9"
,
"Fifth cell should be '8.9'"
# 检查倒数第二行数据
second_last_row
=
tree
.
xpath
(
'//t
body
/tr[position()=last()-1]/td'
)
second_last_row
=
tree
.
xpath
(
'//t
able
/tr[position()=last()-1]/td'
)
assert
len
(
second_last_row
)
==
5
,
"second_last_row should have 5 cells"
assert
second_last_row
[
0
].
text
and
second_last_row
[
0
].
text
.
strip
()
==
"Ours (SynText)"
,
"First cell should be 'Ours (SynText)'"
assert
second_last_row
[
0
].
text
and
second_last_row
[
0
].
text
.
strip
()
==
"Ours (SynText)"
,
"First cell should be 'Ours (SynText)'"
assert
second_last_row
[
1
].
text
and
second_last_row
[
1
].
text
.
strip
()
==
"80.68"
,
"Second cell should be '80.68'"
assert
second_last_row
[
2
].
text
and
second_last_row
[
2
].
text
.
strip
()
==
"85.40"
,
"Third cell should be '85.40'"
assert
second_last_row
[
3
].
text
and
second_last_row
[
3
].
text
.
strip
()
==
"82.97"
,
"Fourth cell should be '82.97'"
assert
second_last_row
[
3
].
text
and
second_last_row
[
4
].
text
.
strip
()
==
"12.68"
,
"Fifth cell should be '12.68'"
#
assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
#
assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment