Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1b35f044
Unverified
Commit
1b35f044
authored
Apr 16, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Apr 16, 2025
Browse files
Merge pull request #2252 from opendatalab/release-1.3.4
Release 1.3.4
parents
8f3c1780
0222293f
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
214 additions
and
29 deletions
+214
-29
README.md
README.md
+3
-0
README_zh-CN.md
README_zh-CN.md
+3
-0
docker/china/Dockerfile
docker/china/Dockerfile
+10
-0
docker/global/Dockerfile
docker/global/Dockerfile
+10
-0
magic_pdf/model/sub_modules/model_utils.py
magic_pdf/model/sub_modules/model_utils.py
+59
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+8
-4
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+1
-1
magic_pdf/utils/office_to_pdf.py
magic_pdf/utils/office_to_pdf.py
+100
-5
tests/unittest/test_table/test_rapidtable.py
tests/unittest/test_table/test_rapidtable.py
+20
-18
No files found.
README.md
View file @
1b35f044
...
@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
...
@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div>
</div>
# Changelog
# Changelog
-
2025/04/16 1.3.4 Released
-
Slightly improved the speed of OCR detection by removing some unused blocks.
-
Fixed page-level sorting errors caused by footnotes in certain cases.
-
2025/04/12 1.3.2 released
-
2025/04/12 1.3.2 released
-
Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
-
Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
-
Optimized memory usage during batch inference.
-
Optimized memory usage during batch inference.
...
...
README_zh-CN.md
View file @
1b35f044
...
@@ -47,6 +47,9 @@
...
@@ -47,6 +47,9 @@
</div>
</div>
# 更新记录
# 更新记录
-
2025/04/16 1.3.4 发布
-
通过移除一些无用的块,小幅提升了ocr-det的速度
-
修复部分情况下由footnote导致的页面内排序错误
-
2025/04/12 1.3.2 发布
-
2025/04/12 1.3.2 发布
-
修复了windows系统下,在python3.13环境安装时一些依赖包版本不兼容的问题
-
修复了windows系统下,在python3.13环境安装时一些依赖包版本不兼容的问题
-
优化批量推理时的内存占用
-
优化批量推理时的内存占用
...
...
docker/china/Dockerfile
View file @
1b35f044
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
wget
\
wget
\
git
\
git
\
libgl1
\
libgl1
\
libreoffice
\
fonts-noto-cjk
\
fonts-wqy-zenhei
\
fonts-wqy-microhei
\
ttf-mscorefonts-installer
\
fontconfig
\
libglib2.0-0
\
libglib2.0-0
\
libxrender1
\
libsm6
\
libxext6
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
&&
rm
-rf
/var/lib/apt/lists/
*
# Set Python 3.10 as the default python3
# Set Python 3.10 as the default python3
...
...
docker/global/Dockerfile
View file @
1b35f044
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
wget
\
wget
\
git
\
git
\
libgl1
\
libgl1
\
libreoffice
\
fonts-noto-cjk
\
fonts-wqy-zenhei
\
fonts-wqy-microhei
\
ttf-mscorefonts-installer
\
fontconfig
\
libglib2.0-0
\
libglib2.0-0
\
libxrender1
\
libsm6
\
libxext6
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
&&
rm
-rf
/var/lib/apt/lists/
*
# Set Python 3.10 as the default python3
# Set Python 3.10 as the default python3
...
...
magic_pdf/model/sub_modules/model_utils.py
View file @
1b35f044
...
@@ -2,6 +2,8 @@ import time
...
@@ -2,6 +2,8 @@ import time
import
torch
import
torch
from
loguru
import
logger
from
loguru
import
logger
import
numpy
as
np
import
numpy
as
np
from
magic_pdf.libs.boxbase
import
get_minbox_if_overlap_by_ratio
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.libs.clean_memory
import
clean_memory
...
@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
...
@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
return
[
table
for
i
,
table
in
enumerate
(
table_res_list
)
if
i
not
in
big_tables_idx
]
return
[
table
for
i
,
table
in
enumerate
(
table_res_list
)
if
i
not
in
big_tables_idx
]
def
remove_overlaps_min_blocks
(
res_list
):
# 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
# 删除重叠blocks中较小的那些
need_remove
=
[]
for
res1
in
res_list
:
for
res2
in
res_list
:
if
res1
!=
res2
:
overlap_box
=
get_minbox_if_overlap_by_ratio
(
res1
[
'bbox'
],
res2
[
'bbox'
],
0.8
)
if
overlap_box
is
not
None
:
res_to_remove
=
next
(
(
res
for
res
in
res_list
if
res
[
'bbox'
]
==
overlap_box
),
None
,
)
if
(
res_to_remove
is
not
None
and
res_to_remove
not
in
need_remove
):
large_res
=
res1
if
res1
!=
res_to_remove
else
res2
x1
,
y1
,
x2
,
y2
=
large_res
[
'bbox'
]
sx1
,
sy1
,
sx2
,
sy2
=
res_to_remove
[
'bbox'
]
x1
=
min
(
x1
,
sx1
)
y1
=
min
(
y1
,
sy1
)
x2
=
max
(
x2
,
sx2
)
y2
=
max
(
y2
,
sy2
)
large_res
[
'bbox'
]
=
[
x1
,
y1
,
x2
,
y2
]
need_remove
.
append
(
res_to_remove
)
if
len
(
need_remove
)
>
0
:
for
res
in
need_remove
:
res_list
.
remove
(
res
)
return
res_list
,
need_remove
def
get_res_list_from_layout_res
(
layout_res
,
iou_threshold
=
0.7
,
overlap_threshold
=
0.8
,
area_threshold
=
0.8
):
def
get_res_list_from_layout_res
(
layout_res
,
iou_threshold
=
0.7
,
overlap_threshold
=
0.8
,
area_threshold
=
0.8
):
"""Extract OCR, table and other regions from layout results."""
"""Extract OCR, table and other regions from layout results."""
ocr_res_list
=
[]
ocr_res_list
=
[]
text_res_list
=
[]
table_res_list
=
[]
table_res_list
=
[]
table_indices
=
[]
table_indices
=
[]
single_page_mfdetrec_res
=
[]
single_page_mfdetrec_res
=
[]
...
@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
...
@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
"bbox"
:
[
int
(
res
[
'poly'
][
0
]),
int
(
res
[
'poly'
][
1
]),
"bbox"
:
[
int
(
res
[
'poly'
][
0
]),
int
(
res
[
'poly'
][
1
]),
int
(
res
[
'poly'
][
4
]),
int
(
res
[
'poly'
][
5
])],
int
(
res
[
'poly'
][
4
]),
int
(
res
[
'poly'
][
5
])],
})
})
elif
category_id
in
[
0
,
1
,
2
,
4
,
6
,
7
]:
# OCR regions
elif
category_id
in
[
0
,
2
,
4
,
6
,
7
]:
# OCR regions
ocr_res_list
.
append
(
res
)
ocr_res_list
.
append
(
res
)
elif
category_id
==
5
:
# Table regions
elif
category_id
==
5
:
# Table regions
table_res_list
.
append
(
res
)
table_res_list
.
append
(
res
)
table_indices
.
append
(
i
)
table_indices
.
append
(
i
)
elif
category_id
in
[
1
]:
# Text regions
res
[
'bbox'
]
=
[
int
(
res
[
'poly'
][
0
]),
int
(
res
[
'poly'
][
1
]),
int
(
res
[
'poly'
][
4
]),
int
(
res
[
'poly'
][
5
])]
text_res_list
.
append
(
res
)
# Process tables: merge high IoU tables first, then filter nested tables
# Process tables: merge high IoU tables first, then filter nested tables
table_res_list
,
table_indices
=
merge_high_iou_tables
(
table_res_list
,
table_indices
=
merge_high_iou_tables
(
...
@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
...
@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
for
idx
in
sorted
(
to_remove
,
reverse
=
True
):
for
idx
in
sorted
(
to_remove
,
reverse
=
True
):
del
layout_res
[
idx
]
del
layout_res
[
idx
]
# Remove overlaps in OCR and text regions
text_res_list
,
need_remove
=
remove_overlaps_min_blocks
(
text_res_list
)
for
res
in
text_res_list
:
# 将res的poly使用bbox重构
res
[
'poly'
]
=
[
res
[
'bbox'
][
0
],
res
[
'bbox'
][
1
],
res
[
'bbox'
][
2
],
res
[
'bbox'
][
1
],
res
[
'bbox'
][
2
],
res
[
'bbox'
][
3
],
res
[
'bbox'
][
0
],
res
[
'bbox'
][
3
]]
# 删除res的bbox
del
res
[
'bbox'
]
ocr_res_list
.
extend
(
text_res_list
)
if
len
(
need_remove
)
>
0
:
for
res
in
need_remove
:
del
res
[
'bbox'
]
layout_res
.
remove
(
res
)
return
ocr_res_list
,
filtered_table_res_list
,
single_page_mfdetrec_res
return
ocr_res_list
,
filtered_table_res_list
,
single_page_mfdetrec_res
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
1b35f044
...
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
...
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
return
[[
x0
,
y0
,
x1
,
y1
]]
return
[[
x0
,
y0
,
x1
,
y1
]]
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
,
footnote_blocks
):
page_line_list
=
[]
page_line_list
=
[]
def
add_lines_to_block
(
b
):
def
add_lines_to_block
(
b
):
...
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
...
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
block
[
'real_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
block
[
'real_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
add_lines_to_block
(
block
)
add_lines_to_block
(
block
)
for
block
in
footnote_blocks
:
footnote_block
=
{
'bbox'
:
block
[:
4
]}
add_lines_to_block
(
footnote_block
)
if
len
(
page_line_list
)
>
200
:
# layoutreader最高支持512line
if
len
(
page_line_list
)
>
200
:
# layoutreader最高支持512line
return
None
return
None
...
@@ -779,7 +783,7 @@ def parse_page_core(
...
@@ -779,7 +783,7 @@ def parse_page_core(
# interline_equation_blocks参数不够准,后面切换到interline_equations上
# interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks
=
[]
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
discarded_blocks
,
...
@@ -790,7 +794,7 @@ def parse_page_core(
...
@@ -790,7 +794,7 @@ def parse_page_core(
page_h
,
page_h
,
)
)
else
:
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
discarded_blocks
,
...
@@ -866,7 +870,7 @@ def parse_page_core(
...
@@ -866,7 +870,7 @@ def parse_page_core(
line_height
=
get_line_height
(
fix_blocks
)
line_height
=
get_line_height
(
fix_blocks
)
"""获取所有line并对line排序"""
"""获取所有line并对line排序"""
sorted_bboxes
=
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
)
sorted_bboxes
=
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
,
footnote_blocks
)
"""根据line的中位数算block的序列关系"""
"""根据line的中位数算block的序列关系"""
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
1b35f044
...
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
...
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
def
find_blocks_under_footnote
(
all_bboxes
,
footnote_blocks
):
def
find_blocks_under_footnote
(
all_bboxes
,
footnote_blocks
):
...
...
magic_pdf/utils/office_to_pdf.py
View file @
1b35f044
import
os
import
os
import
subprocess
import
subprocess
import
platform
from
pathlib
import
Path
from
pathlib
import
Path
import
shutil
class
ConvertToPdfError
(
Exception
):
class
ConvertToPdfError
(
Exception
):
...
@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
...
@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
super
().
__init__
(
self
.
msg
)
super
().
__init__
(
self
.
msg
)
# Chinese font list
REQUIRED_CHS_FONTS
=
[
'SimSun'
,
'Microsoft YaHei'
,
'Noto Sans CJK SC'
]
def
check_fonts_installed
():
"""Check if required Chinese fonts are installed."""
system_type
=
platform
.
system
()
if
system_type
==
'Windows'
:
# Windows: check fonts via registry or system font folder
font_dir
=
Path
(
"C:/Windows/Fonts"
)
installed_fonts
=
[
f
.
name
for
f
in
font_dir
.
glob
(
"*.ttf"
)]
if
any
(
font
for
font
in
REQUIRED_CHS_FONTS
if
any
(
font
in
f
for
f
in
installed_fonts
)):
return
True
raise
EnvironmentError
(
f
"Missing Chinese font. Please install at least one of:
{
', '
.
join
(
REQUIRED_CHS_FONTS
)
}
"
)
else
:
# Linux/macOS: use fc-list
try
:
output
=
subprocess
.
check_output
([
'fc-list'
,
':lang=zh'
],
encoding
=
'utf-8'
)
for
font
in
REQUIRED_CHS_FONTS
:
if
font
in
output
:
return
True
raise
EnvironmentError
(
f
"Missing Chinese font. Please install at least one of:
{
', '
.
join
(
REQUIRED_CHS_FONTS
)
}
"
)
except
Exception
as
e
:
raise
EnvironmentError
(
f
"Font detection failed. Please install 'fontconfig' and fonts:
{
str
(
e
)
}
"
)
def
get_soffice_command
():
"""Return the path to LibreOffice's soffice executable depending on the platform."""
system_type
=
platform
.
system
()
# First check if soffice is in PATH
soffice_path
=
shutil
.
which
(
'soffice'
)
if
soffice_path
:
return
soffice_path
if
system_type
==
'Windows'
:
# Check common installation paths
possible_paths
=
[
Path
(
os
.
environ
.
get
(
'PROGRAMFILES'
,
'C:/Program Files'
))
/
'LibreOffice/program/soffice.exe'
,
Path
(
os
.
environ
.
get
(
'PROGRAMFILES(X86)'
,
'C:/Program Files (x86)'
))
/
'LibreOffice/program/soffice.exe'
,
Path
(
'C:/Program Files/LibreOffice/program/soffice.exe'
),
Path
(
'C:/Program Files (x86)/LibreOffice/program/soffice.exe'
)
]
# Check other drives for windows
for
drive
in
[
'C:'
,
'D:'
,
'E:'
,
'F:'
,
'G:'
,
'H:'
]:
possible_paths
.
append
(
Path
(
f
"
{
drive
}
/LibreOffice/program/soffice.exe"
))
for
path
in
possible_paths
:
if
path
.
exists
():
return
str
(
path
)
raise
ConvertToPdfError
(
"LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
"or ensure soffice.exe is in your PATH environment variable."
)
else
:
# For Linux/macOS, provide installation instructions if not found
try
:
# Try to find soffice in standard locations
possible_paths
=
[
'/usr/bin/soffice'
,
'/usr/local/bin/soffice'
,
'/opt/libreoffice/program/soffice'
,
'/Applications/LibreOffice.app/Contents/MacOS/soffice'
]
for
path
in
possible_paths
:
if
os
.
path
.
exists
(
path
):
return
path
raise
ConvertToPdfError
(
"LibreOffice not found. Please install it:
\n
"
" - Ubuntu/Debian: sudo apt-get install libreoffice
\n
"
" - CentOS/RHEL: sudo yum install libreoffice
\n
"
" - macOS: brew install libreoffice or download from https://www.libreoffice.org/
\n
"
" - Or ensure soffice is in your PATH environment variable."
)
except
Exception
as
e
:
raise
ConvertToPdfError
(
f
"Error locating LibreOffice:
{
str
(
e
)
}
"
)
def
convert_file_to_pdf
(
input_path
,
output_dir
):
def
convert_file_to_pdf
(
input_path
,
output_dir
):
"""Convert a single document (ppt, doc, etc.) to PDF."""
if
not
os
.
path
.
isfile
(
input_path
):
if
not
os
.
path
.
isfile
(
input_path
):
raise
FileNotFoundError
(
f
"The input file
{
input_path
}
does not exist."
)
raise
FileNotFoundError
(
f
"The input file
{
input_path
}
does not exist."
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
check_fonts_installed
()
soffice_cmd
=
get_soffice_command
()
cmd
=
[
cmd
=
[
'
soffice
'
,
soffice
_cmd
,
'--headless'
,
'--headless'
,
'--norestore'
,
'--invisible'
,
'--convert-to'
,
'pdf'
,
'--convert-to'
,
'pdf'
,
'--outdir'
,
str
(
output_dir
),
'--outdir'
,
str
(
output_dir
),
str
(
input_path
)
str
(
input_path
)
]
]
process
=
subprocess
.
run
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
process
=
subprocess
.
run
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
if
process
.
returncode
!=
0
:
if
process
.
returncode
!=
0
:
raise
ConvertToPdfError
(
process
.
stderr
.
decode
())
raise
ConvertToPdfError
(
f
"LibreOffice convert failed:
{
process
.
stderr
.
decode
()
}
"
)
tests/unittest/test_table/test_table
master
.py
→
tests/unittest/test_table/test_
rapid
table.py
View file @
1b35f044
...
@@ -2,31 +2,34 @@ import unittest
...
@@ -2,31 +2,34 @@ import unittest
from
PIL
import
Image
from
PIL
import
Image
from
lxml
import
etree
from
lxml
import
etree
from
magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle
import
TableMasterPaddleModel
from
magic_pdf.model.sub_modules.model_init
import
AtomModelSingleton
from
magic_pdf.model.sub_modules.table.rapidtable.rapid_table
import
RapidTableModel
class
TestppTableModel
(
unittest
.
TestCase
):
class
TestppTableModel
(
unittest
.
TestCase
):
def
test_image2html
(
self
):
def
test_image2html
(
self
):
img
=
Image
.
open
(
"tests/unittest/test_table/assets/table.jpg"
)
img
=
Image
.
open
(
"assets/table.jpg"
)
# 修改table模型路径
atom_model_manager
=
AtomModelSingleton
()
config
=
{
"device"
:
"cuda"
,
ocr_engine
=
atom_model_manager
.
get_atom_model
(
"model_dir"
:
"/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"
}
atom_model_name
=
'ocr'
,
table_model
=
TableMasterPaddleModel
(
config
)
ocr_show_log
=
False
,
res
=
table_model
.
img2html
(
img
)
det_db_box_thresh
=
0.5
,
det_db_unclip_ratio
=
1.6
,
lang
=
'ch'
)
table_model
=
RapidTableModel
(
ocr_engine
,
'slanet_plus'
)
html_code
,
table_cell_bboxes
,
logic_points
,
elapse
=
table_model
.
predict
(
img
)
# 验证生成的 HTML 是否符合预期
# 验证生成的 HTML 是否符合预期
parser
=
etree
.
HTMLParser
()
parser
=
etree
.
HTMLParser
()
tree
=
etree
.
fromstring
(
res
,
parser
)
tree
=
etree
.
fromstring
(
html_code
,
parser
)
# 检查 HTML 结构
# 检查 HTML 结构
assert
tree
.
find
(
'.//table'
)
is
not
None
,
"HTML should contain a <table> element"
assert
tree
.
find
(
'.//table'
)
is
not
None
,
"HTML should contain a <table> element"
assert
tree
.
find
(
'.//thead'
)
is
not
None
,
"HTML should contain a <thead> element"
assert
tree
.
find
(
'.//tbody'
)
is
not
None
,
"HTML should contain a <tbody> element"
assert
tree
.
find
(
'.//tr'
)
is
not
None
,
"HTML should contain a <tr> element"
assert
tree
.
find
(
'.//tr'
)
is
not
None
,
"HTML should contain a <tr> element"
assert
tree
.
find
(
'.//td'
)
is
not
None
,
"HTML should contain a <td> element"
assert
tree
.
find
(
'.//td'
)
is
not
None
,
"HTML should contain a <td> element"
# 检查具体的表格内容
# 检查具体的表格内容
headers
=
tree
.
xpath
(
'//thead/tr/td/b'
)
headers
=
tree
.
xpath
(
'//table/tr[1]/td'
)
print
(
headers
)
# Print headers for debugging
assert
len
(
headers
)
==
5
,
"Thead should have 5 columns"
assert
len
(
headers
)
==
5
,
"Thead should have 5 columns"
assert
headers
[
0
].
text
and
headers
[
0
].
text
.
strip
()
==
"Methods"
,
"First header should be 'Methods'"
assert
headers
[
0
].
text
and
headers
[
0
].
text
.
strip
()
==
"Methods"
,
"First header should be 'Methods'"
assert
headers
[
1
].
text
and
headers
[
1
].
text
.
strip
()
==
"R"
,
"Second header should be 'R'"
assert
headers
[
1
].
text
and
headers
[
1
].
text
.
strip
()
==
"R"
,
"Second header should be 'R'"
...
@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase):
...
@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase):
assert
headers
[
4
].
text
and
headers
[
4
].
text
.
strip
()
==
"FPS"
,
"Fifth header should be 'FPS'"
assert
headers
[
4
].
text
and
headers
[
4
].
text
.
strip
()
==
"FPS"
,
"Fifth header should be 'FPS'"
# 检查第一行数据
# 检查第一行数据
first_row
=
tree
.
xpath
(
'//t
body
/tr[
1
]/td'
)
first_row
=
tree
.
xpath
(
'//t
able
/tr[
2
]/td'
)
assert
len
(
first_row
)
==
5
,
"First row should have 5 cells"
assert
len
(
first_row
)
==
5
,
"First row should have 5 cells"
assert
first_row
[
0
].
text
and
first_row
[
0
].
text
.
strip
()
==
"SegLink[26]"
,
"First cell should be 'SegLink[26]'"
assert
first_row
[
0
].
text
and
first_row
[
0
].
text
.
strip
()
==
"SegLink[26]"
,
"First cell should be 'SegLink[26]'"
assert
first_row
[
1
].
text
and
first_row
[
1
].
text
.
strip
()
==
"70.0"
,
"Second cell should be '70.0'"
assert
first_row
[
1
].
text
and
first_row
[
1
].
text
.
strip
()
==
"70.0"
,
"Second cell should be '70.0'"
...
@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase):
...
@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase):
assert
first_row
[
4
].
text
and
first_row
[
4
].
text
.
strip
()
==
"8.9"
,
"Fifth cell should be '8.9'"
assert
first_row
[
4
].
text
and
first_row
[
4
].
text
.
strip
()
==
"8.9"
,
"Fifth cell should be '8.9'"
# 检查倒数第二行数据
# 检查倒数第二行数据
second_last_row
=
tree
.
xpath
(
'//t
body
/tr[position()=last()-1]/td'
)
second_last_row
=
tree
.
xpath
(
'//t
able
/tr[position()=last()-1]/td'
)
assert
len
(
second_last_row
)
==
5
,
"second_last_row should have 5 cells"
assert
len
(
second_last_row
)
==
5
,
"second_last_row should have 5 cells"
assert
second_last_row
[
0
].
text
and
second_last_row
[
assert
second_last_row
[
0
].
text
and
second_last_row
[
0
].
text
.
strip
()
==
"Ours (SynText)"
,
"First cell should be 'Ours (SynText)'"
0
].
text
.
strip
()
==
"Ours (SynText)"
,
"First cell should be 'Ours (SynText)'"
assert
second_last_row
[
1
].
text
and
second_last_row
[
1
].
text
.
strip
()
==
"80.68"
,
"Second cell should be '80.68'"
assert
second_last_row
[
1
].
text
and
second_last_row
[
1
].
text
.
strip
()
==
"80.68"
,
"Second cell should be '80.68'"
assert
second_last_row
[
2
].
text
and
second_last_row
[
2
].
text
.
strip
()
==
"85.40"
,
"Third cell should be '85.40'"
assert
second_last_row
[
2
].
text
and
second_last_row
[
2
].
text
.
strip
()
==
"85.40"
,
"Third cell should be '85.40'"
assert
second_last_row
[
3
].
text
and
second_last_row
[
3
].
text
.
strip
()
==
"82.97"
,
"Fourth cell should be '82.97'"
#
assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
assert
second_last_row
[
3
].
text
and
second_last_row
[
4
].
text
.
strip
()
==
"12.68"
,
"Fifth cell should be '12.68'"
#
assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment