Unverified Commit 1b35f044 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2252 from opendatalab/release-1.3.4

Release 1.3.4
parents 8f3c1780 0222293f
...@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte ...@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div> </div>
# Changelog # Changelog
- 2025/04/16 1.3.4 Released
- Slightly improved the speed of OCR detection by removing some unused blocks.
- Fixed page-level sorting errors caused by footnotes in certain cases.
- 2025/04/12 1.3.2 released - 2025/04/12 1.3.2 released
- Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems. - Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
- Optimized memory usage during batch inference. - Optimized memory usage during batch inference.
......
...@@ -47,6 +47,9 @@ ...@@ -47,6 +47,9 @@
</div> </div>
# 更新记录 # 更新记录
- 2025/04/16 1.3.4 发布
- 通过移除一些无用的块,小幅提升了ocr-det的速度
- 修复部分情况下由footnote导致的页面内排序错误
- 2025/04/12 1.3.2 发布 - 2025/04/12 1.3.2 发布
- 修复了windows系统下,在python3.13环境安装时一些依赖包版本不兼容的问题 - 修复了windows系统下,在python3.13环境安装时一些依赖包版本不兼容的问题
- 优化批量推理时的内存占用 - 优化批量推理时的内存占用
......
...@@ -18,7 +18,17 @@ RUN apt-get update && \ ...@@ -18,7 +18,17 @@ RUN apt-get update && \
wget \ wget \
git \ git \
libgl1 \ libgl1 \
libreoffice \
fonts-noto-cjk \
fonts-wqy-zenhei \
fonts-wqy-microhei \
ttf-mscorefonts-installer \
fontconfig \
libglib2.0-0 \ libglib2.0-0 \
libxrender1 \
libsm6 \
libxext6 \
poppler-utils \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3 # Set Python 3.10 as the default python3
......
...@@ -18,7 +18,17 @@ RUN apt-get update && \ ...@@ -18,7 +18,17 @@ RUN apt-get update && \
wget \ wget \
git \ git \
libgl1 \ libgl1 \
libreoffice \
fonts-noto-cjk \
fonts-wqy-zenhei \
fonts-wqy-microhei \
ttf-mscorefonts-installer \
fontconfig \
libglib2.0-0 \ libglib2.0-0 \
libxrender1 \
libsm6 \
libxext6 \
poppler-utils \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3 # Set Python 3.10 as the default python3
......
...@@ -2,6 +2,8 @@ import time ...@@ -2,6 +2,8 @@ import time
import torch import torch
from loguru import logger from loguru import logger
import numpy as np import numpy as np
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.clean_memory import clean_memory from magic_pdf.libs.clean_memory import clean_memory
...@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0 ...@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx] return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx]
def remove_overlaps_min_blocks(res_list):
# 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
# 删除重叠blocks中较小的那些
need_remove = []
for res1 in res_list:
for res2 in res_list:
if res1 != res2:
overlap_box = get_minbox_if_overlap_by_ratio(
res1['bbox'], res2['bbox'], 0.8
)
if overlap_box is not None:
res_to_remove = next(
(res for res in res_list if res['bbox'] == overlap_box),
None,
)
if (
res_to_remove is not None
and res_to_remove not in need_remove
):
large_res = res1 if res1 != res_to_remove else res2
x1, y1, x2, y2 = large_res['bbox']
sx1, sy1, sx2, sy2 = res_to_remove['bbox']
x1 = min(x1, sx1)
y1 = min(y1, sy1)
x2 = max(x2, sx2)
y2 = max(y2, sy2)
large_res['bbox'] = [x1, y1, x2, y2]
need_remove.append(res_to_remove)
if len(need_remove) > 0:
for res in need_remove:
res_list.remove(res)
return res_list, need_remove
def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8): def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8):
"""Extract OCR, table and other regions from layout results.""" """Extract OCR, table and other regions from layout results."""
ocr_res_list = [] ocr_res_list = []
text_res_list = []
table_res_list = [] table_res_list = []
table_indices = [] table_indices = []
single_page_mfdetrec_res = [] single_page_mfdetrec_res = []
...@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol ...@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
"bbox": [int(res['poly'][0]), int(res['poly'][1]), "bbox": [int(res['poly'][0]), int(res['poly'][1]),
int(res['poly'][4]), int(res['poly'][5])], int(res['poly'][4]), int(res['poly'][5])],
}) })
elif category_id in [0, 1, 2, 4, 6, 7]: # OCR regions elif category_id in [0, 2, 4, 6, 7]: # OCR regions
ocr_res_list.append(res) ocr_res_list.append(res)
elif category_id == 5: # Table regions elif category_id == 5: # Table regions
table_res_list.append(res) table_res_list.append(res)
table_indices.append(i) table_indices.append(i)
elif category_id in [1]: # Text regions
res['bbox'] = [int(res['poly'][0]), int(res['poly'][1]), int(res['poly'][4]), int(res['poly'][5])]
text_res_list.append(res)
# Process tables: merge high IoU tables first, then filter nested tables # Process tables: merge high IoU tables first, then filter nested tables
table_res_list, table_indices = merge_high_iou_tables( table_res_list, table_indices = merge_high_iou_tables(
...@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol ...@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
for idx in sorted(to_remove, reverse=True): for idx in sorted(to_remove, reverse=True):
del layout_res[idx] del layout_res[idx]
# Remove overlaps in OCR and text regions
text_res_list, need_remove = remove_overlaps_min_blocks(text_res_list)
for res in text_res_list:
# 将res的poly使用bbox重构
res['poly'] = [res['bbox'][0], res['bbox'][1], res['bbox'][2], res['bbox'][1],
res['bbox'][2], res['bbox'][3], res['bbox'][0], res['bbox'][3]]
# 删除res的bbox
del res['bbox']
ocr_res_list.extend(text_res_list)
if len(need_remove) > 0:
for res in need_remove:
del res['bbox']
layout_res.remove(res)
return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
......
...@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h): ...@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
return [[x0, y0, x1, y1]] return [[x0, y0, x1, y1]]
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
page_line_list = [] page_line_list = []
def add_lines_to_block(b): def add_lines_to_block(b):
...@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): ...@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
block['real_lines'] = copy.deepcopy(block['lines']) block['real_lines'] = copy.deepcopy(block['lines'])
add_lines_to_block(block) add_lines_to_block(block)
for block in footnote_blocks:
footnote_block = {'bbox': block[:4]}
add_lines_to_block(footnote_block)
if len(page_line_list) > 200: # layoutreader最高支持512line if len(page_line_list) > 200: # layoutreader最高支持512line
return None return None
...@@ -779,7 +783,7 @@ def parse_page_core( ...@@ -779,7 +783,7 @@ def parse_page_core(
# interline_equation_blocks参数不够准,后面切换到interline_equations上 # interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks = [] interline_equation_blocks = []
if len(interline_equation_blocks) > 0: if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, discarded_blocks,
...@@ -790,7 +794,7 @@ def parse_page_core( ...@@ -790,7 +794,7 @@ def parse_page_core(
page_h, page_h,
) )
else: else:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, discarded_blocks,
...@@ -866,7 +870,7 @@ def parse_page_core( ...@@ -866,7 +870,7 @@ def parse_page_core(
line_height = get_line_height(fix_blocks) line_height = get_line_height(fix_blocks)
"""获取所有line并对line排序""" """获取所有line并对line排序"""
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height) sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)
"""根据line的中位数算block的序列关系""" """根据line的中位数算block的序列关系"""
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes) fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
......
...@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2( ...@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
"""将剩余的bbox做分离处理,防止后面分layout时出错""" """将剩余的bbox做分离处理,防止后面分layout时出错"""
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes.sort(key=lambda x: x[0]+x[1]) all_bboxes.sort(key=lambda x: x[0]+x[1])
return all_bboxes, all_discarded_blocks return all_bboxes, all_discarded_blocks, footnote_blocks
def find_blocks_under_footnote(all_bboxes, footnote_blocks): def find_blocks_under_footnote(all_bboxes, footnote_blocks):
......
import os import os
import subprocess import subprocess
import platform
from pathlib import Path from pathlib import Path
import shutil
class ConvertToPdfError(Exception): class ConvertToPdfError(Exception):
...@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception): ...@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
super().__init__(self.msg) super().__init__(self.msg)
# Chinese font list
REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']
def check_fonts_installed():
"""Check if required Chinese fonts are installed."""
system_type = platform.system()
if system_type == 'Windows':
# Windows: check fonts via registry or system font folder
font_dir = Path("C:/Windows/Fonts")
installed_fonts = [f.name for f in font_dir.glob("*.ttf")]
if any(font for font in REQUIRED_CHS_FONTS if any(font in f for f in installed_fonts)):
return True
raise EnvironmentError(
f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
)
else:
# Linux/macOS: use fc-list
try:
output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
for font in REQUIRED_CHS_FONTS:
if font in output:
return True
raise EnvironmentError(
f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
)
except Exception as e:
raise EnvironmentError(f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}")
def get_soffice_command():
"""Return the path to LibreOffice's soffice executable depending on the platform."""
system_type = platform.system()
# First check if soffice is in PATH
soffice_path = shutil.which('soffice')
if soffice_path:
return soffice_path
if system_type == 'Windows':
# Check common installation paths
possible_paths = [
Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
Path('C:/Program Files/LibreOffice/program/soffice.exe'),
Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe')
]
# Check other drives for windows
for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']:
possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe"))
for path in possible_paths:
if path.exists():
return str(path)
raise ConvertToPdfError(
"LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
"or ensure soffice.exe is in your PATH environment variable."
)
else:
# For Linux/macOS, provide installation instructions if not found
try:
# Try to find soffice in standard locations
possible_paths = [
'/usr/bin/soffice',
'/usr/local/bin/soffice',
'/opt/libreoffice/program/soffice',
'/Applications/LibreOffice.app/Contents/MacOS/soffice'
]
for path in possible_paths:
if os.path.exists(path):
return path
raise ConvertToPdfError(
"LibreOffice not found. Please install it:\n"
" - Ubuntu/Debian: sudo apt-get install libreoffice\n"
" - CentOS/RHEL: sudo yum install libreoffice\n"
" - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
" - Or ensure soffice is in your PATH environment variable."
)
except Exception as e:
raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}")
def convert_file_to_pdf(input_path, output_dir): def convert_file_to_pdf(input_path, output_dir):
"""Convert a single document (ppt, doc, etc.) to PDF."""
if not os.path.isfile(input_path): if not os.path.isfile(input_path):
raise FileNotFoundError(f"The input file {input_path} does not exist.") raise FileNotFoundError(f"The input file {input_path} does not exist.")
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
check_fonts_installed()
soffice_cmd = get_soffice_command()
cmd = [ cmd = [
'soffice', soffice_cmd,
'--headless', '--headless',
'--norestore',
'--invisible',
'--convert-to', 'pdf', '--convert-to', 'pdf',
'--outdir', str(output_dir), '--outdir', str(output_dir),
str(input_path) str(input_path)
] ]
process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if process.returncode != 0: if process.returncode != 0:
raise ConvertToPdfError(process.stderr.decode()) raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}")
...@@ -2,31 +2,34 @@ import unittest ...@@ -2,31 +2,34 @@ import unittest
from PIL import Image from PIL import Image
from lxml import etree from lxml import etree
from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
class TestppTableModel(unittest.TestCase): class TestppTableModel(unittest.TestCase):
def test_image2html(self): def test_image2html(self):
img = Image.open("tests/unittest/test_table/assets/table.jpg") img = Image.open("assets/table.jpg")
# 修改table模型路径 atom_model_manager = AtomModelSingleton()
config = {"device": "cuda", ocr_engine = atom_model_manager.get_atom_model(
"model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"} atom_model_name='ocr',
table_model = TableMasterPaddleModel(config) ocr_show_log=False,
res = table_model.img2html(img) det_db_box_thresh=0.5,
det_db_unclip_ratio=1.6,
lang='ch'
)
table_model = RapidTableModel(ocr_engine, 'slanet_plus')
html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(img)
# 验证生成的 HTML 是否符合预期 # 验证生成的 HTML 是否符合预期
parser = etree.HTMLParser() parser = etree.HTMLParser()
tree = etree.fromstring(res, parser) tree = etree.fromstring(html_code, parser)
# 检查 HTML 结构 # 检查 HTML 结构
assert tree.find('.//table') is not None, "HTML should contain a <table> element" assert tree.find('.//table') is not None, "HTML should contain a <table> element"
assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
assert tree.find('.//tr') is not None, "HTML should contain a <tr> element" assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
assert tree.find('.//td') is not None, "HTML should contain a <td> element" assert tree.find('.//td') is not None, "HTML should contain a <td> element"
# 检查具体的表格内容 # 检查具体的表格内容
headers = tree.xpath('//thead/tr/td/b') headers = tree.xpath('//table/tr[1]/td')
print(headers) # Print headers for debugging
assert len(headers) == 5, "Thead should have 5 columns" assert len(headers) == 5, "Thead should have 5 columns"
assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'" assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'" assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
...@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase): ...@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase):
assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'" assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"
# 检查第一行数据 # 检查第一行数据
first_row = tree.xpath('//tbody/tr[1]/td') first_row = tree.xpath('//table/tr[2]/td')
assert len(first_row) == 5, "First row should have 5 cells" assert len(first_row) == 5, "First row should have 5 cells"
assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'" assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'" assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
...@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase): ...@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase):
assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'" assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"
# 检查倒数第二行数据 # 检查倒数第二行数据
second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td') second_last_row = tree.xpath('//table/tr[position()=last()-1]/td')
assert len(second_last_row) == 5, "second_last_row should have 5 cells" assert len(second_last_row) == 5, "second_last_row should have 5 cells"
assert second_last_row[0].text and second_last_row[ assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'" assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'" assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'" # assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'" # assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment