Unverified Commit 069bcfe6 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #879 from opendatalab/release-0.9.1

Release 0.9.1
parents 8ee1da82 bff7bd93
import cv2
from paddleocr.ppstructure.table.predict_table import TableSystem from paddleocr.ppstructure.table.predict_table import TableSystem
from paddleocr.ppstructure.utility import init_args from paddleocr.ppstructure.utility import init_args
from magic_pdf.libs.Constants import * from magic_pdf.libs.Constants import *
...@@ -36,12 +37,13 @@ class ppTableModel(object): ...@@ -36,12 +37,13 @@ class ppTableModel(object):
- HTML (str): A string representing the HTML structure with content of the table. - HTML (str): A string representing the HTML structure with content of the table.
""" """
if isinstance(image, Image.Image): if isinstance(image, Image.Image):
image = np.array(image) image = np.asarray(image)
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
pred_res, _ = self.table_sys(image) pred_res, _ = self.table_sys(image)
pred_html = pred_res["html"] pred_html = pred_res["html"]
res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>", # res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
"") + "</table></td>\n" # "</table></body></html>","") + "</table></td>\n"
return res return pred_html
def parse_args(self, **kwargs): def parse_args(self, **kwargs):
parser = init_args() parser = init_args()
......
...@@ -63,15 +63,18 @@ def __is_list_or_index_block(block): ...@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
first_line = block['lines'][0] first_line = block['lines'][0]
line_height = first_line['bbox'][3] - first_line['bbox'][1] line_height = first_line['bbox'][3] - first_line['bbox'][1]
block_weight = block['bbox_fs'][2] - block['bbox_fs'][0] block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
left_close_num = 0 left_close_num = 0
left_not_close_num = 0 left_not_close_num = 0
right_not_close_num = 0 right_not_close_num = 0
right_close_num = 0 right_close_num = 0
lines_text_list = [] lines_text_list = []
center_close_num = 0
external_sides_not_close_num = 0
multiple_para_flag = False multiple_para_flag = False
last_line = block['lines'][-1] last_line = block['lines'][-1]
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格) # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
# block['bbox_fs'][2] - first_line['bbox'][2] < line_height and # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
...@@ -82,6 +85,16 @@ def __is_list_or_index_block(block): ...@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):
for line in block['lines']: for line in block['lines']:
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if (
line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
):
external_sides_not_close_num += 1
if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1
line_text = "" line_text = ""
for span in line['spans']: for span in line['spans']:
...@@ -103,7 +116,7 @@ def __is_list_or_index_block(block): ...@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
right_close_num += 1 right_close_num += 1
else: else:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
closed_area = 0.3 * block_weight closed_area = 0.26 * block_weight
# closed_area = 5 * line_height # closed_area = 5 * line_height
if block['bbox_fs'][2] - line['bbox'][2] > closed_area: if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
right_not_close_num += 1 right_not_close_num += 1
...@@ -132,17 +145,29 @@ def __is_list_or_index_block(block): ...@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
line_num_flag = True line_num_flag = True
# 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8) if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
and line_num_flag and line_num_flag
): ):
for line in block['lines']: for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.Index return BlockType.Index
# 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
# 补充条件block的长宽比有要求
elif (
external_sides_not_close_num >= 2 and
center_close_num == len(block['lines']) and
external_sides_not_close_num / len(block['lines']) >= 0.5 and
block_height / block_weight > 0.4
):
for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.List
elif left_close_num >= 2 and ( elif left_close_num >= 2 and (
right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag: right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾 # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
if left_close_num / len(block['lines']) > 0.9: if left_close_num / len(block['lines']) > 0.8:
# 这种是每个item只有一行,且左边都贴边的短item list # 这种是每个item只有一行,且左边都贴边的短item list
if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5: if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
for line in block['lines']: for line in block['lines']:
...@@ -154,7 +179,7 @@ def __is_list_or_index_block(block): ...@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
if lines_text_list[i][-1] in LIST_END_FLAG: if lines_text_list[i][-1] in LIST_END_FLAG:
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
if i + 1 < len(block['lines']): if i + 1 < len(block['lines']):
block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
# line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
else: else:
line_start_flag = False line_start_flag = False
...@@ -162,7 +187,8 @@ def __is_list_or_index_block(block): ...@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
if line_start_flag: if line_start_flag:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
line_start_flag = False line_start_flag = False
elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height: # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
line_start_flag = True line_start_flag = True
# 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致 # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致
......
...@@ -10,7 +10,7 @@ formats: ...@@ -10,7 +10,7 @@ formats:
python: python:
install: install:
- requirements: docs/requirements.txt - requirements: next_docs/requirements.txt
sphinx: sphinx:
configuration: docs/en/conf.py configuration: next_docs/en/conf.py
Changelog
=========
- 2024/09/27: Version 0.8.1 released, fixing some bugs and providing a
`localized deployment version <projects/web_demo/README.md>`__ of the
`online
demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
the `front-end interface <projects/web/README.md>`__.
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with
Dockerfile, and launching demos on Huggingface and Modelscope.
- 2024/08/30: Version 0.7.1 released, added the Paddle TableMaster table
recognition option
- 2024/08/09: Version 0.7.0b1 released, simplified installation
process, added table recognition functionality
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
issues and installation documentation
- 2024/07/05: Initial open-source release
.. warning::
fix ``localized deployment version`` and ``front-end interface``
FAQ
==========================
1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
On macOS, the default shell has switched from Bash to Z shell (zsh), which
treats square brackets as glob (filename-matching) patterns. Because no
file matches ``magic-pdf[full]``, zsh reports the “no matches found” error.
You can disable this check for unmatched patterns (or quote the package
name) and then run the installation command again.
.. code:: bash
setopt no_nomatch
pip install magic-pdf[full]
2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This might be due to an incomplete download of the model file. You can
try re-downloading the model file and then try again. Reference:
https://github.com/opendatalab/MinerU/issues/143
3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The path for the model files is configured in “magic-pdf.json”, for
example:
.. code:: json
{
"models-dir": "/tmp/models"
}
This path is an absolute path, not a relative path. You can obtain the
absolute path by running the “pwd” command inside the models directory.
Reference:
https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
install the ``libgl`` library with the following command to resolve the
issue:
.. code:: bash
sudo apt-get install libgl1-mesa-glx
Reference: https://github.com/opendatalab/MinerU/issues/388
5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You need to uninstall the module and reinstall it:
.. code:: bash
pip uninstall fairscale
pip install fairscale
Reference: https://github.com/opendatalab/MinerU/issues/411
6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
CUDA 11 has poor compatibility with newer graphics cards, so the
CUDA version used by Paddle needs to be upgraded:
.. code:: bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
Reference: https://github.com/opendatalab/MinerU/issues/558
Glossary
===========
1. jsonl
TODO: add description
2. magic-pdf.json
TODO: add description
Known Issues
============
- Reading order is based on the model’s sorting of text distribution in
space, which may become disordered under extremely complex layouts.
- Vertical text is not supported.
- Tables of contents and lists are recognized through rules; a few
uncommon list formats may not be identified.
- Only one level of headings is supported; hierarchical heading levels
are currently not supported.
- Code blocks are not yet supported in the layout model.
- Comic books, art books, elementary school textbooks, and exercise
books are not yet parsed well.
- Enabling OCR may produce better results in PDFs with a high density
of formulas.
- If you are processing PDFs with a large number of formulas, it is
strongly recommended to enable the OCR function. When using PyMuPDF
to extract text, overlapping text lines can occur, leading to
inaccurate formula insertion positions.
Data Api
------------------
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
api/dataset.rst api/dataset
api/data_reader_writer.rst api/data_reader_writer
api/read_api.rst api/read_api
api/schemas
api/io
api/classes
\ No newline at end of file
Class Hierarchy
===============
.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
:parts: 2
.. inheritance-diagram:: magic_pdf.data.dataset
:parts: 2
.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
:parts: 2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment