Merge pull request #879 from opendatalab/release-0.9.1

Release 0.9.1

Merge pull request #879 from opendatalab/release-0.9.1
Release 0.9.1
069bcfe6 · Xiaomeng Zhao · GitHub · 8ee1da82 · bff7bd93 · 069bcfe6
Unverified Commit 069bcfe6 authored Nov 06, 2024 by Xiaomeng Zhao Committed by GitHub Nov 06, 2024
20 changed files
--- a/magic_pdf/model/ppTableModel.py
+++ b/magic_pdf/model/ppTableModel.py
+import cv2
 from paddleocr.ppstructure.table.predict_table import TableSystem
 from paddleocr.ppstructure.utility import init_args
 from magic_pdf.libs.Constants import *
@@ -36,12 +37,13 @@ class ppTableModel(object):
        - HTML (str): A string representing the HTML structure with content of the table.
        """
        if isinstance(image, Image.Image):
-            image = np.array(image)
+            image = np.asarray(image)
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        pred_res, _ = self.table_sys(image)
        pred_html = pred_res["html"]
-        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
-                                                                                               "") + "</table></td>\n"
-        return res
+        # res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace(
+        # "</table></body></html>","") + "</table></td>\n"
+        return pred_html

    def parse_args(self, **kwargs):
        parser = init_args()

--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
        first_line = block['lines'][0]
        line_height = first_line['bbox'][3] - first_line['bbox'][1]
        block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
+        block_height = block['bbox_fs'][3] - block['bbox_fs'][1]

        left_close_num = 0
        left_not_close_num = 0
        right_not_close_num = 0
        right_close_num = 0
        lines_text_list = []
-
+        center_close_num = 0
+        external_sides_not_close_num = 0
        multiple_para_flag = False
        last_line = block['lines'][-1]
+
        # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 （第一行可能可以右边不顶格）
        if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
                # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):

        for line in block['lines']:

+            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
+            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
+            if (
+                    line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
+                    block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
+            ):
+                external_sides_not_close_num += 1
+            if abs(line_mid_x - block_mid_x) < line_height / 2:
+                center_close_num += 1
+
            line_text = ""

            for span in line['spans']:
@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
                right_close_num += 1
            else:
                # 右侧不顶格情况下是否有一段距离，拍脑袋用0.3block宽度做阈值
-                closed_area = 0.3 * block_weight
+                closed_area = 0.26 * block_weight
                # closed_area = 5 * line_height
                if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
                    right_not_close_num += 1
@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
                line_num_flag = True

        # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边，且符合数字规则极为index
-        if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
+        if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
                and line_num_flag
        ):
            for line in block['lines']:
                line[ListLineTag.IS_LIST_START_LINE] = True
            return BlockType.Index

+        # 全部line都居中的特殊list识别，每行都需要换行，特征是多行，且大多数行都前后not_close,每line中点x坐标接近
+        # 补充条件block的长宽比有要求
+        elif (
+                external_sides_not_close_num >= 2 and
+                center_close_num == len(block['lines']) and
+                external_sides_not_close_num / len(block['lines']) >= 0.5 and
+                block_height / block_weight > 0.4
+        ):
+            for line in block['lines']:
+                line[ListLineTag.IS_LIST_START_LINE] = True
+            return BlockType.List
+
        elif left_close_num >= 2 and (
                right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
            # 处理一种特殊的没有缩进的list，所有行都贴左边，通过右边的空隙判断是否是item尾
-            if left_close_num / len(block['lines']) > 0.9:
+            if left_close_num / len(block['lines']) > 0.8:
                # 这种是每个item只有一行，且左边都贴边的短item list
                if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
                    for line in block['lines']:
@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
                        if lines_text_list[i][-1] in LIST_END_FLAG:
                            line[ListLineTag.IS_LIST_END_LINE] = True
                            if i + 1 < len(block['lines']):
-                                block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
+                                block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
                # line item基本没有结束标识符，而且也没有缩进，按右侧空隙判断哪些是item end
                else:
                    line_start_flag = False
@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
                        if line_start_flag:
                            line[ListLineTag.IS_LIST_START_LINE] = True
                            line_start_flag = False
-                        elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                        # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                        if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
                            line[ListLineTag.IS_LIST_END_LINE] = True
                            line_start_flag = True
            # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头，end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致

--- a/next_docs/en/.readthedocs.yaml
+++ b/next_docs/en/.readthedocs.yaml
@@ -10,7 +10,7 @@ formats:

 python:
  install:
-    - requirements: docs/requirements.txt
+    - requirements: next_docs/requirements.txt

 sphinx:
-  configuration: docs/en/conf.py
+  configuration: next_docs/en/conf.py
--- a/next_docs/en/_static/image/MinerU-logo-hq.png
+++ b/next_docs/en/_static/image/MinerU-logo-hq.png
--- a/next_docs/en/_static/image/MinerU-logo.png
+++ b/next_docs/en/_static/image/MinerU-logo.png
--- a/next_docs/en/_static/image/datalab_logo.png
+++ b/next_docs/en/_static/image/datalab_logo.png
--- a/next_docs/en/_static/image/flowchart_en.png
+++ b/next_docs/en/_static/image/flowchart_en.png
--- a/next_docs/en/_static/image/flowchart_zh_cn.png
+++ b/next_docs/en/_static/image/flowchart_zh_cn.png
--- a/next_docs/en/_static/image/layout_example.png
+++ b/next_docs/en/_static/image/layout_example.png
--- a/next_docs/en/_static/image/poly.png
+++ b/next_docs/en/_static/image/poly.png
--- a/next_docs/en/_static/image/project_panorama_en.png
+++ b/next_docs/en/_static/image/project_panorama_en.png
--- a/next_docs/en/_static/image/project_panorama_zh_cn.png
+++ b/next_docs/en/_static/image/project_panorama_zh_cn.png
--- a/next_docs/en/_static/image/spans_example.png
+++ b/next_docs/en/_static/image/spans_example.png
--- a/next_docs/en/_static/image/web_demo_1.png
+++ b/next_docs/en/_static/image/web_demo_1.png
--- a/next_docs/en/additional_notes/changelog.rst
+++ b/next_docs/en/additional_notes/changelog.rst
+
+
+Changelog
+=========
+
+-  2024/09/27: Version 0.8.1 released, fixing some bugs and providing a
+   `localized deployment version <projects/web_demo/README.md>`__ of the
+   `online
+   demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
+   the `front-end interface <projects/web/README.md>`__.
+-  2024/09/09: Version 0.8.0 released, supporting fast deployment with
+   Dockerfile, and launching demos on Huggingface and Modelscope.
+-  2024/08/30: Version 0.7.1 released, added paddle tablemaster table
+   recognition option
+-  2024/08/09: Version 0.7.0b1 released, simplified installation
+   process, added table recognition functionality
+-  2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
+   issues and installation documentation
+-  2024/07/05: Initial open-source release
+
+
+.. warning::
+
+   fix ``localized deployment version`` and ``front-end interface``
+
+
--- a/next_docs/en/additional_notes/faq.rst
+++ b/next_docs/en/additional_notes/faq.rst
+FAQ
+==========================
+
+1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+On macOS, the default shell has switched from Bash to Z shell (zsh), which
+treats square brackets as a glob pattern. Because ``magic-pdf[full]``
+matches no file, zsh reports the “no matches found” error. You can disable
+the no-match check in the command line and then run the installation command again.
+
+.. code:: bash
+
+   setopt no_nomatch
+   pip install magic-pdf[full]
+
+2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This might be due to an incomplete download of the model file. You can
+try re-downloading the model file and then try again. Reference:
+https://github.com/opendatalab/MinerU/issues/143
+
+3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The path for the model files is configured in “magic-pdf.json”, for
+example:
+
+.. code:: json
+
+   {
+     "models-dir": "/tmp/models"
+   }
+
+This path is an absolute path, not a relative path. You can obtain the
+absolute path by running the “pwd” command inside the models directory.
+Reference:
+https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
+
+4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
+install the ``libgl`` library with the following command to resolve the
+issue:
+
+.. code:: bash
+
+   sudo apt-get install libgl1-mesa-glx
+
+Reference: https://github.com/opendatalab/MinerU/issues/388
+
+5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You need to uninstall the module and reinstall it:
+
+.. code:: bash
+
+   pip uninstall fairscale
+   pip install fairscale
+
+Reference: https://github.com/opendatalab/MinerU/issues/411
+
+6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+CUDA 11 has poor compatibility with newer graphics cards, so the CUDA
+version used by Paddle needs to be upgraded.
+
+.. code:: bash
+
+   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
+
+Reference: https://github.com/opendatalab/MinerU/issues/558
--- a/next_docs/en/additional_notes/glossary.rst
+++ b/next_docs/en/additional_notes/glossary.rst
+
+
+Glossary 
+===========
+
+1. jsonl 
+    TODO: add description
+
+2. magic-pdf.json
+    TODO: add description
+
--- a/next_docs/en/additional_notes/known_issues.rst
+++ b/next_docs/en/additional_notes/known_issues.rst
+Known Issues
+============
+
+-  Reading order is based on the model’s sorting of text distribution in
+   space, which may become disordered under extremely complex layouts.
+-  Vertical text is not supported.
+-  Tables of contents and lists are recognized through rules; a few
+   uncommon list formats may not be identified.
+-  Only one level of headings is supported; hierarchical heading levels
+   are currently not supported.
+-  Code blocks are not yet supported in the layout model.
+-  Comic books, art books, elementary school textbooks, and exercise
+   books are not well-parsed yet.
+-  Enabling OCR may produce better results in PDFs with a high density
+   of formulas.
+-  If you are processing PDFs with a large number of formulas, it is
+   strongly recommended to enable the OCR function. When using PyMuPDF
+   to extract text, overlapping text lines can occur, leading to
+   inaccurate formula insertion positions.
--- a/next_docs/en/api.rst
+++ b/next_docs/en/api.rst
-Data Api
------------------

 .. toctree::
   :maxdepth: 2

-   api/dataset.rst
-   api/data_reader_writer.rst
-   api/read_api.rst
+   api/dataset
+   api/data_reader_writer
+   api/read_api
+   api/schemas
+   api/io
+   api/classes
\ No newline at end of file
--- a/next_docs/en/api/classes.rst
+++ b/next_docs/en/api/classes.rst
+Class Hierarchy
+===============
+
+.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
+   :parts: 2
+
+
+.. inheritance-diagram:: magic_pdf.data.dataset
+   :parts: 2
+
+
+.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
+   :parts: 2
+