Merge pull request #976 from opendatalab/dev

Dev to 0.9.3

Merge pull request #976 from opendatalab/dev
Dev to 0.9.3
71a9a955 · Xiaomeng Zhao · GitHub · 6083e109 · 8061dfce · 71a9a955
Unverified Commit 71a9a955 authored Nov 16, 2024 by Xiaomeng Zhao Committed by GitHub Nov 16, 2024
12 changed files
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 - Preserve the structure of the original document, including headings, paragraphs, lists, etc.
 - Extract images, image descriptions, tables, table titles, and footnotes.
 - Automatically recognize and convert formulas in the document to LaTeX format.
- Automatically recognize and convert tables in the document to LaTeX or HTML format.
+- Automatically recognize and convert tables in the document to HTML format.
 - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
 - OCR supports detection and recognition of 84 languages.
 - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
@@ -185,17 +185,11 @@ There are three different ways to experience MinerU:
    </tr>
    <tr>
        <td rowspan="2">GPU Hardware Support List</td>
-        <td colspan="2">Minimum Requirement 8G+ VRAM</td>
-        <td colspan="2">3060ti/3070/4060<br>
-        8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
+        <td colspan="2">GPU VRAM 8GB or more</td>
+        <td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
+        8GB or more of VRAM can enable all acceleration features</td>
        <td rowspan="2">None</td>
    </tr>
-    <tr>
-        <td colspan="2">Recommended Configuration 10G+ VRAM</td>
-        <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
-        10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
-        </td>
-    </tr>
 </table>

 ### Online Demo
@@ -247,7 +241,7 @@ You can modify certain configurations in this file to enable or disable features
        "enable": true  // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
    },
    "table-config": {
-        "model": "rapid_table",  // When using structEqTable, please change to "struct_eqtable".
+        "model": "rapid_table",  // Defaults to "rapid_table"; can be switched to "tablemaster" or "struct_eqtable".
        "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
        "max_time": 400
    }

--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 - 保留原文档的结构，包括标题、段落、列表等
 - 提取图像、图片描述、表格、表格标题及脚注
 - 自动识别并转换文档中的公式为LaTeX格式
- 自动识别并转换文档中的表格为LaTeX或HTML格式
+- 自动识别并转换文档中的表格为HTML格式
 - 自动检测扫描版PDF和乱码PDF，并启用OCR功能
 - OCR支持84种语言的检测与识别
 - 支持多种输出格式，如多模态与NLP的Markdown、按阅读顺序排序的JSON、含有丰富信息的中间格式等
@@ -186,17 +186,13 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
    </tr>
    <tr>
        <td rowspan="2">GPU硬件支持列表</td>
-        <td colspan="2">最低要求 8G+显存</td>
-        <td colspan="2">3060ti/3070/4060<br>
-        8G显存可开启全部加速功能(表格仅限rapid_table)</td>
+        <td colspan="2">显存8G以上</td>
+        <td colspan="2">
+        2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
+        8G显存及以上可开启全部加速功能</td>
        <td rowspan="2">None</td>
    </tr>
-    <tr>
-        <td colspan="2">推荐配置 10G+显存</td>
-        <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
-        10G显存及以上可开启全部加速功能<br>
-        </td>
-    </tr>
+
 </table>

 ### 在线体验
@@ -251,7 +247,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
        "enable": true  // 公式识别功能默认是开启的，如果需要关闭请修改此处的值为"false"
    },
    "table-config": {
-        "model": "rapid_table",  // 使用structEqTable请修改为"struct_eqtable"
+        "model": "rapid_table",  // 默认使用"rapid_table"，可以切换为"tablemaster"或"struct_eqtable"
        "enable": false, // 表格识别功能默认是关闭的，如果需要开启请修改此处的值为"true"
        "max_time": 400
    }

--- a/projects/web_demo/web_demo/api/analysis/pdf_ext.py
+++ b/projects/web_demo/web_demo/api/analysis/pdf_ext.py
 import json
 import re
+import os
+import shutil
 import traceback
 from pathlib import Path
 from flask import current_app, url_for
@@ -7,7 +9,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.pipe.UNIPipe import UNIPipe
 import magic_pdf.model as model_config
 from magic_pdf.libs.json_compressor import JsonCompressor
-from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para_and_pagination
+from common.mk_markdown.mk_markdown import ocr_mk_mm_markdown_with_para_and_pagination
 from .ext import find_file
 from ..extentions import app, db
 from .models import AnalysisPdf, AnalysisTask
@@ -17,7 +19,7 @@ from loguru import logger
 model_config.__use_inside_model__ = True


-def analysis_pdf(image_dir, pdf_bytes, is_ocr=False):
+def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
    try:
        model_json = []  # model_json传空list使用内置模型解析
        logger.info(f"is_ocr: {is_ocr}")
@@ -40,7 +42,7 @@ def analysis_pdf(image_dir, pdf_bytes, is_ocr=False):
        pipe.pipe_parse()
        pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
        pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_dir),
+        md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
                                ensure_ascii=False)
        bbox_info = get_bbox_info(pdf_info_list)
        return md_content, bbox_info
@@ -77,20 +79,22 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
        logger.info(f"image_dir: {image_dir}")
        if not Path(image_dir).exists():
            Path(image_dir).mkdir(parents=True, exist_ok=True)
+        else:
+            # 清空image_dir，避免同文件多次解析图片积累
+            shutil.rmtree(image_dir, ignore_errors=True)
+            os.makedirs(image_dir, exist_ok=True)
+
+        # 获取文件内容
        with open(pdf_path, 'rb') as file:
            pdf_bytes = file.read()
-        md_content, bbox_info = analysis_pdf(image_dir, pdf_bytes, is_ocr)
-        img_list = Path(image_dir).glob('*') if Path(image_dir).exists() else []
+        # 生成图片链接
+        with app.app_context():
+            image_url_prefix = f"http://{current_app.config['SERVER_NAME']}{current_app.config['FILE_API']}&pdf={Path(pdf_path).name}&filename="
+        # 解析文件
+        md_content, bbox_info = analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr)

+        # ############ markdown #############
        pdf_name = Path(pdf_path).name
-        with app.app_context():
-            for img in img_list:
-                img_name = Path(img).name
-                regex = re.compile(fr'.*\((.*?{img_name})')
-                regex_result = regex.search(md_content)
-                if regex_result:
-                    img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False)
-                    md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}")

        full_md_content = ""
        for item in json.loads(md_content):

--- a/projects/web_demo/web_demo/app.py
+++ b/projects/web_demo/web_demo/app.py
@@ -42,7 +42,7 @@ if database:
 ip_address = get_local_ip()
 port = config.get("PORT", 5559)
 # 配置 SERVER_NAME
-config['SERVER_NAME'] = f'{ip_address}:5559'
+config['SERVER_NAME'] = f'{ip_address}:{port}'
 # 配置 APPLICATION_ROOT
 config['APPLICATION_ROOT'] = '/'
 # 配置 PREFERRED_URL_SCHEME

--- a/projects/web_demo/web_demo/common/mk_markdown/__init__.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/__init__.py
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
+import os
+import unicodedata
+
+# Unless FTLANG_CACHE is already set, point it at the
+# "resources/fasttext-langdetect" directory located one level above the
+# directory that contains this module.
+# This runs before the fast_langdetect import that follows; presumably the
+# library reads the variable to locate its model file -- TODO confirm.
+if not os.getenv("FTLANG_CACHE"):
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)  # directory holding this module
+    root_dir = os.path.dirname(current_dir)  # its parent directory
+    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
+    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
+
+from fast_langdetect import detect_language
+
+
+def detect_lang(text: str) -> str:
+
+    if len(text) == 0:
+        return ""
+    try:
+        lang_upper = detect_language(text)
+    except:
+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
+        lang_upper = detect_language(html_no_ctrl_chars)
+    try:
+        lang = lang_upper.lower()
+    except:
+        lang = ""
+    return lang
+
+
+if __name__ == '__main__':
+    print(os.getenv("FTLANG_CACHE"))
+    print(detect_lang("This is a test."))
+    print(detect_lang("<html>This is a test</html>"))
+    print(detect_lang("这个是中文测试。"))
+    print(detect_lang("<html>这个是中文测试。</html>"))
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
+import re
+
+
+def escape_special_markdown_char(pymu_blocks):
+    """
+    转义正文里对markdown语法有特殊意义的字符
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for blk in pymu_blocks:
+        for line in blk['lines']:
+            for span in line['spans']:
+                for char in special_chars:
+                    span_text = span['text']
+                    span_type = span.get("_type", None)
+                    if span_type in ['inline-equation', 'interline-equation']:
+                        continue
+                    elif span_text:
+                        span['text'] = span['text'].replace(char, "\\" + char)
+
+    return pymu_blocks
+
+
+def ocr_escape_special_markdown_char(content):
+    """
+    转义正文里对markdown语法有特殊意义的字符
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for char in special_chars:
+        content = content.replace(char, "\\" + char)
+
+    return content
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
+# Span-level type tags: the markdown builder compares span['type'] against
+# these string constants.
+class ContentType:
+    Image = 'image'
+    Table = 'table'
+    Text = 'text'
+    InlineEquation = 'inline_equation'
+    InterlineEquation = 'interline_equation'
+
+
+# Block-level type tags: the markdown builder compares a block's 'type'
+# value against these string constants. Image and Table are container
+# blocks whose nested blocks use the *Body / *Caption / *Footnote tags.
+class BlockType:
+    Image = 'image'
+    ImageBody = 'image_body'
+    ImageCaption = 'image_caption'
+    ImageFootnote = 'image_footnote'
+    Table = 'table'
+    TableBody = 'table_body'
+    TableCaption = 'table_caption'
+    TableFootnote = 'table_footnote'
+    Text = 'text'
+    Title = 'title'
+    InterlineEquation = 'interline_equation'
+    Footnote = 'footnote'
+    Discarded = 'discarded'
+
+
+# Integer category identifiers.
+# NOTE(review): presumably the class ids emitted by the layout / formula /
+# OCR models -- nothing in the visible code references them; confirm.
+class CategoryId:
+    Title = 0
+    Text = 1
+    Abandon = 2
+    ImageBody = 3
+    ImageCaption = 4
+    TableBody = 5
+    TableCaption = 6
+    TableFootnote = 7
+    InterlineEquation_Layout = 8
+    InlineEquation = 13
+    InterlineEquation_YOLO = 14
+    OcrText = 15
+    ImageFootnote = 101
--- a/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
+import re
+import wordninja
+from .libs.language import detect_lang
+from .libs.markdown_utils import ocr_escape_special_markdown_char
+from .libs.ocr_content_type import BlockType, ContentType
+
+
+def __is_hyphen_at_line_end(line):
+    """
+    Check if a line ends with one or more letters followed by a hyphen.
+
+    Args:
+    line (str): The line of text to check.
+
+    Returns:
+    bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
+    """
+    # Use regex to check if the line ends with one or more letters followed by a hyphen
+    return bool(re.search(r'[A-Za-z]+-\s*$', line))
+
+
+def split_long_words(text):
+    segments = text.split(' ')
+    for i in range(len(segments)):
+        words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
+        for j in range(len(words)):
+            if len(words[j]) > 10:
+                words[j] = ' '.join(wordninja.split(words[j]))
+        segments[i] = ''.join(words)
+    return ' '.join(segments)
+
+
+def join_path(*args):
+    return ''.join(str(s).rstrip('/') for s in args)
+
+
+def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
+                                                img_buket_path):
+    markdown_with_para_and_pagination = []
+    page_no = 0
+    for page_info in pdf_info_dict:
+        paras_of_layout = page_info.get('para_blocks')
+        if not paras_of_layout:
+            continue
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'mm', img_buket_path)
+        markdown_with_para_and_pagination.append({
+            'page_no':
+                page_no,
+            'md_content':
+                '\n\n'.join(page_markdown)
+        })
+        page_no += 1
+    return markdown_with_para_and_pagination
+
+
+# Flatten one block (a dict with 'lines' -> 'spans') into a single string:
+# text spans are markdown-escaped, inline equations are wrapped in $...$,
+# interline equations in $$...$$, and the pieces are joined with or without
+# spaces depending on the language detected for the whole line.
+def merge_para_with_text(para_block):
+    # Cheap local heuristic: 'en' when ASCII letters make up at least half of
+    # the text, 'unknown' otherwise, 'empty' for an empty string.
+    def detect_language(text):
+        en_pattern = r'[a-zA-Z]+'
+        en_matches = re.findall(en_pattern, text)
+        en_length = sum(len(match) for match in en_matches)
+        if len(text) > 0:
+            if en_length / len(text) >= 0.5:
+                return 'en'
+            else:
+                return 'unknown'
+        else:
+            return 'empty'
+
+    para_text = ''
+    for line in para_block['lines']:
+        line_text = ''
+        line_lang = ''
+        # First pass: gather the line's plain text so the language is
+        # detected on the whole line rather than on individual spans.
+        for span in line['spans']:
+            span_type = span['type']
+            if span_type == ContentType.Text:
+                line_text += span['content'].strip()
+        if line_text != '':
+            line_lang = detect_lang(line_text)
+        # Second pass: render each span and append it to the paragraph.
+        for span in line['spans']:
+            span_type = span['type']
+            content = ''
+            if span_type == ContentType.Text:
+                content = span['content']
+                # Per-span check uses the local heuristic above, not detect_lang.
+                language = detect_language(content)
+                if language == 'en':  # Only English text gets long-word splitting; segmenting Chinese would lose text
+                    content = ocr_escape_special_markdown_char(
+                        split_long_words(content))
+                else:
+                    content = ocr_escape_special_markdown_char(content)
+            elif span_type == ContentType.InlineEquation:
+                content = f" ${span['content']}$ "
+            elif span_type == ContentType.InterlineEquation:
+                content = f"\n$$\n{span['content']}\n$$\n"
+
+            if content != '':
+                langs = ['zh', 'ja', 'ko']
+                if line_lang in langs:  # Some documents have one character per span; single-character detection is unreliable, so the whole line's language is used
+                    para_text += content  # Chinese/Japanese/Korean context: no space between contents
+                elif line_lang == 'en':
+                    # Content ending in a hyphenated word break: drop the last
+                    # character and do not append a space.
+                    # NOTE(review): the helper's pattern also accepts whitespace
+                    # after the hyphen; in that case [:-1] removes a whitespace
+                    # character rather than the hyphen -- confirm intended.
+                    if __is_hyphen_at_line_end(content):
+                        para_text += content[:-1]
+                    else:
+                        para_text += content + ' '
+                else:
+                    para_text += content + ' '  # Western-text context: contents are separated by a space
+    return para_text
+
+
+# Render a page's blocks into a list of markdown strings, one per non-empty
+# block. mode 'nlp' skips image and table blocks entirely; mode 'mm' emits
+# image links (img_buket_path joined with the span's image_path) and table
+# content. Any other mode leaves image/table blocks empty, so they are dropped.
+def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
+                                      mode,
+                                      img_buket_path=''):
+    page_markdown = []
+    for para_block in paras_of_layout:
+        para_text = ''
+        para_type = para_block['type']
+        if para_type == BlockType.Text:
+            para_text = merge_para_with_text(para_block)
+        elif para_type == BlockType.Title:
+            # Titles are always emitted as level-1 headings.
+            para_text = f'# {merge_para_with_text(para_block)}'
+        elif para_type == BlockType.InterlineEquation:
+            para_text = merge_para_with_text(para_block)
+        elif para_type == BlockType.Image:
+            if mode == 'nlp':
+                continue
+            elif mode == 'mm':
+                for block in para_block['blocks']:  # 1st: emit image_body
+                    if block['type'] == BlockType.ImageBody:
+                        for line in block['lines']:
+                            for span in line['spans']:
+                                if span['type'] == ContentType.Image:
+                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
+                for block in para_block['blocks']:  # 2nd: emit image_caption
+                    if block['type'] == BlockType.ImageCaption:
+                        para_text += merge_para_with_text(block)
+                for block in para_block['blocks']:  # 3rd: emit image_footnote
+                    if block['type'] == BlockType.ImageFootnote:
+                        para_text += merge_para_with_text(block)
+        elif para_type == BlockType.Table:
+            if mode == 'nlp':
+                continue
+            elif mode == 'mm':
+                for block in para_block['blocks']:  # 1st: emit table_caption
+                    if block['type'] == BlockType.TableCaption:
+                        para_text += merge_para_with_text(block)
+                for block in para_block['blocks']:  # 2nd: emit table_body
+                    if block['type'] == BlockType.TableBody:
+                        for line in block['lines']:
+                            for span in line['spans']:
+                                if span['type'] == ContentType.Table:
+                                    # Prefer a non-empty 'latex' value, then a
+                                    # non-empty 'html' value; otherwise fall
+                                    # back to linking the table image.
+                                    if span.get('latex', ''):
+                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    elif span.get('html', ''):
+                                        para_text += f"\n\n{span['html']}\n\n"
+                                    else:
+                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
+                for block in para_block['blocks']:  # 3rd: emit table_footnote
+                    if block['type'] == BlockType.TableFootnote:
+                        para_text += merge_para_with_text(block)
+
+        # Blocks that rendered to nothing but whitespace are dropped; kept
+        # blocks are stripped and given two trailing spaces.
+        if para_text.strip() == '':
+            continue
+        else:
+            page_markdown.append(para_text.strip() + '  ')
+
+    return page_markdown
--- a/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
+++ b/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
--- a/projects/web_demo/web_demo/config/config.yaml
+++ b/projects/web_demo/web_demo/config/config.yaml
@@ -13,6 +13,8 @@ BaseConfig: &base
  PDF_ANALYSIS_FOLDER: "analysis_pdf"
  # 前端项目打包的路径
  REACT_APP_DIST: "../../web/dist/"
+  # 文件访问路径
+  FILE_API: "/api/v2/analysis/pdf_img?as_attachment=False"

 # 开发配置
 DevelopmentConfig: