feat: update project list in README files to reflect compatibility with version 2.0

dff11700 · myhloli · d41179da · d41179da · d41179da · d41179da
Commit dff11700 authored Jun 13, 2025 by myhloli
15 changed files
--- a/projects/web_demo/web_demo/common/ext.py
+++ b/projects/web_demo/web_demo/common/ext.py
-import hashlib
-import mimetypes
-import urllib.parse
-def _starts_with_pdf_magic(file):
-    """
-    Report whether the stream begins with the ``%PDF-`` signature.
-    Reads the first 5 bytes and then seeks the stream back to offset 0.
-    :param file: binary file object supporting read() and seek()
-    :return: True if the first bytes are ``%PDF-``, otherwise False
-    """
-    file_start = file.read(5)
-    file.seek(0)  # rewind so the caller can read the stream from the start
-    return file_start.startswith(b'%PDF-')
-def is_pdf(filename, file):
-    """
-    Decide whether an upload is a PDF; handles percent-encoded file names.
-    A file is accepted when its name does not map to a known non-PDF MIME type
-    and its content starts with the ``%PDF-`` signature.
-    :param filename: file name, possibly URL-encoded
-    :param file: binary file object supporting read() and seek()
-    :return: True if the file looks like a PDF, otherwise False
-    """
-    try:
-        # URL-decode the name so encoded special characters do not confuse the MIME guess
-        decoded_filename = urllib.parse.unquote(filename)
-        mime_type, _ = mimetypes.guess_type(decoded_filename)
-        print(f"Detected MIME type: {mime_type}")
-        # An unknown MIME type (None) is not a rejection; only a known non-PDF type is
-        if mime_type not in (None, 'application/pdf'):
-            return False
-        return _starts_with_pdf_magic(file)
-    except Exception as e:
-        print(f"Error checking PDF format: {str(e)}")
-        # On any error, still try to decide from the file header alone
-        try:
-            return _starts_with_pdf_magic(file)
-        except Exception:
-            return False
-def url_is_pdf(file):
-    """
-    Check a file object for the PDF signature by content only.
-    :param file: binary file object supporting read() and seek(); it is rewound
-        to offset 0 after the check
-    :return: True if the first bytes are ``%PDF-``, otherwise False
-    """
-    header = file.read(5)
-    file.seek(0)
-    return header.startswith(b'%PDF-')
-def calculate_file_hash(file, algorithm='sha256'):
-    """
-    Hash the data read from a file object and return the hex digest.
-    :param file: file object; it is read from its current position to the end
-        and then rewound to offset 0
-    :param algorithm: name of a hashlib constructor, e.g. 'sha256', 'md5', 'sha1'
-    :return: hexadecimal digest string
-    """
-    digest = getattr(hashlib, algorithm)()
-    chunk_size = 65536  # 64KB chunks
-    while True:
-        chunk = file.read(chunk_size)
-        if len(chunk) == 0:
-            break
-        digest.update(chunk)
-    file.seek(0)
-    return digest.hexdigest()
-def singleton_func(cls):
-    """
-    Class decorator that makes ``cls`` a singleton.
-    The first call constructs the instance with the given arguments; every later
-    call returns that same instance and ignores its arguments.
-    :param cls: class to wrap
-    :return: factory function that returns the single shared instance
-    """
-    import functools  # local import: the dependency is introduced by this block
-    instance = {}
-    # wraps() keeps the class's __name__/__doc__ visible on the factory;
-    # updated=() stops the class namespace being copied into the factory's __dict__
-    @functools.wraps(cls, updated=())
-    def _singleton(*args, **kwargs):
-        if cls not in instance:
-            instance[cls] = cls(*args, **kwargs)
-        return instance[cls]
-    return _singleton
--- a/projects/web_demo/web_demo/common/import_models.py
+++ b/projects/web_demo/web_demo/common/import_models.py
-from api.analysis.models import *
\ No newline at end of file
--- a/projects/web_demo/web_demo/common/logger.py
+++ b/projects/web_demo/web_demo/common/logger.py
-import os
-from loguru import logger
-from pathlib import Path
-from datetime import datetime
-def setup_log(config):
-    """
-    Register a loguru file sink in the "log" directory that sits next to this
-    module's parent directory, creating the directory when it is missing.
-    :param config: mapping that provides "LOG_LEVEL" through .get()
-    :return: None
-    """
-    log_dir = os.path.join(Path(__file__).parent.parent, "log")
-    if not Path(log_dir).exists():
-        Path(log_dir).mkdir(parents=True, exist_ok=True)
-    # The file name carries the date on which setup ran
-    today = datetime.now().strftime("%Y-%m-%d")
-    sink = os.path.join(log_dir, f'log_{today}.log')
-    logger.add(str(sink), rotation='00:00', encoding='utf-8', level=config.get("LOG_LEVEL"), enqueue=True)
--- a/projects/web_demo/web_demo/common/mk_markdown/__init__.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/__init__.py
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
-import os
-import unicodedata
-# Default FTLANG_CACHE to the 'resources/fasttext-langdetect' directory one level
-# above this file's directory, unless the variable is already set to a non-empty
-# value. Presumably read by fast_langdetect when it loads its model - confirm.
-if not os.getenv("FTLANG_CACHE"):
-    current_file_path = os.path.abspath(__file__)
-    current_dir = os.path.dirname(current_file_path)
-    # Parent of the directory that contains this file
-    root_dir = os.path.dirname(current_dir)
-    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
-    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
-    # print(os.getenv("FTLANG_CACHE"))
-from fast_langdetect import detect_language
-def detect_lang(text: str) -> str:
-    """
-    Detect the language of ``text`` and return it in lowercase.
-    Detection is best-effort: if the first attempt raises, it is retried once on
-    the text with Unicode control characters removed.
-    :param text: text to classify
-    :return: lowercased result of detect_language, or "" when the text is empty
-        or detection fails
-    """
-    if not text:
-        return ""
-    try:
-        lang_upper = detect_language(text)
-    except Exception:
-        # Retry with characters of Unicode category C* (control etc.) stripped out
-        html_no_ctrl_chars = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')
-        try:
-            lang_upper = detect_language(html_no_ctrl_chars)
-        except Exception:
-            # Report "unknown" instead of letting the failure crash the caller
-            return ""
-    try:
-        return lang_upper.lower()
-    except AttributeError:
-        # detect_language returned something without .lower() (e.g. None)
-        return ""
-if __name__ == '__main__':
-    # Manual smoke check: show the cache location, then classify a few samples
-    print(os.getenv("FTLANG_CACHE"))
-    for sample in (
-        "This is a test.",
-        "<html>This is a test</html>",
-        "这个是中文测试。",
-        "<html>这个是中文测试。</html>",
-    ):
-        print(detect_lang(sample))
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
-import re
-def escape_special_markdown_char(pymu_blocks):
-    """
-    Backslash-escape characters that carry markdown meaning inside body text.
-    Every span's 'text' is rewritten in place, except spans whose '_type' is
-    'inline-equation' or 'interline-equation' and spans with empty text.
-    :param pymu_blocks: list of blocks, each with 'lines' -> 'spans' -> 'text'
-    :return: the same ``pymu_blocks`` object, after modification
-    """
-    special_chars = ["*", "`", "~", "$"]
-    equation_types = ('inline-equation', 'interline-equation')
-    for blk in pymu_blocks:
-        for line in blk['lines']:
-            for span in line['spans']:
-                escaped = span['text']
-                if span.get("_type", None) in equation_types:
-                    continue
-                if not escaped:
-                    continue
-                for char in special_chars:
-                    escaped = escaped.replace(char, "\\" + char)
-                span['text'] = escaped
-    return pymu_blocks
-def ocr_escape_special_markdown_char(content):
-    """
-    Backslash-escape characters that carry markdown meaning inside body text.
-    :param content: text to escape
-    :return: ``content`` with each of ``*``, `````, ``~`` and ``$`` preceded by a backslash
-    """
-    escape_table = str.maketrans({char: "\\" + char for char in ("*", "`", "~", "$")})
-    return content.translate(escape_table)
--- a/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
-# String tags for the kind of content a span carries.
-class ContentType:
-    Image = 'image'
-    Table = 'table'
-    Text = 'text'
-    InlineEquation = 'inline_equation'
-    InterlineEquation = 'interline_equation'
-# String tags for block kinds. Image and Table have matching *_body, *_caption
-# and *_footnote tags (presumably for their child blocks - confirm against the producer).
-class BlockType:
-    Image = 'image'
-    ImageBody = 'image_body'
-    ImageCaption = 'image_caption'
-    ImageFootnote = 'image_footnote'
-    Table = 'table'
-    TableBody = 'table_body'
-    TableCaption = 'table_caption'
-    TableFootnote = 'table_footnote'
-    Text = 'text'
-    Title = 'title'
-    InterlineEquation = 'interline_equation'
-    Footnote = 'footnote'
-    Discarded = 'discarded'
-# Integer category ids. NOTE(review): presumably the class ids emitted by the
-# layout/formula detection models (see the _Layout / _YOLO suffixes) - confirm.
-# The numbering is not contiguous (8 -> 13, 15 -> 101).
-class CategoryId:
-    Title = 0
-    Text = 1
-    Abandon = 2
-    ImageBody = 3
-    ImageCaption = 4
-    TableBody = 5
-    TableCaption = 6
-    TableFootnote = 7
-    InterlineEquation_Layout = 8
-    InlineEquation = 13
-    InterlineEquation_YOLO = 14
-    OcrText = 15
-    ImageFootnote = 101
--- a/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
+++ b/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
-import re
-import wordninja
-from .libs.language import detect_lang
-from .libs.markdown_utils import ocr_escape_special_markdown_char
-from .libs.ocr_content_type import BlockType, ContentType
-def __is_hyphen_at_line_end(line):
-    """
-    Tell whether a line finishes with letters followed by a hyphen.
-    Trailing whitespace after the hyphen is tolerated.
-    Args:
-    line (str): The line of text to check.
-    Returns:
-    bool: True if the line ends with one or more ASCII letters and a hyphen, False otherwise.
-    """
-    return re.search(r'[A-Za-z]+-\s*$', line) is not None
-def split_long_words(text):
-    """
-    Break up over-long words with wordninja, leaving everything else untouched.
-    The text is split on single spaces; inside each piece, any run of word
-    characters longer than 10 characters is replaced by its wordninja split
-    joined with spaces.
-    :param text: input text
-    :return: text with long words split
-    """
-    def _split_segment(segment):
-        tokens = re.findall(r'\w+|[^\w]', segment, re.UNICODE)
-        return ''.join(
-            ' '.join(wordninja.split(token)) if len(token) > 10 else token
-            for token in tokens
-        )
-    return ' '.join(_split_segment(segment) for segment in text.split(' '))
-def join_path(*args):
-    """
-    Concatenate the string forms of ``args`` after stripping trailing '/' from each.
-    No separator is inserted between the parts.
-    :param args: parts to concatenate
-    :return: the concatenated string
-    """
-    parts = [str(part).rstrip('/') for part in args]
-    return ''.join(parts)
-def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
-                                                img_buket_path):
-    """
-    Render each page's paragraph blocks to markdown in 'mm' mode.
-    Pages without 'para_blocks' are skipped and do not consume a page number,
-    so 'page_no' counts only the pages that were rendered.
-    :param pdf_info_dict: list of page dicts, each optionally holding 'para_blocks'
-    :param img_buket_path: prefix passed through for building image links
-    :return: list of {'page_no': int, 'md_content': str}
-    """
-    rendered_pages = []
-    for page_info in pdf_info_dict:
-        blocks = page_info.get('para_blocks')
-        if blocks:
-            paragraphs = ocr_mk_markdown_with_para_core_v2(
-                blocks, 'mm', img_buket_path)
-            rendered_pages.append({
-                'page_no': len(rendered_pages),
-                'md_content': '\n\n'.join(paragraphs),
-            })
-    return rendered_pages
-def merge_para_with_text(para_block):
-    """
-    Flatten the spans of a paragraph block into one markdown string.
-    Text spans are markdown-escaped (mostly-English spans are first run through
-    split_long_words), inline equations are wrapped in ``$...$`` and interline
-    equations in ``$$`` fences. Whether a space follows each span depends on the
-    language detected for the whole line.
-    :param para_block: dict with a 'lines' list; each line has a 'spans' list of
-        dicts carrying 'type' and 'content'
-    :return: the concatenated paragraph text
-    """
-    def detect_language(text):
-        # Cheap heuristic: 'en' when at least half of the characters are ASCII letters
-        if len(text) == 0:
-            return 'empty'
-        en_length = sum(len(match) for match in re.findall(r'[a-zA-Z]+', text))
-        return 'en' if en_length / len(text) >= 0.5 else 'unknown'
-    # Languages whose spans are joined without a separating space
-    langs = ['zh', 'ja', 'ko']
-    para_text = ''
-    for line in para_block['lines']:
-        # Some documents put one character per span, where per-span detection is
-        # unreliable, so the language is detected on the text of the whole line
-        line_text = ''.join(
-            span['content'].strip()
-            for span in line['spans']
-            if span['type'] == ContentType.Text)
-        line_lang = detect_lang(line_text) if line_text != '' else ''
-        for span in line['spans']:
-            span_type = span['type']
-            content = ''
-            if span_type == ContentType.Text:
-                content = span['content']
-                # Only English text is word-split; splitting Chinese would lose text
-                if detect_language(content) == 'en':
-                    content = ocr_escape_special_markdown_char(
-                        split_long_words(content))
-                else:
-                    content = ocr_escape_special_markdown_char(content)
-            elif span_type == ContentType.InlineEquation:
-                content = f" ${span['content']}$ "
-            elif span_type == ContentType.InterlineEquation:
-                content = f"\n$$\n{span['content']}\n$$\n"
-            if content == '':
-                continue
-            if line_lang in langs:
-                # Chinese/Japanese/Korean context: no space between spans
-                para_text += content
-            elif line_lang == 'en' and __is_hyphen_at_line_end(content):
-                # Hyphenated line break: drop the hyphen (and any whitespace after
-                # it, which the check tolerates) and add no trailing space
-                para_text += content.rstrip()[:-1]
-            else:
-                # Western-text context: spans are separated by a space
-                para_text += content + ' '
-    return para_text
-def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
-                                      mode,
-                                      img_buket_path=''):
-    """
-    Convert a page's paragraph blocks into a list of markdown paragraphs.
-    :param paras_of_layout: list of block dicts, each with a 'type' (BlockType value)
-    :param mode: 'mm' emits images and tables; 'nlp' skips image and table blocks
-    :param img_buket_path: prefix joined in front of each span's 'image_path'
-    :return: list of non-empty paragraph strings, each stripped and suffixed with two spaces
-    """
-    page_markdown = []
-    for para_block in paras_of_layout:
-        para_text = ''
-        para_type = para_block['type']
-        if para_type == BlockType.Text:
-            para_text = merge_para_with_text(para_block)
-        elif para_type == BlockType.Title:
-            para_text = f'# {merge_para_with_text(para_block)}'
-        elif para_type == BlockType.InterlineEquation:
-            para_text = merge_para_with_text(para_block)
-        elif para_type == BlockType.Image:
-            if mode == 'nlp':
-                continue
-            elif mode == 'mm':
-                for block in para_block['blocks']:  # 1st: emit image_body
-                    if block['type'] == BlockType.ImageBody:
-                        for line in block['lines']:
-                            for span in line['spans']:
-                                if span['type'] == ContentType.Image:
-                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
-                for block in para_block['blocks']:  # 2nd: append image_caption
-                    if block['type'] == BlockType.ImageCaption:
-                        para_text += merge_para_with_text(block)
-                for block in para_block['blocks']:  # 3rd: append image_footnote
-                    if block['type'] == BlockType.ImageFootnote:
-                        para_text += merge_para_with_text(block)
-        elif para_type == BlockType.Table:
-            if mode == 'nlp':
-                continue
-            elif mode == 'mm':
-                for block in para_block['blocks']:  # 1st: emit table_caption
-                    if block['type'] == BlockType.TableCaption:
-                        para_text += merge_para_with_text(block)
-                for block in para_block['blocks']:  # 2nd: append table_body
-                    if block['type'] == BlockType.TableBody:
-                        for line in block['lines']:
-                            for span in line['spans']:
-                                if span['type'] == ContentType.Table:
-                                    # if processed by table model; latex wins over html,
-                                    # and the image link is the fallback when neither is set
-                                    if span.get('latex', ''):
-                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
-                                    elif span.get('html', ''):
-                                        para_text += f"\n\n{span['html']}\n\n"
-                                    else:
-                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
-                for block in para_block['blocks']:  # 3rd: append table_footnote
-                    if block['type'] == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(block)
-        # Blocks that produced only whitespace (or an unhandled type) are dropped
-        if para_text.strip() == '':
-            continue
-        else:
-            page_markdown.append(para_text.strip() + '  ')
-    return page_markdown
--- a/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
+++ b/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
--- a/projects/web_demo/web_demo/common/web_hook.py
+++ b/projects/web_demo/web_demo/common/web_hook.py
-def before_request():
-    """
-    Pre-request hook placeholder; does nothing and returns None.
-    NOTE(review): presumably registered as a web-framework before-request hook - confirm.
-    """
-    return None
-def after_request(response):
-    """
-    Add CORS headers to a response and return it.
-    :param response: response object whose ``headers`` supports add(name, value)
-    :return: the same response object
-    """
-    # '*' allows every origin. NOTE(review): confirm this is intended outside a demo.
-    response.headers.add('Access-Control-Allow-Origin', '*')
-    response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
-    return response
--- a/projects/web_demo/web_demo/config/__init__.py
+++ b/projects/web_demo/web_demo/config/__init__.py
--- a/projects/web_demo/web_demo/config/config.yaml
+++ b/projects/web_demo/web_demo/config/config.yaml
-# 基本配置
-BaseConfig: &base
-  DEBUG: false
-  PORT: 5559
-  LOG_LEVEL: "DEBUG"
-  SQLALCHEMY_TRACK_MODIFICATIONS: true
-  SQLALCHEMY_DATABASE_URI: ""
-  PROPAGATE_EXCEPTIONS: true
-  SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
-  JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
-  JWT_ACCESS_TOKEN_EXPIRES: 3600
-  PDF_UPLOAD_FOLDER: "upload_pdf"
-  PDF_ANALYSIS_FOLDER: "analysis_pdf"
-  # 前端项目打包的路径
-  REACT_APP_DIST: "../../web/dist/"
-  # 文件访问路径
-  FILE_API: "/api/v2/analysis/pdf_img?as_attachment=False"
-# 开发配置
-DevelopmentConfig:
-  <<: *base
-  database:
-    type: sqlite
-    path: config/mineru_web.db
-# 生产配置
-ProductionConfig:
-  <<: *base
-# 测试配置
-TestingConfig:
-  <<: *base
-# 当前使用配置
-CurrentConfig: "DevelopmentConfig"
--- a/projects/web_demo/web_demo/config/mineru_web.db
+++ b/projects/web_demo/web_demo/config/mineru_web.db
--- a/projects/web_demo/web_demo/static/__init__.py
+++ b/projects/web_demo/web_demo/static/__init__.py