Commit dff11700 authored by myhloli's avatar myhloli
Browse files

feat: update project list in README files to reflect compatibility with version 2.0

parent d41179da
import hashlib
import mimetypes
import urllib.parse
def _has_pdf_header(file):
    """Return True when *file* starts with the ``%PDF-`` marker.

    Reads the first five bytes and then rewinds the file to offset 0 so the
    caller can read it again from the beginning.
    """
    file_start = file.read(5)
    file.seek(0)  # reset the file pointer
    return file_start.startswith(b'%PDF-')


def is_pdf(filename, file):
    """
    Decide whether an uploaded file is a PDF; file names containing
    non-ASCII or URL-encoded characters are supported.

    The file name is URL-decoded and its MIME type guessed from the extension.
    A name that maps to a non-PDF MIME type is rejected outright; otherwise
    (PDF MIME type, or no MIME type could be guessed) the ``%PDF-`` header in
    the file content decides. If anything fails along the way, the header
    check alone is attempted as a last resort.

    :param filename: file name, possibly URL-encoded
    :param file: binary file-like object supporting ``read`` and ``seek``
    :return: True if the file is judged to be a PDF, otherwise False
    """
    try:
        # URL-decode the name so percent-encoded characters do not hide the extension
        decoded_filename = urllib.parse.unquote(filename)
        mime_type = mimetypes.guess_type(decoded_filename)[0]
        print(f"Detected MIME type: {mime_type}")
        # A known, non-PDF MIME type is rejected without touching the content.
        # When the type is unknown (None), only the header is checked.
        if mime_type is not None and mime_type != 'application/pdf':
            return False
        return _has_pdf_header(file)
    except Exception as e:
        print(f"Error checking PDF format: {str(e)}")
        # Even after an error, still try to decide from the file header.
        try:
            return _has_pdf_header(file)
        except Exception:
            # Best effort: an unreadable file is simply "not a PDF".
            # KeyboardInterrupt/SystemExit are deliberately not swallowed here.
            return False
def url_is_pdf(file):
    """
    Report whether a file object holds PDF data, judged by its leading bytes.

    :param file: binary file-like object supporting ``read`` and ``seek``
    :return: True if the content starts with ``%PDF-``, otherwise False
    """
    magic = file.read(5)
    file.seek(0)  # rewind so the content can be read again from the start
    return magic.startswith(b'%PDF-')
def calculate_file_hash(file, algorithm='sha256'):
    """
    Compute the hex digest of a file object's content.

    The file is consumed in 64 KiB chunks from its current position and is
    rewound to offset 0 before returning.

    :param file: file-like object supporting ``read`` and ``seek``
    :param algorithm: name of a ``hashlib`` constructor, e.g. 'sha256', 'md5', 'sha1'
    :return: hexadecimal digest string
    """
    digest = getattr(hashlib, algorithm)()
    chunk_size = 65536  # 64KB chunks
    while True:
        chunk = file.read(chunk_size)
        if len(chunk) == 0:
            break
        digest.update(chunk)
    file.seek(0)
    return digest.hexdigest()
def singleton_func(cls):
    """Decorator that makes ``cls`` produce a single shared instance.

    The first call constructs the instance with the arguments given; every
    later call returns that same object and ignores its arguments.
    """
    created = {}

    def _singleton(*args, **kwargs):
        try:
            return created[cls]
        except KeyError:
            created[cls] = cls(*args, **kwargs)
            return created[cls]

    return _singleton
from api.analysis.models import *
\ No newline at end of file
import os
from loguru import logger
from pathlib import Path
from datetime import datetime
def setup_log(config):
    """
    Register a daily log file with the loguru logger.

    The file lives in a ``log`` directory two levels above this module and is
    named ``log_<YYYY-MM-DD>.log``; the directory is created when missing.

    :param config: mapping-like object; its "LOG_LEVEL" entry sets the sink level
    :return: None
    """
    log_dir = os.path.join(Path(__file__).parent.parent, "log")
    log_dir_path = Path(log_dir)
    if not log_dir_path.exists():
        log_dir_path.mkdir(parents=True, exist_ok=True)

    level = config.get("LOG_LEVEL")
    today = datetime.now().strftime("%Y-%m-%d")
    sink = os.path.join(log_dir, f'log_{today}.log')
    logger.add(
        str(sink),
        rotation='00:00',
        encoding='utf-8',
        level=level,
        enqueue=True,
    )
import os
import unicodedata
# Default FTLANG_CACHE to <parent of this module's directory>/resources/fasttext-langdetect
# unless the environment already provides a non-empty value.
# NOTE(review): presumably this must run before fast_langdetect is imported — confirm.
if not os.environ.get("FTLANG_CACHE"):
    current_file_path = os.path.abspath(__file__)
    current_dir = os.path.dirname(current_file_path)
    root_dir = os.path.dirname(current_dir)
    ftlang_cache_dir = os.path.join(
        root_dir, 'resources', 'fasttext-langdetect')
    os.environ["FTLANG_CACHE"] = ftlang_cache_dir
from fast_langdetect import detect_language
def detect_lang(text: str) -> str:
    """Return the lower-cased language code detected for *text*.

    Detection is delegated to ``detect_language``. If the first attempt
    raises, control characters (Unicode categories starting with ``C``) are
    stripped and detection is retried once; a failure of the retry propagates
    to the caller.

    :param text: text to classify
    :return: lower-cased detection result, or "" for empty input or when the
        detector's result cannot be lower-cased
    """
    if not text:
        return ""
    try:
        lang_upper = detect_language(text)
    except Exception:
        # Retry without control characters, which can make the detector fail.
        text_no_ctrl_chars = ''.join(
            ch for ch in text if unicodedata.category(ch)[0] != 'C')
        lang_upper = detect_language(text_no_ctrl_chars)
    try:
        return lang_upper.lower()
    except Exception:
        # The detector returned something that is not a string; treat as unknown.
        return ""
if __name__ == '__main__':
    # Manual smoke test: show the cache location, then the detected language
    # for English and Chinese samples, with and without HTML tags.
    print(os.getenv("FTLANG_CACHE"))
    for sample in (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    ):
        print(detect_lang(sample))
import re
def escape_special_markdown_char(pymu_blocks):
    """
    Escape characters in body text that have special meaning in markdown.

    Every span's 'text' is rewritten in place with ``*``, `````, ``~`` and
    ``$`` backslash-escaped. Spans whose '_type' is 'inline-equation' or
    'interline-equation', and spans with empty text, are left untouched.

    :param pymu_blocks: list of blocks, each with 'lines' -> 'spans' -> 'text'
    :return: the same ``pymu_blocks`` object, after in-place modification
    """
    special_chars = ["*", "`", "~", "$"]
    equation_types = ['inline-equation', 'interline-equation']
    for blk in pymu_blocks:
        for line in blk['lines']:
            for span in line['spans']:
                text = span['text']
                if span.get("_type", None) in equation_types:
                    continue
                if not text:
                    continue
                for char in special_chars:
                    text = text.replace(char, "\\" + char)
                span['text'] = text
    return pymu_blocks
def ocr_escape_special_markdown_char(content):
    """
    Escape characters in body text that have special meaning in markdown.

    :param content: text to escape
    :return: ``content`` with ``*``, `````, ``~`` and ``$`` each preceded by a backslash
    """
    escapes = str.maketrans({char: "\\" + char for char in ("*", "`", "~", "$")})
    return content.translate(escapes)
class ContentType:
    """String tags stored in a span's 'type' field."""

    Image = "image"
    Table = "table"
    Text = "text"
    InlineEquation = "inline_equation"
    InterlineEquation = "interline_equation"
class BlockType:
    """String tags stored in a block's 'type' field."""

    # Image block and its sub-blocks
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"
    ImageFootnote = "image_footnote"

    # Table block and its sub-blocks
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"

    # Remaining block kinds
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
    Discarded = "discarded"
class CategoryId:
    """Integer category ids.

    The numbering is not contiguous (it jumps from 8 to 13 and from 15 to 101).
    NOTE(review): presumably these mirror the class ids emitted by the
    layout/formula detection models — confirm against the model configs.
    """

    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8

    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15

    ImageFootnote = 101
import re
import wordninja
from .libs.language import detect_lang
from .libs.markdown_utils import ocr_escape_special_markdown_char
from .libs.ocr_content_type import BlockType, ContentType
def __is_hyphen_at_line_end(line):
    """Tell whether *line* ends in a hyphenated word fragment.

    A match requires one or more ASCII letters, then a hyphen, then nothing
    but optional whitespace up to the end of the string.

    Args:
        line (str): The text to inspect.

    Returns:
        bool: True when the line ends with letters followed by a hyphen,
        False otherwise.
    """
    trailing_hyphen = re.search(r'[A-Za-z]+-\s*$', line)
    return trailing_hyphen is not None
def split_long_words(text):
    """Break over-long words into smaller words with ``wordninja``.

    The text is split on single spaces; inside each piece, every run of word
    characters longer than 10 characters is replaced by its
    ``wordninja.split`` parts joined with spaces. Everything else, including
    punctuation and the original spacing, is kept as is.
    """

    def _split_segment(segment):
        tokens = re.findall(r'\w+|[^\w]', segment, re.UNICODE)
        return ''.join(
            ' '.join(wordninja.split(token)) if len(token) > 10 else token
            for token in tokens
        )

    return ' '.join(_split_segment(segment) for segment in text.split(' '))
def join_path(*args):
    """Concatenate the string form of each argument after removing its trailing '/'.

    The pieces are joined with no separator between them.
    """
    pieces = [str(part).rstrip('/') for part in args]
    return ''.join(pieces)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """Build per-page markdown in 'mm' mode.

    Pages without 'para_blocks' are skipped and do not consume a page number,
    so 'page_no' counts only the pages that were emitted, starting at 0.

    :param pdf_info_dict: list of page dicts, each optionally holding 'para_blocks'
    :param img_buket_path: prefix passed through for image links
    :return: list of {'page_no': int, 'md_content': str} dicts
    """
    pages = []
    for page_info in pdf_info_dict:
        para_blocks = page_info.get('para_blocks')
        if para_blocks:
            fragments = ocr_mk_markdown_with_para_core_v2(
                para_blocks, 'mm', img_buket_path)
            pages.append({
                # Number of pages already emitted == next page number.
                'page_no': len(pages),
                'md_content': '\n\n'.join(fragments),
            })
    return pages
def merge_para_with_text(para_block):
    """Flatten a block's lines and spans into a single markdown string.

    Text spans are markdown-escaped (mostly-English spans also get long words
    split), inline equations are wrapped in ``$...$`` and interline equations
    in ``$$`` fences. How spans are separated depends on the language detected
    for the whole line:

    * 'zh' / 'ja' / 'ko': spans are concatenated without a separator;
    * 'en': a span ending in ``letters-`` has the hyphen (and any whitespace
      after it) removed and gets no trailing space, so the word continues
      directly; other spans get a trailing space;
    * anything else: every span gets a trailing space.

    :param para_block: dict with 'lines', each holding 'spans' that carry
        'type' and 'content'
    :return: the merged paragraph text
    """

    def detect_language(text):
        # Cheap per-span heuristic: 'en' when at least half of the characters
        # are ASCII letters, 'unknown' otherwise, 'empty' for empty text.
        if len(text) == 0:
            return 'empty'
        en_length = sum(len(match) for match in re.findall(r'[a-zA-Z]+', text))
        return 'en' if en_length / len(text) >= 0.5 else 'unknown'

    no_space_langs = ('zh', 'ja', 'ko')
    pieces = []
    for line in para_block['lines']:
        spans = line['spans']

        # Some documents put a single character in each span, which is too
        # little for reliable detection, so the language is judged on the
        # text of the whole line.
        line_text = ''.join(
            span['content'].strip()
            for span in spans
            if span['type'] == ContentType.Text
        )
        line_lang = detect_lang(line_text) if line_text != '' else ''

        for span in spans:
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = span['content']
                # Only split long words in English text; splitting Chinese
                # text would lose characters.
                if detect_language(content) == 'en':
                    content = ocr_escape_special_markdown_char(
                        split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f" ${span['content']}$ "
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if content == '':
                continue

            if line_lang in no_space_langs:
                # Chinese/Japanese/Korean: no separator between spans.
                pieces.append(content)
            elif line_lang == 'en' and __is_hyphen_at_line_end(content):
                # The helper tolerates whitespace after the hyphen, so strip
                # it first; otherwise the slice would remove the whitespace
                # and leave the hyphen in place.
                pieces.append(content.rstrip()[:-1])
            else:
                # Western text: spans are separated by a space.
                pieces.append(content + ' ')
    return ''.join(pieces)
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                      mode,
                                      img_buket_path=''):
    """Render one page's paragraph blocks as a list of markdown fragments.

    Text and interline-equation blocks are merged as is, titles get a ``# ``
    prefix. Image and table blocks are rendered only when ``mode`` is 'mm'
    (for 'nlp' and any other mode they produce nothing):

    * image: body images, then captions, then footnotes;
    * table: captions, then the body (latex, else html, else an image link),
      then footnotes.

    Blocks that render to whitespace only are dropped; every kept fragment is
    stripped and given one trailing space.

    :param paras_of_layout: iterable of block dicts with a 'type' key
    :param mode: 'mm' or 'nlp'
    :param img_buket_path: prefix joined in front of each span's 'image_path'
    :return: list of markdown strings, one per rendered block
    """

    def _image_markdown(span):
        # Markdown image reference for a span that carries an 'image_path'.
        return f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"

    def _spans_of(container, wanted_type):
        # All spans inside the sub-blocks of the wanted type, in order.
        for sub_block in container['blocks']:
            if sub_block['type'] == wanted_type:
                for line in sub_block['lines']:
                    yield from line['spans']

    def _merged_text(container, wanted_type):
        # Merged text of every sub-block of the wanted type, in order.
        return ''.join(
            merge_para_with_text(sub_block)
            for sub_block in container['blocks']
            if sub_block['type'] == wanted_type
        )

    def _render_image(image_block):
        # 1st: image body, 2nd: image caption, 3rd: image footnote.
        body = ''.join(
            _image_markdown(span)
            for span in _spans_of(image_block, BlockType.ImageBody)
            if span['type'] == ContentType.Image
        )
        caption = _merged_text(image_block, BlockType.ImageCaption)
        footnote = _merged_text(image_block, BlockType.ImageFootnote)
        return body + caption + footnote

    def _render_table(table_block):
        # 1st: table caption, 2nd: table body, 3rd: table footnote.
        caption = _merged_text(table_block, BlockType.TableCaption)
        body_parts = []
        for span in _spans_of(table_block, BlockType.TableBody):
            if span['type'] != ContentType.Table:
                continue
            # Prefer the table model's output; fall back to the cropped image.
            if span.get('latex', ''):
                body_parts.append(f"\n\n$\n {span['latex']}\n$\n\n")
            elif span.get('html', ''):
                body_parts.append(f"\n\n{span['html']}\n\n")
            else:
                body_parts.append(_image_markdown(span))
        footnote = _merged_text(table_block, BlockType.TableFootnote)
        return caption + ''.join(body_parts) + footnote

    page_markdown = []
    for para_block in paras_of_layout:
        block_type = para_block['type']
        if block_type in (BlockType.Text, BlockType.InterlineEquation):
            rendered = merge_para_with_text(para_block)
        elif block_type == BlockType.Title:
            rendered = f'# {merge_para_with_text(para_block)}'
        elif block_type == BlockType.Image and mode == 'mm':
            rendered = _render_image(para_block)
        elif block_type == BlockType.Table and mode == 'mm':
            rendered = _render_table(para_block)
        else:
            rendered = ''

        rendered = rendered.strip()
        if rendered != '':
            page_markdown.append(rendered + ' ')
    return page_markdown
def before_request():
    """Pre-request hook that does nothing and returns None."""
    return
def after_request(response):
    """Add permissive CORS headers to *response* and return it.

    :param response: response object whose ``headers`` supports ``add(name, value)``
    :return: the same response object
    """
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Headers', 'Content-Type,Authorization'),
    )
    for name, value in cors_headers:
        response.headers.add(name, value)
    return response
# Base configuration (merged into each environment below via the &base anchor)
BaseConfig: &base
DEBUG: false
PORT: 5559
LOG_LEVEL: "DEBUG"
SQLALCHEMY_TRACK_MODIFICATIONS: true
SQLALCHEMY_DATABASE_URI: ""
PROPAGATE_EXCEPTIONS: true
SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
JWT_ACCESS_TOKEN_EXPIRES: 3600
PDF_UPLOAD_FOLDER: "upload_pdf"
PDF_ANALYSIS_FOLDER: "analysis_pdf"
# Path to the bundled (built) front-end project
REACT_APP_DIST: "../../web/dist/"
# File access path
FILE_API: "/api/v2/analysis/pdf_img?as_attachment=False"
# Development configuration
DevelopmentConfig:
<<: *base
database:
type: sqlite
path: config/mineru_web.db
# Production configuration
ProductionConfig:
<<: *base
# Testing configuration
TestingConfig:
<<: *base
# Configuration currently in use
CurrentConfig: "DevelopmentConfig"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment