Unverified Commit 88026879 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2404 from opendatalab/release-1.3.10

Release 1.3.10
parents 1e715d02 2c2fcbe8
...@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte ...@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div> </div>
# Changelog # Changelog
- 2025/04/29 1.3.10 Released
- Added support for custom formula delimiters, configured via the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
- Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
- 2025/04/27 1.3.9 Released - 2025/04/27 1.3.9 Released
- Optimized the formula parsing function to improve the success rate of formula rendering - Optimized the formula parsing function to improve the success rate of formula rendering
- Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues - Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
......
...@@ -47,6 +47,9 @@ ...@@ -47,6 +47,9 @@
</div> </div>
# 更新记录 # 更新记录
- 2025/04/29 1.3.10 发布
- 支持使用自定义公式标识符,可通过修改用户目录下的`magic-pdf.json`文件中的`latex-delimiter-config`项实现。
- 锁定`pdfminer.six`为`20250324`版本,以避免新版本导致的解析失败问题。
- 2025/04/27 1.3.9 发布 - 2025/04/27 1.3.9 发布
- 优化公式解析功能,提升公式渲染的成功率 - 优化公式解析功能,提升公式渲染的成功率
- 更新`pdfminer.six`到最新版本,修复了部分pdf解析异常问题 - 更新`pdfminer.six`到最新版本,修复了部分pdf解析异常问题
......
...@@ -20,6 +20,16 @@ ...@@ -20,6 +20,16 @@
"enable": true, "enable": true,
"max_time": 400 "max_time": 400
}, },
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": { "llm-aided-config": {
"formula_aided": { "formula_aided": {
"api_key": "your_api_key", "api_key": "your_api_key",
...@@ -40,5 +50,5 @@ ...@@ -40,5 +50,5 @@
"enable": false "enable": false
} }
}, },
"config_version": "1.2.0" "config_version": "1.2.1"
} }
\ No newline at end of file
...@@ -5,6 +5,7 @@ from loguru import logger ...@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.config_reader import get_latex_delimiter_config
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.post_proc.para_split_v3 import ListLineTag from magic_pdf.post_proc.para_split_v3 import ListLineTag
...@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str: ...@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
result.append(char) result.append(char)
return ''.join(result) return ''.join(result)
# Resolve the LaTeX formula delimiters once at import time.
# The user config ('latex-delimiter-config' in magic-pdf.json) may be missing,
# empty, or only partially filled in, so every value falls back to the
# built-in '$$' / '$' defaults instead of raising KeyError during import.
latex_delimiters_config = get_latex_delimiter_config()

default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'},
}

# Anything that is not a mapping (None, a string, ...) is treated as "no config".
_user_delimiters = latex_delimiters_config if isinstance(latex_delimiters_config, dict) else {}

# Overlay the user's values on top of the defaults, one delimiter kind at a time,
# so a config that only defines e.g. 'inline' still yields a usable 'display' pair.
delimiters = {
    kind: (
        {**fallback, **_user_delimiters[kind]}
        if isinstance(_user_delimiters.get(kind), dict)
        else dict(fallback)
    )
    for kind, fallback in default_delimiters.items()
}

display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
block_text = '' block_text = ''
...@@ -168,9 +182,9 @@ def merge_para_with_text(para_block): ...@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = ocr_escape_special_markdown_char(span['content']) content = ocr_escape_special_markdown_char(span['content'])
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$" content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
content = content.strip() content = content.strip()
......
...@@ -125,6 +125,15 @@ def get_llm_aided_config(): ...@@ -125,6 +125,15 @@ def get_llm_aided_config():
else: else:
return llm_aided_config return llm_aided_config
def get_latex_delimiter_config():
    """Return the 'latex-delimiter-config' entry of the loaded config.

    The config is obtained from read_config(). When the entry is absent
    (or explicitly None), a warning is logged and None is returned so the
    caller can apply its own defaults.
    """
    delimiter_section = read_config().get('latex-delimiter-config')
    if delimiter_section is not None:
        return delimiter_section

    logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
    return None
if __name__ == '__main__': if __name__ == '__main__':
ak, sk, endpoint = get_s3_config('llm-raw') ak, sk, endpoint = get_s3_config('llm-raw')
...@@ -342,7 +342,10 @@ REPLACEMENTS_PATTERNS = { ...@@ -342,7 +342,10 @@ REPLACEMENTS_PATTERNS = {
re.compile(r'\\Tilde'): r'\\tilde', re.compile(r'\\Tilde'): r'\\tilde',
re.compile(r'\\slash'): r'/', re.compile(r'\\slash'): r'/',
re.compile(r'\\textperthousand'): r'‰', re.compile(r'\\textperthousand'): r'‰',
re.compile(r'\\sun'): r'☉' re.compile(r'\\sun'): r'☉',
re.compile(r'\\textunderscore'): r'\\_',
re.compile(r'\\fint'): r'⨏',
re.compile(r'\\up '): r'\\ ',
} }
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)') QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
......
...@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0 ...@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
tables_inside = [j for j in range(len(table_res_list)) tables_inside = [j for j in range(len(table_res_list))
if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)] if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
# Continue if there are at least 2 tables inside # Continue if there are at least 3 tables inside
if len(tables_inside) >= 2: if len(tables_inside) >= 3:
# Check if inside tables overlap with each other # Check if inside tables overlap with each other
tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]]) tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
for idx1 in range(len(tables_inside)) for idx1 in range(len(tables_inside))
......
...@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table ...@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
return md_content, txt_content, archive_zip_path, new_pdf_path return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True}, latex_delimiters = [
{'left': '$', 'right': '$', 'display': False}] {'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},
{'left': '\\(', 'right': '\\)', 'display': False},
{'left': '\\[', 'right': '\\]', 'display': True},
]
def init_model(): def init_model():
...@@ -218,7 +222,8 @@ if __name__ == '__main__': ...@@ -218,7 +222,8 @@ if __name__ == '__main__':
with gr.Tabs(): with gr.Tabs():
with gr.Tab('Markdown rendering'): with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True, md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True) latex_delimiters=latex_delimiters,
line_breaks=True)
with gr.Tab('Markdown text'): with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True) md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show) file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
......
...@@ -10,6 +10,6 @@ scikit-learn>=1.0.2 ...@@ -10,6 +10,6 @@ scikit-learn>=1.0.2
torch>=2.2.2,!=2.5.0,!=2.5.1 torch>=2.2.2,!=2.5.0,!=2.5.1
torchvision torchvision
transformers>=4.49.0,!=4.51.0,<5.0.0 transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six>=20250416 pdfminer.six==20250324
tqdm>=4.67.1 tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator. # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment