Commit 100e9c17 authored by myhloli's avatar myhloli
Browse files

feat(latex): enhance LaTeX delimiter support and configurability

- Add support for \(\) and \[\] delimiters in addition to $ and $$
- Make LaTeX delimiter configuration more flexible and user-defined
- Update configuration file to include LaTeX delimiter settings
- Modify OCR content generation to use configurable delimiters
parent 98dd1790
...@@ -20,6 +20,16 @@ ...@@ -20,6 +20,16 @@
"enable": true, "enable": true,
"max_time": 400 "max_time": 400
}, },
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": { "llm-aided-config": {
"formula_aided": { "formula_aided": {
"api_key": "your_api_key", "api_key": "your_api_key",
...@@ -40,5 +50,5 @@ ...@@ -40,5 +50,5 @@
"enable": false "enable": false
} }
}, },
"config_version": "1.2.0" "config_version": "1.2.1"
} }
\ No newline at end of file
...@@ -5,6 +5,7 @@ from loguru import logger ...@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.config_reader import get_latex_delimiter_config
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.post_proc.para_split_v3 import ListLineTag from magic_pdf.post_proc.para_split_v3 import ListLineTag
...@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str: ...@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
result.append(char) result.append(char)
return ''.join(result) return ''.join(result)
# Resolve the LaTeX delimiters once, at import time, from the user config.
# get_latex_delimiter_config() returns None when the config file has no
# 'latex-delimiter-config' section; in that case the defaults below are used.
latex_delimiters_config = get_latex_delimiter_config()
default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'},
}

# Merge the user config over the defaults key by key, so that a partial
# config (e.g. only 'display', or a section missing 'right') falls back to
# the default for the missing pieces instead of raising KeyError on import.
_user_delimiters = latex_delimiters_config if isinstance(latex_delimiters_config, dict) else {}
delimiters = {}
for _kind, _fallback in default_delimiters.items():
    _section = _user_delimiters.get(_kind)
    if not isinstance(_section, dict):
        _section = {}
    delimiters[_kind] = {
        _side: _section.get(_side, _fallback[_side]) for _side in _fallback
    }

display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
block_text = '' block_text = ''
...@@ -168,9 +182,9 @@ def merge_para_with_text(para_block): ...@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = ocr_escape_special_markdown_char(span['content']) content = ocr_escape_special_markdown_char(span['content'])
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$" content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
content = content.strip() content = content.strip()
......
...@@ -125,6 +125,15 @@ def get_llm_aided_config(): ...@@ -125,6 +125,15 @@ def get_llm_aided_config():
else: else:
return llm_aided_config return llm_aided_config
def get_latex_delimiter_config():
    """Return the 'latex-delimiter-config' section of the config, or None.

    The section is looked up in the mapping returned by read_config().
    When the key is absent (or its value is None) a warning is logged and
    None is returned, leaving the choice of fallback to the caller.
    """
    section = read_config().get('latex-delimiter-config')
    if section is not None:
        return section

    logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
    return None
if __name__ == '__main__': if __name__ == '__main__':
ak, sk, endpoint = get_s3_config('llm-raw') ak, sk, endpoint = get_s3_config('llm-raw')
...@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table ...@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
return md_content, txt_content, archive_zip_path, new_pdf_path return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True}, latex_delimiters = [
{'left': '$', 'right': '$', 'display': False}] {'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},
{'left': '\\(', 'right': '\\)', 'display': False},
{'left': '\\[', 'right': '\\]', 'display': True},
]
def init_model(): def init_model():
...@@ -218,7 +222,8 @@ if __name__ == '__main__': ...@@ -218,7 +222,8 @@ if __name__ == '__main__':
with gr.Tabs(): with gr.Tabs():
with gr.Tab('Markdown rendering'): with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True, md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True) latex_delimiters=latex_delimiters,
line_breaks=True)
with gr.Tab('Markdown text'): with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True) md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show) file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment