from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType

# Prefix prepended to a span's ``image_path`` when an image link is emitted.
_DEFAULT_IMG_URL_PREFIX = "s3://mllm-raw-media/pdf2md_img/"

# Two trailing spaces are what Markdown treats as a hard line break; a single
# trailing space is ignored by renderers.
_HARD_LINE_BREAK = "  "


def _render_text_span(span: dict) -> str:
    """Return the markdown for a span that carries non-empty ``content``.

    The content is passed through ``ocr_escape_special_markdown_char`` and
    then, depending on ``span['type']``, wrapped in inline (``$...$``) or
    interline (``$$`` on separate lines) equation delimiters.
    """
    # NOTE(review): escaping runs before the equation delimiters are added, so
    # equation content also goes through the markdown escaper -- confirm this
    # is intended for LaTeX source.
    content = ocr_escape_special_markdown_char(span['content'])
    span_type = span['type']
    if span_type == ContentType.InlineEquation:
        return f"${content}$"
    if span_type == ContentType.InterlineEquation:
        return f"$$\n{content}\n$$"
    return content


def _mk_markdown(pdf_info_dict: dict, with_images: bool,
                 img_url_prefix: str = _DEFAULT_IMG_URL_PREFIX) -> str:
    """Shared traversal used by both public markdown builders.

    Walks every page's ``"preproc_blocks"`` -> ``'lines'`` -> ``'spans'`` and
    emits one markdown line per source line.  Spans without ``content`` are
    skipped unless ``with_images`` is true and the span has an ``image_path``,
    in which case an image link is emitted instead.
    """
    rendered_lines = []
    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                parts = []
                for span in line['spans']:
                    if span.get('content'):
                        parts.append(_render_text_span(span))
                    elif with_images and span.get('image_path'):
                        parts.append(f"![]({img_url_prefix}{span['image_path']})")
                # Spans are separated by a single space; the line is ended
                # with two spaces to force a line break in the rendered output.
                rendered_lines.append(' '.join(parts).strip() + _HARD_LINE_BREAK)
    return '\n'.join(rendered_lines)


def ocr_mk_nlp_markdown(pdf_info_dict: dict) -> str:
    """Build text-only markdown from the per-page OCR result.

    Args:
        pdf_info_dict: Mapping whose values are page dicts.  Each page may
            hold a ``"preproc_blocks"`` list; pages without it (or with an
            empty list) are skipped.

    Returns:
        The markdown lines joined with ``'\\n'``.  Spans without ``content``
        (e.g. images) are dropped.
    """
    return _mk_markdown(pdf_info_dict, with_images=False)


def ocr_mk_mm_markdown(pdf_info_dict: dict,
                       img_url_prefix: str = _DEFAULT_IMG_URL_PREFIX) -> str:
    """Build multimodal markdown (text plus image links) from the OCR result.

    Args:
        pdf_info_dict: Mapping whose values are page dicts.  Each page may
            hold a ``"preproc_blocks"`` list; pages without it (or with an
            empty list) are skipped.
        img_url_prefix: String prepended to each span's ``image_path`` when
            building the ``![](...)`` link.  Defaults to the previously
            hard-coded location.

    Returns:
        The markdown lines joined with ``'\\n'``.  Spans that have no
        ``content`` but do have an ``image_path`` become image links; spans
        with neither are dropped.
    """
    return _mk_markdown(pdf_info_dict, with_images=True,
                        img_url_prefix=img_url_prefix)


def ocr_mk_mm_standard_format():
    """Placeholder for the standard-format multimodal export (not implemented).

    Intended output, per the original note: a ``content_list`` whose entries
    carry a string ``type`` of image/text/table/equation, where interline
    equations are emitted as their own entries and inline equations are
    merged into the surrounding text.
    """
    pass