Unverified Commit 4bb54393 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1427 from opendatalab/release-1.0.0

Release 1.0.0
parents 04f084ac 1c9f9942
# Copyright (c) Opendatalab. All rights reserved.
import json
from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from openai import OpenAI
# TODO: some formulas end with a trailing "\", which escapes the "$" that is
# appended when the formula is wrapped for rendering; that case still needs a fix.
# Prompt for LLM-aided LaTeX formula repair. The literal "$FORMULA" token is
# presumably replaced with the raw recognition result by the (not yet
# implemented) caller — confirm once llm_aided_formula is filled in.
formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
1. 修正渲染或编译错误:
- Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
- 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误
2. 保留原始信息:
- 保留原始公式中的所有重要信息
- 不要添加任何原始公式中没有的新信息
IMPORTANT:请仅返回修正后的公式,不要包含任何介绍、解释或元数据。
LaTeX recognition result:
$FORMULA
Your corrected result:
"""
# Prompt for LLM-aided OCR text correction. This was previously declared as an
# f-string although it contains no placeholders; the redundant "f" prefix is
# dropped so that any future "{...}" text in the prompt cannot trigger
# accidental interpolation. The context/chunk sections are presumably appended
# by the caller at request time — confirm once llm_aided_text is implemented.
text_optimize_prompt = """请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
1. 修正OCR引起的拼写错误和错误:
- 修正常见的OCR错误(例如,'rn' 被误读为 'm')
- 使用上下文和常识进行修正
- 只修正明显的错误,不要不必要的修改内容
- 不要添加额外的句号或其他不必要的标点符号
2. 保持原始结构:
- 保留所有标题和子标题
3. 保留原始内容:
- 保留原始文本中的所有重要信息
- 不要添加任何原始文本中没有的新信息
- 保留段落之间的换行符
4. 保持连贯性:
- 确保内容与前文顺畅连接
- 适当处理在句子中间开始或结束的文本
5. 修正行内公式:
- 去除行内公式前后多余的空格
- 修正公式中的OCR错误
- 确保公式能够通过KaTeX渲染
6. 修正全角字符
- 修正全角标点符号为半角标点符号
- 修正全角字母为半角字母
- 修正全角数字为半角数字
IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。
Previous context:
Current chunk to process:
Corrected text:
"""
def llm_aided_formula(pdf_info_dict, formula_aided_config):
    """Placeholder for LLM-aided formula correction — not implemented yet.

    Expected (by analogy with llm_aided_title — TODO confirm) to walk the
    per-page blocks in *pdf_info_dict* and repair formulas using the model
    configured in *formula_aided_config*.
    """
    pass
def llm_aided_text(pdf_info_dict, text_aided_config):
    """Placeholder for LLM-aided OCR text correction — not implemented yet.

    Expected (by analogy with llm_aided_title — TODO confirm) to correct OCR
    text in *pdf_info_dict* using the model configured in *text_aided_config*.
    """
    pass
def llm_aided_title(pdf_info_dict, title_aided_config):
    """Assign hierarchy levels to title blocks via an OpenAI-compatible LLM.

    Collects every block of type "title" from *pdf_info_dict* (in document
    order), asks the configured model for an integer level per title, and
    writes the answer back into each block as ``block["level"]``. Any failure
    (malformed JSON from the model, bad values, count mismatch) is logged and
    leaves the blocks unchanged instead of crashing the pipeline.

    Args:
        pdf_info_dict: mapping of page id -> page dict containing "para_blocks".
        title_aided_config: dict with "api_key", "base_url" and "model" keys.
    """
    client = OpenAI(
        api_key=title_aided_config["api_key"],
        base_url=title_aided_config["base_url"],
    )
    # Keys are stringified running indices so the model can answer with a flat
    # {"0": level, "1": level, ...} JSON object.
    title_dict = {}
    origin_title_list = []
    i = 0
    for page in pdf_info_dict.values():
        for block in page["para_blocks"]:
            if block["type"] == "title":
                origin_title_list.append(block)
                title_dict[f"{i}"] = merge_para_with_text(block)
                i += 1
    title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:
1. 保留原始内容:
- 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
- 请务必保证输出的字典中元素的数量和输入的数量一致
2. 保持字典内key-value的对应关系不变
3. 优化层次结构:
- 为每个标题元素添加适当的层次结构
- 标题层级应具有连续性,不能跳过某一层级
- 标题层级最多为4级,不要添加过多的层级
- 优化后的标题为一个整数,代表该标题的层级
IMPORTANT:
请直接返回优化过的由标题层级组成的json,返回的json不需要格式化。
Input title list:
{title_dict}
Corrected title list:
"""
    completion = client.chat.completions.create(
        model=title_aided_config["model"],
        messages=[{'role': 'user', 'content': title_optimize_prompt}],
        temperature=0.7,
    )
    # The model may return something that is not valid JSON; previously this
    # raised out of the function. Fail soft: log and keep the original titles.
    try:
        json_completion = json.loads(completion.choices[0].message.content)
    except Exception as e:
        logger.exception(e)
        return
    if len(json_completion) == len(title_dict):
        try:
            for i, origin_title_block in enumerate(origin_title_list):
                origin_title_block["level"] = int(json_completion[str(i)])
        except Exception as e:
            logger.exception(e)
    else:
        logger.error("The number of titles in the optimized result is not equal to the number of titles in the input.")
......@@ -33,6 +33,14 @@ def remove_overlaps_low_confidence_spans(spans):
return spans, dropped_spans
def check_chars_is_overlap_in_span(chars):
    """Return True when any pair of char bboxes in the span overlaps heavily (IoU > 0.9)."""
    count = len(chars)
    return any(
        calculate_iou(chars[a]['bbox'], chars[b]['bbox']) > 0.9
        for a in range(count)
        for b in range(a + 1, count)
    )
def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些
......
......@@ -70,7 +70,7 @@ def _remove_overlap_between_bboxes(arr):
res[i] = None
else:
keeps[idx] = False
drop_reasons.append(drop_reasons)
drop_reasons.append(drop_reason)
if keeps[idx]:
res[idx] = v
return res, drop_reasons
......
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for storage reader/writers (e.g. local disk, S3)."""

    # I/O mode selectors shared by all concrete implementations.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Read and return the content stored at *path* (str or bytes per *mode*)."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Write *content* to *path* (text or binary per *mode*)."""
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Read up to *limit* bytes of *path* starting at byte *offset*."""
        raise NotImplementedError
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """Local-filesystem implementation of AbsReaderWriter.

    Relative paths are resolved against *parent_path*; absolute paths are
    used as-is.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        # Base directory used to resolve relative paths.
        self.path = parent_path
        # Text encoding used for MODE_TXT reads and writes.
        self.encoding = encoding

    def _resolve(self, path):
        """Return *path* unchanged if absolute, else joined to the base path."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a file as text (MODE_TXT) or bytes (MODE_BIN).

        Raises:
            Exception: if the file does not exist.
            ValueError: if *mode* is not a supported mode.
        """
        abspath = self._resolve(path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} not exists")
            # Message typo fixed ("no exists" -> "not exists").
            raise Exception(f"file {abspath} not exists")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write *content* to a file, creating parent directories as needed."""
        abspath = self._resolve(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok avoids the race in the old exists()+makedirs() pattern
            # when two writers create the same directory concurrently.
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to *limit* bytes (None = to EOF) starting at byte *offset*."""
        abspath = self._resolve(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            return f.read(limit)
if __name__ == "__main__":
    # Ad-hoc manual smoke tests; the branches are toggled by editing 0/1.
    if 0:
        file_path = "io/test/example.txt"
        # Raw string: "\p" in the old literal was an invalid escape sequence
        # (SyntaxWarning on modern Python); the path bytes are unchanged.
        drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")
        # Write content to the file.
        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
        # Read the content back from the file.
        content = drw.read(path=file_path)
        if content:
            logger.info(f"从 {file_path} 读取的内容: {content}")
    if 1:
        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        content_bin = drw.read_offset("1.txt")
        assert content_bin == b"ABCD!"
        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
        assert content_bin == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """S3 implementation of AbsReaderWriter backed by a boto3 client.

    Paths starting with "s3://" are treated as absolute; any other path is
    joined to *parent_path*.
    """

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        # boto3 S3 client configured with the given credentials/endpoint.
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        # Base s3:// prefix used to resolve relative keys.
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard-mode retries (max 5 attempts)."""
        s3_client = boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )
        return s3_client

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object and return it as text (MODE_TXT) or bytes (MODE_BIN)."""
        if s3_relative_path.startswith("s3://"):
            s3_path = s3_relative_path
        else:
            s3_path = join_path(self.path, s3_relative_path)
        bucket_name, key = parse_bucket_key(s3_path)
        res = self.client.get_object(Bucket=bucket_name, Key=key)
        body = res["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            data = body.decode(encoding)  # Decode bytes to text
        elif mode == AbsReaderWriter.MODE_BIN:
            data = body
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        return data

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload *content* (encoded for MODE_TXT, raw for MODE_BIN) to S3."""
        if s3_relative_path.startswith("s3://"):
            s3_path = s3_relative_path
        else:
            s3_path = join_path(self.path, s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)  # Encode text data as bytes
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Ranged GET: read up to *limit* bytes starting at byte *offset*.

        NOTE(review): *limit* is tested for truthiness, so limit=0 falls back
        to an open-ended range ("bytes=offset-") — confirm this is intended.
        """
        if path.startswith("s3://"):
            s3_path = path
        else:
            s3_path = join_path(self.path, path)
        bucket_name, key = parse_bucket_key(s3_path)
        range_header = (
            f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
        )
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
if __name__ == "__main__":
    # Ad-hoc manual smoke tests; the branches are toggled by editing 0/1.
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
        )
        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            # BUG FIX: previously re-sent text_data (a str) under MODE_BIN;
            # the binary payload was never exercised.
            binary_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
        # Range Read text data from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json
        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
import os
from pathlib import Path
import shutil
import tempfile
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
# File types accepted by the CLI. Office documents are converted to PDF via
# LibreOffice (convert_file_to_pdf) and images are wrapped into a PDF with
# fitz before parsing; plain PDFs are read as-is.
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']
@click.command()
......@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
'path',
type=click.Path(exists=True),
required=True,
help='local pdf filepath or directory',
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
'-o',
......@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_doc(doc_path: str):
def parse_doc(doc_path: Path):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
......@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
for doc_path in Path(path).glob('*.pdf'):
parse_doc(doc_path)
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
else:
parse_doc(path)
parse_doc(Path(path))
shutil.rmtree(temp_dir)
if __name__ == '__main__':
......
......@@ -9,8 +9,9 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult
from magic_pdf.operators.models import InferenceResult
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
......@@ -83,6 +84,7 @@ def do_parse(
f_make_md_mode=MakeMode.MM_MD,
f_draw_model_bbox=False,
f_draw_line_sort_bbox=False,
f_draw_char_bbox=False,
start_page_id=0,
end_page_id=None,
lang=None,
......@@ -94,9 +96,7 @@ def do_parse(
logger.warning('debug mode is on')
f_draw_model_bbox = True
f_draw_line_sort_bbox = True
if lang == '':
lang = None
# f_draw_char_bbox = True
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes, start_page_id, end_page_id
......@@ -109,7 +109,7 @@ def do_parse(
)
image_dir = str(os.path.basename(local_image_dir))
ds = PymuDocDataset(pdf_bytes)
ds = PymuDocDataset(pdf_bytes, lang=lang)
if len(model_list) == 0:
if model_config.__use_inside_model__:
......@@ -118,50 +118,50 @@ def do_parse(
infer_result = ds.apply(
doc_analyze,
ocr=False,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
else:
infer_result = ds.apply(
doc_analyze,
ocr=True,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
elif parse_method == 'txt':
infer_result = ds.apply(
doc_analyze,
ocr=False,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
elif parse_method == 'ocr':
infer_result = ds.apply(
doc_analyze,
ocr=True,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
else:
logger.error('unknown parse method')
......@@ -170,19 +170,26 @@ def do_parse(
logger.error('need model list input')
exit(2)
else:
infer_result = InferenceResult(model_list, ds)
if parse_method == 'ocr':
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
elif parse_method == 'txt':
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
else:
pipe_result = infer_result.pipe_auto_mode(
image_writer, debug_mode=True, lang=lang
)
if ds.classify() == SupportedPdfParseMethod.TXT:
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=ds._lang
)
else:
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=ds._lang
)
if f_draw_model_bbox:
infer_result.draw_model(
......@@ -201,6 +208,9 @@ def do_parse(
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
)
if f_draw_char_bbox:
draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
if f_dump_md:
pipe_result.dump_md(
md_writer,
......
"""用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
def parse_txt_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a text-based (digital-born) PDF and tag the result metadata."""
    result = parse_pdf_by_txt(
        dataset,
        model_list,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
        lang=lang,
    )

    # Stamp parse-type/version metadata onto the result dict.
    metadata = {
        '_parse_type': PARSE_TYPE_TXT,
        '_version_name': __version__,
    }
    if lang is not None:
        metadata['_lang'] = lang
    result.update(metadata)
    return result
def parse_ocr_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a scanned (OCR) PDF and tag the result metadata."""
    result = parse_pdf_by_ocr(
        dataset,
        model_list,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
        lang=lang,
    )

    # Stamp parse-type/version metadata onto the result dict.
    metadata = {
        '_parse_type': PARSE_TYPE_OCR,
        '_version_name': __version__,
    }
    if lang is not None:
        metadata['_lang'] = lang
    result.update(metadata)
    return result
def parse_union_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a mixed text/scanned PDF: try text extraction first, then fall
    back to OCR when the text pass errors out or marks itself droppable.

    Raises:
        Exception: when both the text pass and the OCR pass fail.
    """
    def parse_pdf(method):
        # Run one parse backend; returns None on any exception so the caller
        # can fall back instead of aborting.
        try:
            return method(
                dataset,
                model_list,
                imageWriter,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                debug_mode=is_debug,
                lang=lang,
            )
        except Exception as e:
            logger.exception(e)
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
    if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
        logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
        if len(model_list) == 0:
            # No precomputed model results available: run OCR inference now.
            layout_model = kwargs.get('layout_model', None)
            formula_enable = kwargs.get('formula_enable', None)
            table_enable = kwargs.get('table_enable', None)
            infer_res = doc_analyze(
                dataset,
                ocr=True,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
                layout_model=layout_model,
                formula_enable=formula_enable,
                table_enable=table_enable,
            )
            # Rebinding model_list matters: parse_pdf is a closure and reads
            # model_list at call time, so the OCR pass below sees the freshly
            # inferred results.
            model_list = infer_res.get_infer_res()
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
        else:
            pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
    else:
        pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT

    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang
    return pdf_info_dict
import os
import subprocess
from pathlib import Path
class ConvertToPdfError(Exception):
    """Raised when the external LibreOffice conversion to PDF fails."""

    def __init__(self, msg):
        super().__init__(msg)
        # Keep the message accessible as an attribute for callers.
        self.msg = msg
def convert_file_to_pdf(input_path, output_dir):
    """Convert a document to PDF in *output_dir* via headless LibreOffice.

    Raises:
        FileNotFoundError: if *input_path* does not exist.
        ConvertToPdfError: if the soffice process exits non-zero.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    command = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path),
    ]
    # capture_output=True is shorthand for piping both stdout and stderr.
    result = subprocess.run(command, capture_output=True)
    if result.returncode != 0:
        raise ConvertToPdfError(result.stderr.decode())
This diff is collapsed.
This diff is collapsed.
......@@ -4,8 +4,11 @@ Glossary
===========
1. jsonl
TODO: add description
Newline-delimited (``\n``): each line must be a valid, independent JSON object.
Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**.
2. magic-pdf.json
TODO
2. magic-pdf.json
TODO: add description
......@@ -2,7 +2,7 @@
Model Api
==========
.. autoclass:: magic_pdf.model.InferenceResultBase
.. autoclass:: magic_pdf.operators.InferenceResultBase
:members:
:inherited-members:
:show-inheritance:
......@@ -3,7 +3,7 @@
Pipeline Api
=============
.. autoclass:: magic_pdf.pipe.operators.PipeResult
.. autoclass:: magic_pdf.operators.pipes.PipeResult
:members:
:inherited-members:
:show-inheritance:
\ No newline at end of file
:show-inheritance:
......@@ -70,6 +70,12 @@ Key Features
- Supports both CPU and GPU environments.
- Compatible with Windows, Linux, and Mac platforms.
.. tip::
Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
User Guide
-------------
.. toctree::
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment