Convert to Markdown
===================

Local File Example
^^^^^^^^^^^^^^^^^^
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    name_without_suff = pdf_file_name.split(".")[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

Object Storage Example
^^^^^^^^^^^^^^^^^^^^^^
.. code:: python

    import os

    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
    ak = "{Your S3 access key}"  # replace with real s3 access key
    sk = "{Your S3 secret key}"  # replace with real s3 secret key
    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url

    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
    image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)

    # args
    pdf_file_name = (
        "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
    )

    # prepare env
    local_dir = "output"
    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]

    # read bytes
    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local

    ### dump markdown
    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

    ### dump content list
    pipe_result.dump_content_list(writer, f"{name_without_suff}_content_list.json", "unittest/tmp/images")  # dump to remote s3

Go to :doc:`../data/data_reader_writer` for more **read/write** examples.
Tutorial
========
Let's learn MinerU by building a minimal project.

.. toctree::
    :maxdepth: 1
    :caption: Tutorial

    tutorial/output_file_description
    tutorial/pipeline

Output File Formats
===================
Besides the markdown-related files, the ``magic-pdf`` command also
generates several files unrelated to markdown. These files are
described one by one below.
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
The layout of each page consists of one or more boxes. The number at
the top-left corner of each box indicates its reading order. In
addition, layout.pdf highlights different content blocks with
different background colors.

.. figure:: ../../_static/image/layout_example.png
    :alt: layout page example

    layout page example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on a page are outlined with frames of different colors
according to the span type. This file can be used for quality
inspection, making it easy to spot problems such as missing text or
unrecognized interline equations.

.. figure:: ../../_static/image/spans_example.png
    :alt: span page example

    span page example
some_pdf_model.json
~~~~~~~~~~~~~~~~~~~
Structure Definition
^^^^^^^^^^^^^^^^^^^^
.. code:: python

    from pydantic import BaseModel, Field
    from enum import IntEnum

    class CategoryType(IntEnum):
        title = 0               # title
        plain_text = 1          # text
        abandon = 2             # headers, footers, page numbers, and page annotations
        figure = 3              # image
        figure_caption = 4      # image caption
        table = 5               # table
        table_caption = 6       # table caption
        table_footnote = 7      # table footnote
        isolate_formula = 8     # interline formula
        formula_caption = 9     # interline formula label
        embedding = 13          # inline formula
        isolated = 14           # interline formula
        text = 15               # ocr recognition result

    class PageInfo(BaseModel):
        page_no: int = Field(description="page number; the first page is 0", ge=0)
        height: int = Field(description="page height", gt=0)
        width: int = Field(description="page width", ge=0)

    class ObjectInferenceResult(BaseModel):
        category_id: CategoryType = Field(description="category", ge=0)
        poly: list[float] = Field(description="quadrilateral coordinates: top-left, top-right, bottom-right, bottom-left")
        score: float = Field(description="confidence of the inference result")
        latex: str | None = Field(description="latex parsing result", default=None)
        html: str | None = Field(description="html parsing result", default=None)

    class PageInferenceResults(BaseModel):
        layout_dets: list[ObjectInferenceResult] = Field(description="page recognition results", ge=0)
        page_info: PageInfo = Field(description="page meta information")

    # The inference results of all pages, ordered by page number, form the
    # MinerU inference result.
    inference_result: list[PageInferenceResults] = []

The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
giving the top-left, top-right, bottom-right, and bottom-left corners in
order. |poly coordinate diagram|
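For example, converting a ``poly`` into an axis-aligned bounding box
takes only a few lines. This is a minimal sketch; the helper name
``poly_to_bbox`` is ours and not part of ``magic_pdf``:

.. code:: python

    def poly_to_bbox(poly: list[float]) -> list[float]:
        """Convert [x0, y0, x1, y1, x2, y2, x3, y3] to [xmin, ymin, xmax, ymax]."""
        xs, ys = poly[0::2], poly[1::2]
        return [min(xs), min(ys), max(xs), max(ys)]

    # first detection of the sample data below (rounded):
    poly_to_bbox([99.19, 100.31, 730.37, 100.31, 730.37, 245.81, 99.19, 245.81])
    # -> [99.19, 100.31, 730.37, 245.81]
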
Sample Data
^^^^^^^^^^^
.. code:: json

    [
        {
            "layout_dets": [
                {
                    "category_id": 2,
                    "poly": [
                        99.1906967163086,
                        100.3119125366211,
                        730.3707885742188,
                        100.3119125366211,
                        730.3707885742188,
                        245.81326293945312,
                        99.1906967163086,
                        245.81326293945312
                    ],
                    "score": 0.9999997615814209
                }
            ],
            "page_info": {
                "page_no": 0,
                "height": 2339,
                "width": 1654
            }
        },
        {
            "layout_dets": [
                {
                    "category_id": 5,
                    "poly": [
                        99.13092803955078,
                        2210.680419921875,
                        497.3183898925781,
                        2210.680419921875,
                        497.3183898925781,
                        2264.78076171875,
                        99.13092803955078,
                        2264.78076171875
                    ],
                    "score": 0.9999997019767761
                }
            ],
            "page_info": {
                "page_no": 1,
                "height": 2339,
                "width": 1654
            }
        }
    ]

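Assuming the pydantic models from the structure definition above are in
scope (and pydantic v2, which provides ``model_validate``), a minimal
sketch for loading and validating such a file could look like this:

.. code:: python

    import json

    with open("some_pdf_model.json", encoding="utf-8") as f:
        raw = json.load(f)

    # validate each page against the PageInferenceResults model defined above
    pages = [PageInferenceResults.model_validate(page) for page in raw]
    for page in pages:
        print(page.page_info.page_no, len(page.layout_dets))
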
some_pdf_middle.json
~~~~~~~~~~~~~~~~~~~~

+--------------------+----------------------------------------------------------+
| Field              | Description                                              |
+====================+==========================================================+
| pdf_info           | list; each element is a dict holding the parse result    |
|                    | of one pdf page; see the table below                     |
+--------------------+----------------------------------------------------------+
| \_parse_type       | ocr \| txt; the mode used for this parse                 |
+--------------------+----------------------------------------------------------+
| \_version_name     | string; the magic-pdf version used for this parse        |
+--------------------+----------------------------------------------------------+
**pdf_info** field structure

+---------------------+-------------------------------------------------------+
| Field               | Description                                           |
+=====================+=======================================================+
| preproc_blocks      | intermediate, unsegmented result of pdf preprocessing |
+---------------------+-------------------------------------------------------+
| layout_bboxes       | layout segmentation results, including the layout     |
|                     | direction (vertical, horizontal) and bbox, sorted in  |
|                     | reading order                                         |
+---------------------+-------------------------------------------------------+
| page_idx            | page index, starting from 0                           |
+---------------------+-------------------------------------------------------+
| page_size           | width and height of the page                          |
+---------------------+-------------------------------------------------------+
| \_layout_tree       | layout tree structure                                 |
+---------------------+-------------------------------------------------------+
| images              | list; each element is a dict representing an          |
|                     | img_block                                             |
+---------------------+-------------------------------------------------------+
| tables              | list; each element is a dict representing a           |
|                     | table_block                                           |
+---------------------+-------------------------------------------------------+
| interline_equations | list; each element is a dict representing an          |
|                     | interline_equation_block                              |
+---------------------+-------------------------------------------------------+
| discarded_blocks    | list; block information the model decided to drop     |
+---------------------+-------------------------------------------------------+
| para_blocks         | the result of segmenting preproc_blocks into          |
|                     | paragraphs                                            |
+---------------------+-------------------------------------------------------+
In the table above, ``para_blocks`` is an array of dicts; each dict is a
block structure. A block supports at most one level of nesting.

**block**

The outer block is called a first-level block. Its fields are:
====== ==================================================
Field  Description
====== ==================================================
type   block type (table|image)
bbox   block bounding box coordinates
blocks list; each element is a dict, a second-level block
====== ==================================================
A first-level block has only the types "table" and "image"; all other
blocks are second-level blocks.

The fields of a second-level block are:
+----------+----------------------------------------------------------------+
| Field    | Description                                                    |
+==========+================================================================+
| type     | block type                                                     |
+----------+----------------------------------------------------------------+
| bbox     | block bounding box coordinates                                 |
+----------+----------------------------------------------------------------+
| lines    | list; each element is a dict representing a line, which        |
|          | describes the composition of one line of content               |
+----------+----------------------------------------------------------------+
The second-level block types in detail:
================== ========================
type               desc
================== ========================
image_body         image body
image_caption      image caption text
image_footnote     image footnote
table_body         table body
table_caption      table caption text
table_footnote     table footnote
text               text block
title              title block
index              index block
list               list block
interline_equation interline equation block
================== ========================
**line**

The fields of a line are:
+-----------+-----------------------------------------------------------------+
| Field     | Description                                                     |
+===========+=================================================================+
| bbox      | bounding box coordinates of the line                            |
+-----------+-----------------------------------------------------------------+
| spans     | list; each element is a dict representing a span, the           |
|           | smallest unit of content                                        |
+-----------+-----------------------------------------------------------------+
**span**

+------------+---------------------------------------------------------+
| Field      | Description                                             |
+============+=========================================================+
| bbox       | bounding box coordinates of the span                    |
+------------+---------------------------------------------------------+
| type       | span type                                               |
+------------+---------------------------------------------------------+
| content \| | text spans use content, chart spans use img_path, to    |
| img_path   | store the actual text or the path of the screenshot     |
+------------+---------------------------------------------------------+
The span types are:

================== ==================
type               desc
================== ==================
image              image
table              table
text               text
inline_equation    inline equation
interline_equation interline equation
================== ==================
**Summary**

A span is the smallest storage unit of all elements.

The elements stored in para_blocks are block-level information.

The block structure is:

first-level block (if any) -> second-level block -> line -> span
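Given that structure, extracting all text from ``para_blocks`` is a
straightforward traversal. The following is a minimal sketch; the helper
is ours, not part of ``magic_pdf``:

.. code:: python

    import json

    def collect_text(middle_json_path: str) -> str:
        with open(middle_json_path, encoding="utf-8") as f:
            middle = json.load(f)
        parts = []
        for page in middle["pdf_info"]:
            for block in page["para_blocks"]:
                # a first-level block (table|image) nests second-level blocks
                subs = block["blocks"] if "blocks" in block else [block]
                for sub in subs:
                    for line in sub.get("lines", []):
                        for span in line["spans"]:
                            if "content" in span:
                                parts.append(span["content"])
        return " ".join(parts)
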
.. _示例数据-1:
Sample Data
^^^^^^^^^^^
.. code:: json

    {
        "pdf_info": [
            {
                "preproc_blocks": [
                    {
                        "type": "text",
                        "bbox": [52, 61.956024169921875, 294, 82.99800872802734],
                        "lines": [
                            {
                                "bbox": [52, 61.956024169921875, 294, 72.0000228881836],
                                "spans": [
                                    {
                                        "bbox": [54.0, 61.956024169921875, 296.2261657714844, 72.0000228881836],
                                        "content": "dependent on the service headway and the reliability of the departure ",
                                        "type": "text",
                                        "score": 1.0
                                    }
                                ]
                            }
                        ]
                    }
                ],
                "layout_bboxes": [
                    {
                        "layout_bbox": [52, 61, 294, 731],
                        "layout_label": "V",
                        "sub_layout": []
                    }
                ],
                "page_idx": 0,
                "page_size": [612.0, 792.0],
                "_layout_tree": [],
                "images": [],
                "tables": [],
                "interline_equations": [],
                "discarded_blocks": [],
                "para_blocks": [
                    {
                        "type": "text",
                        "bbox": [52, 61.956024169921875, 294, 82.99800872802734],
                        "lines": [
                            {
                                "bbox": [52, 61.956024169921875, 294, 72.0000228881836],
                                "spans": [
                                    {
                                        "bbox": [54.0, 61.956024169921875, 296.2261657714844, 72.0000228881836],
                                        "content": "dependent on the service headway and the reliability of the departure ",
                                        "type": "text",
                                        "score": 1.0
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ],
        "_parse_type": "txt",
        "_version_name": "0.6.1"
    }

.. |poly coordinate diagram| image:: ../../_static/image/poly.png

Pipeline
========
Minimal Example
^^^^^^^^^^^^^^^
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    name_without_suff = pdf_file_name.split(".")[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)

Running the code above produces the following result:
.. code:: bash

    output/
    ├── abc.md
    └── images

Leaving aside environment setup, such as creating directories and importing dependencies, the code that actually converts the ``pdf`` to ``markdown`` is the following snippet:
.. code::

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)

``ds.apply(doc_analyze, ocr=True)`` produces an ``InferenceResult`` object. Calling ``pipe_ocr_mode`` on the ``InferenceResult`` produces a ``PipeResult`` object, and calling ``dump_md`` on the ``PipeResult`` writes a ``markdown`` file to the specified location.

The execution of the pipeline is illustrated in the figure below:

.. image:: ../../_static/image/pipeline.drawio.svg
.. raw:: html

    <br/>

The process is currently divided into three stages: data, inference, and post-processing, corresponding to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the figure. They are chained together through methods such as ``apply``, ``doc_analyze``, and ``pipe_ocr_mode``.

.. admonition:: Tip
    :class: tip

    For more usage examples of Dataset, InferenceResult, and PipeResult, see :doc:`../quick_start/to_markdown`.

    For more details about Dataset, InferenceResult, and PipeResult, see the English MinerU documentation.

Composing Pipelines
^^^^^^^^^^^^^^^^^^^
.. code:: python

    class Dataset(ABC):
        @abstractmethod
        def apply(self, proc: Callable, *args, **kwargs):
            """Apply the callable ``proc``.

            Args:
                proc (Callable): invoked as proc(self, *args, **kwargs)

            Returns:
                Any: the result generated by proc
            """
            pass

    class InferenceResult(InferenceResultBase):
        def apply(self, proc: Callable, *args, **kwargs):
            """Apply the callable ``proc``.

            Args:
                proc (Callable): invoked as proc(inference_result, *args, **kwargs)

            Returns:
                Any: the result generated by proc
            """
            return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

        def pipe_ocr_mode(
            self,
            imageWriter: DataWriter,
            start_page_id=0,
            end_page_id=None,
            debug_mode=False,
            lang=None,
        ) -> PipeResult:
            pass

    class PipeResult:
        def apply(self, proc: Callable, *args, **kwargs):
            """Apply the callable ``proc``.

            Args:
                proc (Callable): invoked as proc(pipeline_result, *args, **kwargs)

            Returns:
                Any: the result generated by proc
            """
            return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)

The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all provide an ``apply`` method, which can be used to compose computations from different stages. As shown below, ``MinerU`` ships one such composition of these classes:
.. code:: python

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)

Users can implement their own composition functions as needed. For example, here the ``apply`` method is used to count the pages of a ``pdf`` file:
.. code:: python

    from magic_pdf.data.data_reader_writer import FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    def count_page(ds) -> int:
        return len(ds)

    print("page number: ", ds.apply(count_page))  # will output the page count of `abc.pdf`

## Project List

- [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
- ~~[web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version~~ (Deprecated)
- Projects compatible with version 2.0:
  - [gradio_app](./gradio_app/README.md): Web application based on Gradio
- Projects not yet compatible with version 2.0:
  - [web_api](./web_api/README.md): Web API based on FastAPI
  - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
## Project List

- [llama_index_rag](./llama_index_rag/README_zh-CN.md): Build a lightweight RAG system based on llama_index
- ~~[web_demo](./web_demo/README_zh-CN.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version~~ (Deprecated)
- Projects compatible with version 2.0:
  - [gradio_app](./gradio_app/README_zh-CN.md): Web application based on Gradio
- Projects not yet compatible with version 2.0:
  - [web_api](./web_api/README.md): Web API based on FastAPI
  - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
## Installation
MinerU
```bash
git clone https://github.com/opendatalab/MinerU.git
cd MinerU
conda create -n MinerU python=3.10
conda activate MinerU
pip install .[full] --extra-index-url https://wheels.myhloli.com
```
Third-party software
```bash
# install
pip install llama-index-vector-stores-elasticsearch==0.2.0
pip install llama-index-embeddings-dashscope==0.2.0
pip install llama-index-core==0.10.68
pip install einops==0.7.0
pip install transformers-stream-generator==0.0.5
pip install accelerate==0.33.0
# uninstall
pip uninstall transformer-engine
```
## Environment Configuration
```
export DASHSCOPE_API_KEY={some_key}
export ES_USER={some_es_user}
export ES_PASSWORD={some_es_password}
export ES_URL=http://{es_url}:9200
```
For instructions on obtaining a DASHSCOPE_API_KEY, refer to the [documentation](https://help.aliyun.com/zh/dashscope/opening-service).
## Usage
### Data Ingestion
```bash
python data_ingestion.py -p some.pdf  # load data from a single pdf
# or
python data_ingestion.py -p /opt/data/some_pdf_directory/  # load data from all pdfs under the directory {some_pdf_directory}
```
### Query
```bash
python query.py --question '{the_question_you_want_to_ask}'
```
## Example
````bash
# Start the es service
docker compose up -d
# or
docker-compose up -d
# Set environment variables
export ES_USER=elastic
export ES_PASSWORD=llama_index
export ES_URL=http://127.0.0.1:9200
export DASHSCOPE_API_KEY={some_key}
# Ingest data
python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.pdf
# Ask a question
python query.py -q 'how about the rights of men'
## outputs
Please answer the question based on the content within ```:
```
I. Men are born, and always continue, free and equal in respect of their rights. Civil distinctions, therefore, can be founded only on public utility.
```
My question is:how about the rights of men。
question: how about the rights of men
answer: The statement implies that men are born free and equal in terms of their rights. Civil distinctions should only be based on public utility. However, it does not specify what those rights are. It is up to society and individual countries to determine and protect the specific rights of their citizens.
````
## Development
`MinerU` provides a `RAG` integration interface, allowing users to specify a single input `pdf` file or a directory. `MinerU` will automatically parse the input files and return an iterable interface for retrieving the data.
### API Interface
```python
from magic_pdf.integrations.rag.type import Node


class RagPageReader:
    def get_rel_map(self) -> list[ElementRelation]:
        # Retrieve the relationships between nodes
        pass

    ...


class RagDocumentReader:
    ...


class DataReader:
    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        pass

    def get_documents_count(self) -> int:
        """Get the number of pdf documents."""
        pass

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """Retrieve the parsed content of a specific pdf."""
        pass

    def get_document_filename(self, idx: int) -> Path:
        """Retrieve the path of a specific pdf."""
        pass
```
Type Definitions
```python
class Node(BaseModel):
    category_type: CategoryType = Field(description='Category')
    text: str | None = Field(description='Text content', default=None)
    image_path: str | None = Field(description='Path to image or table (table may be stored as an image)', default=None)
    anno_id: int = Field(description='Unique ID', default=-1)
    latex: str | None = Field(description='LaTeX output for equations or tables', default=None)
    html: str | None = Field(description='HTML output for tables', default=None)
```
Tables can be stored in one of three formats: image, LaTeX, or HTML.
`anno_id` is a globally unique ID for each Node. It can be used later to match this Node with other Nodes. The relationships between nodes can be retrieved using the `get_rel_map` method. Users can use `anno_id` to link nodes and construct a RAG index that includes node relationships.
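
For instance, a minimal sketch for walking the parsed nodes and their relationships might look like the following; the paths and the `'ocr'` method argument are illustrative:

```python
from magic_pdf.integrations.rag.api import DataReader

reader = DataReader('some.pdf', 'ocr', '/tmp/magic_pdf/rag_demo')  # illustrative paths

for idx in range(reader.get_documents_count()):
    doc = reader.get_document_result(idx)
    if doc is None:  # parsing failed for this pdf
        continue
    for page in iter(doc):
        # relationships (e.g. caption <-> body) between the nodes of this page
        for rel in page.get_rel_map():
            print(rel)
        for node in iter(page):
            print(node.anno_id, node.category_type, node.text)
```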
### Node Relationship Matrix
| | image_body | table_body |
| -------------- | ---------- | ---------- |
| image_caption | sibling | |
| table_caption | | sibling |
| table_footnote | | sibling |
<details open="open">
  <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
  <ol>
    <li><a href="#introduction">Introduction</a></li>
    <li><a href="#installation">Installation</a></li>
    <li><a href="#example">Example</a></li>
    <li><a href="#development">Development</a></li>
  </ol>
</details>
## Introduction

`MinerU` provides a data `API` that lets users import data into a `RAG` system. This project demonstrates how to build a lightweight `RAG` system based on Tongyi Qianwen (Qwen).

<p align="center">
  <img src="rag_data_api.png" width="300px" style="vertical-align:middle;">
</p>
## Installation

Environment requirements:
```text
NVIDIA A100 80GB,
Centos 7 3.10.0-957.el7.x86_64
Client: Docker Engine - Community
Version: 24.0.5
API version: 1.43
Go version: go1.20.6
Git commit: ced0996
Built: Fri Jul 21 20:39:02 2023
OS/Arch: linux/amd64
Context: default
Server: Docker Engine - Community
Engine:
Version: 24.0.5
API version: 1.43 (minimum version 1.12)
Go version: go1.20.6
Git commit: a61e2b4
Built: Fri Jul 21 20:38:05 2023
OS/Arch: linux/amd64
Experimental: false
containerd:
Version: 1.6.25
GitCommit: d8f198a4ed8892c764191ef7b3b06d8a2eeb5c7f
runc:
Version: 1.1.10
GitCommit: v1.1.10-0-g18a0cb0
docker-init:
Version: 0.19.0
GitCommit: de40ad0
```
Refer to the [documentation](../../README_zh-CN.md) to install MinerU.

Third-party software:
```bash
# install
pip install modelscope==1.14.0
pip install llama-index-vector-stores-elasticsearch==0.2.0
pip install llama-index-embeddings-dashscope==0.2.0
pip install llama-index-core==0.10.68
pip install einops==0.7.0
pip install transformers-stream-generator==0.0.5
pip install accelerate==0.33.0
# uninstall
pip uninstall transformer-engine
```
## Example

````bash
cd projects/llama_index_rag

# Start the es service
docker compose up -d
# or
docker-compose up -d

# Set environment variables
export ES_USER=elastic
export ES_PASSWORD=llama_index
export ES_URL=http://127.0.0.1:9200
export DASHSCOPE_API_KEY={some_key}
# For instructions on obtaining a DASHSCOPE_API_KEY, see
# https://help.aliyun.com/zh/dashscope/opening-service

# Query before ingesting any data: Qwen returns its default answer
python query.py -q 'how about the rights of men'
## outputs
question: how about the rights of men
answer: The topic of men's rights often refers to discussions around legal, social, and political issues that affect men specifically or differently from women. Movements related to men's rights advocate for addressing areas where men face discrimination or unique challenges, such as:
Child Custody: Ensuring that men have equal opportunities for custody of their children following divorce or separation.
Domestic Violence: Recognizing that men can also be victims of domestic abuse and ensuring they have access to support services.
Mental Health and Suicide Rates: Addressing the higher rates of suicide among men and providing mental health resources.
Military Conscription: In some countries, only men are required to register for military service, which is seen as a gender-based obligation.
Workplace Safety: Historically, more men than women have been employed in high-risk occupations, leading to higher workplace injury and death rates.
Parental Leave: Advocating for paternity leave policies that allow men to take time off work for family care.
Men's rights activism often intersects with broader discussions on gender equality and aims to promote fairness and equity across genders. It's important to note that while advocating for these issues, it should be done in a way that does not detract from or oppose the goals of gender equality and the rights of other groups. The focus should be on creating a fair society where everyone has equal opportunities and protections under the law.
# Ingest data
python data_ingestion.py -p example/data/
# or
python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.pdf

# Query after ingesting the data: Qwen now answers based on the context retrieved by the RAG system
python query.py -q 'how about the rights of men'

## outputs
Please answer the question based on the content within ```:
```
I. Men are born, and always continue, free and equal in respect of their rights. Civil distinctions, therefore, can be founded only on public utility.
```
My question is: how about the rights of men。
question: how about the rights of men
answer: The statement implies that men are born free and equal in terms of their rights. Civil distinctions should only be based on public utility. However, it does not specify what those rights are. It is up to society and individual countries to determine and protect the specific rights of their citizens.
````
## Development

`MinerU` provides a `RAG` integration interface. Users can specify a single input `pdf` file or a directory; `MinerU` automatically parses the input files and returns an iterable interface for retrieving the data.
### API Interface

```python
from magic_pdf.integrations.rag.type import Node


class RagPageReader:
    def get_rel_map(self) -> list[ElementRelation]:
        # Retrieve the relationships between nodes
        pass

    ...


class RagDocumentReader:
    ...


class DataReader:
    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        pass

    def get_documents_count(self) -> int:
        """Get the number of pdf documents."""
        pass

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """Retrieve the parsed content of a specific pdf."""
        pass

    def get_document_filename(self, idx: int) -> Path:
        """Retrieve the path of a specific pdf."""
        pass
```
Type Definitions

```python
class Node(BaseModel):
    category_type: CategoryType = Field(description='Category')
    text: str | None = Field(description='Text content', default=None)
    image_path: str | None = Field(description='Path to image or table (table may be stored as an image)', default=None)
    anno_id: int = Field(description='Unique ID', default=-1)
    latex: str | None = Field(description='LaTeX output for equations or tables', default=None)
    html: str | None = Field(description='HTML output for tables', default=None)
```
Tables can be stored in one of three formats: image, LaTeX, or HTML.

`anno_id` is a globally unique ID for each Node. It can later be used to match this Node with other Nodes. The relationships between nodes can be retrieved via the `get_rel_map` method; users can use `anno_id` to link nodes and build a RAG index that includes node relationships.
### Node Relationship Matrix
| | image_body | table_body |
| -------------- | ---------- | ---------- |
| image_caption | sibling | |
| table_caption | | sibling |
| table_footnote | | sibling |
import os

import click
from llama_index.core.schema import TextNode
from llama_index.embeddings.dashscope import (DashScopeEmbedding,
                                              DashScopeTextEmbeddingModels,
                                              DashScopeTextEmbeddingType)
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

from magic_pdf.integrations.rag.api import DataReader

es_vec_store = ElasticsearchStore(
    index_name='rag_index',
    es_url=os.getenv('ES_URL', 'http://127.0.0.1:9200'),
    es_user=os.getenv('ES_USER', 'elastic'),
    es_password=os.getenv('ES_PASSWORD', 'llama_index'),
)


# Create embeddings
# text_type=`document` to build index
def embed_node(node):
    embedder = DashScopeEmbedding(
        model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
        text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
    )
    result_embeddings = embedder.get_text_embedding(node.text)
    node.embedding = result_embeddings
    return node


@click.command()
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
def cli(path):
    output_dir = '/tmp/magic_pdf/integrations/rag/'
    os.makedirs(output_dir, exist_ok=True)
    documents = DataReader(path, 'ocr', output_dir)

    # build nodes
    nodes = []
    for idx in range(documents.get_documents_count()):
        doc = documents.get_document_result(idx)
        if doc is None:  # something went wrong when parsing this pdf!
            continue
        for page in iter(doc):  # iterate pages from first to last
            for element in iter(page):  # iterate the elements of each page
                if element.text is None:
                    continue
                nodes.append(
                    embed_node(
                        TextNode(text=element.text,
                                 metadata={'purpose': 'demo'})))
    es_vec_store.add(nodes)


if __name__ == '__main__':
    cli()
services:
  es:
    container_name: es
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.3
    volumes:
      - esdata01:/usr/share/elasticsearch/data
    ports:
      - 9200:9200
    environment:
      - node.name=es
      - ELASTIC_PASSWORD=llama_index
      - bootstrap.memory_lock=false
      - discovery.type=single-node
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=false
      - xpack.security.transport.ssl.enabled=false
    ulimits:
      memlock:
        soft: -1
        hard: -1
    restart: always

volumes:
  esdata01:
    driver: local
import os

import click
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.embeddings.dashscope import (DashScopeEmbedding,
                                              DashScopeTextEmbeddingModels,
                                              DashScopeTextEmbeddingType)
from llama_index.vector_stores.elasticsearch import (AsyncDenseVectorStrategy,
                                                     ElasticsearchStore)
# initialize qwen 7B model
from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

es_vector_store = ElasticsearchStore(
    index_name='rag_index',
    es_url=os.getenv('ES_URL', 'http://127.0.0.1:9200'),
    es_user=os.getenv('ES_USER', 'elastic'),
    es_password=os.getenv('ES_PASSWORD', 'llama_index'),
    retrieval_strategy=AsyncDenseVectorStrategy(),
)


def embed_text(text):
    embedder = DashScopeEmbedding(
        model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
        text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
    )
    return embedder.get_text_embedding(text)


def search(vector_store: ElasticsearchStore, query: str):
    query_vec = VectorStoreQuery(query_embedding=embed_text(query))
    result = vector_store.query(query_vec)
    return '\n'.join([node.text for node in result.nodes])


@click.command()
@click.option(
    '-q',
    '--question',
    'question',
    required=True,
    help='ask what you want to know!',
)
def cli(question):
    tokenizer = AutoTokenizer.from_pretrained('qwen/Qwen-7B-Chat',
                                              revision='v1.0.5',
                                              trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained('qwen/Qwen-7B-Chat',
                                                 revision='v1.0.5',
                                                 device_map='auto',
                                                 trust_remote_code=True,
                                                 fp32=True).eval()
    model.generation_config = GenerationConfig.from_pretrained(
        'Qwen/Qwen-7B-Chat', revision='v1.0.5', trust_remote_code=True)

    # define a prompt template for the vectorDB-enhanced LLM generation
    def answer_question(question, context, model):
        if context == '':
            prompt = question
        else:
            prompt = f'''请基于```内的内容回答问题。
```
{context}
```
我的问题是:{question}
'''
        print(prompt)
        response, history = model.chat(tokenizer, prompt, history=None)
        return response

    answer = answer_question(question, search(es_vector_store, question),
                             model)
    print(f'question: {question}\n'
          f'answer: {answer}')


"""
python query.py -q 'how about the rights of men'
"""

if __name__ == '__main__':
    cli()
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# MinerU web
## Table of Contents
- [Local Frontend Development](#local-frontend-development)
- [Technology Stack](#technology-stack)
## Local Frontend Development
### Prerequisites
- Node.js 18.x
- pnpm
### Installation Steps
1. Install Node.js 18
- Visit the [Node.js official website](https://nodejs.org/) to download and install Node.js version 18.x
2. Install pnpm

   ```bash
   npm install -g pnpm
   ```

3. Clone the repository

   ```bash
   git clone https://github.com/opendatalab/MinerU
   cd ./projects/web
   ```
4. Install dependencies
```
pnpm install
```
5. Run the development server
```
pnpm run dev
```
6. ⚠️ Note: this command is for local development only; do not use it for deployment!
Open your browser and visit http://localhost:5173 (or another address output in the console)
7. Ensure that the backend service in ./projects/web_demo is running
8. If you encounter an error when executing `pnpm install`, you can switch to an alternative package manager.
```
npm install -g yarn
yarn
yarn start
```
## Building the Project
```
pnpm run build
```
## Technology Stack
- React
- Tailwind CSS
- TypeScript
- zustand
- ahooks
# MinerU web

## Table of Contents

- [Local Frontend Development](#local-frontend-development)
- [Technology Stack](#technology-stack)

## Local Frontend Development

### Prerequisites

- Node.js 18.x
- pnpm

### Installation Steps

1. Install Node.js 18

   - Visit the [Node.js official website](https://nodejs.org/) to download and install Node.js version 18.x

2. Install pnpm

   ```bash
   npm install -g pnpm
   ```

3. Clone the repository

   ```bash
   git clone https://github.com/opendatalab/MinerU
   cd ./projects/web
   ```

4. Install dependencies

   ```
   pnpm install
   ```

5. Run the development server

   ```
   pnpm run dev
   ```

6. ⚠️ Note: this command is for local development only; do not use it for deployment!

   Open your browser and visit http://localhost:5173 (or another address output in the console)

7. Ensure that the backend service in ./projects/web_demo is running

8. If you encounter an error when executing `pnpm install`, you can switch to an alternative package manager

   ```
   npm install -g yarn
   yarn
   yarn start
   ```

## Building the Project

To build a production version, run:

```
pnpm run build
```

## Technology Stack

- React
- Tailwind CSS
- TypeScript
- zustand
- ahooks
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'
export default tseslint.config(
  { ignores: ['dist'] },
  {
    extends: [js.configs.recommended, ...tseslint.configs.recommended],
    files: ['**/*.{ts,tsx}'],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
    },
    plugins: {
      'react-hooks': reactHooks,
      'react-refresh': reactRefresh,
    },
    rules: {
      ...reactHooks.configs.recommended.rules,
      'react-refresh/only-export-components': [
        'warn',
        { allowConstantExport: true },
      ],
    },
  },
)
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/logo.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>MinerU</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
{
  "name": "my-react-app",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite --host ",
    "build": "tsc --noEmit && vite build",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "@ant-design/icons": "^5.4.0",
    "@codemirror/view": "^6.33.0",
    "@tanstack/react-query": "^5.56.2",
    "@types/lodash": "^4.17.7",
    "@types/qs": "^6.9.15",
    "@types/react-copy-to-clipboard": "^5.0.7",
    "@types/react-syntax-highlighter": "^15.5.13",
    "@uiw/codemirror-extensions-langs": "^4.23.0",
    "@uiw/react-codemirror": "^4.23.0",
    "ahooks": "^3.8.1",
    "antd": "^5.20.3",
    "axios": "^1.7.5",
    "canvas": "^2.11.2",
    "classnames": "^2.5.1",
    "js-cookie": "^3.0.5",
    "lodash": "^4.17.21",
    "path2d": "^0.2.1",
    "qs": "^6.13.0",
    "react": "^18.3.1",
    "react-copy-to-clipboard": "^5.1.0",
    "react-dom": "^18.3.1",
    "react-intl": "^6.6.8",
    "react-markdown": "^9.0.1",
    "react-query": "^3.39.3",
    "react-router-dom": "^6.26.1",
    "react-syntax-highlighter": "^15.5.0",
    "rehype-katex": "^7.0.1",
    "rehype-raw": "^7.0.0",
    "remark-gfm": "^4.0.0",
    "remark-math": "^6.0.0",
    "zustand": "^4.5.5"
  },
  "devDependencies": {
    "@eslint/js": "^9.9.0",
    "@types/js-cookie": "^3.0.6",
    "@types/node": "^22.5.1",
    "@types/react": "^18.3.3",
    "@types/react-dom": "^18.3.0",
    "@vitejs/plugin-react": "^4.3.1",
    "autoprefixer": "^10.4.20",
    "eslint": "^9.9.0",
    "eslint-plugin-react-hooks": "^5.1.0-rc.0",
    "eslint-plugin-react-refresh": "^0.4.9",
    "globals": "^15.9.0",
    "less": "^4.2.0",
    "postcss": "^8.4.41",
    "sass-embedded": "^1.77.8",
    "tailwindcss": "^3.4.10",
    "ts-prune": "^0.10.3",
    "typescript": "^5.5.3",
    "typescript-eslint": "^8.0.1",
    "vite": "^5.4.1"
  }
}