wangsen / MinerU · Commits

Commit 069bcfe6 (unverified)
Authored Nov 06, 2024 by Xiaomeng Zhao; committed by GitHub on Nov 06, 2024

Merge pull request #879 from opendatalab/release-0.9.1

Release 0.9.1

Parents: 8ee1da82, bff7bd93
Changes: 75 files in the full diff; this page shows 15 changed files with 801 additions and 17 deletions (+801 −17).
- next_docs/en/user_guide/quick_start/extract_text.rst (+10 −0)
- next_docs/en/user_guide/quick_start/to_markdown.rst (+52 −0)
- next_docs/en/user_guide/tutorial.rst (+10 −0)
- next_docs/en/user_guide/tutorial/output_file_description.rst (+416 −0)
- next_docs/requirements.txt (+5 −4)
- next_docs/zh_cn/.readthedocs.yaml (+2 −2)
- projects/web_demo/README.md (+1 −1)
- projects/web_demo/README_zh-CN.md (+1 −1)
- projects/web_demo/mineru-web接口文档.html (+0 −0)
- scripts/download_models.py (+59 −0)
- scripts/download_models_hf.py (+66 −0)
- setup.py (+3 −2)
- tests/test_data/data_reader_writer/test_multi_bucket_s3.py (+80 −2)
- tests/test_data/data_reader_writer/test_s3.py (+56 −3)
- tests/test_table/test_tablemaster.py (+40 −2)
next_docs/en/user_guide/quick_start/extract_text.rst (new file, mode 100644)

Extract Content from Pdf
========================

.. code:: python

   from magic_pdf.data.read_api import read_local_pdfs
   from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
   from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
next_docs/en/user_guide/quick_start/to_markdown.rst (new file, mode 100644)

Convert To Markdown
========================

.. code:: python

   import os

   from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
   from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
   from magic_pdf.pipe.OCRPipe import OCRPipe

   ## args
   model_list = []
   pdf_file_name = "abc.pdf"  # replace with the real pdf path

   ## prepare env
   local_image_dir, local_md_dir = "output/images", "output"
   os.makedirs(local_image_dir, exist_ok=True)

   image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
       local_md_dir
   )  # create the image and markdown writers
   image_dir = str(os.path.basename(local_image_dir))
   reader1 = FileBasedDataReader("")
   pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

   pipe = OCRPipe(pdf_bytes, model_list, image_writer)

   pipe.pipe_classify()
   pipe.pipe_analyze()
   pipe.pipe_parse()

   pdf_info = pipe.pdf_mid_data["pdf_info"]
   md_content = pipe.pipe_mk_markdown(
       image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
   )

   if isinstance(md_content, list):
       md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
   else:
       md_writer.write_string(f"{pdf_file_name}.md", md_content)

Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
next_docs/en/user_guide/tutorial.rst (new file, mode 100644)

Tutorial
===========

From beginning to end, this shows how to use MinerU via a minimal project.

.. toctree::
   :maxdepth: 1

   tutorial/output_file_description
next_docs/en/user_guide/tutorial/output_file_description.rst (new file, mode 100644)

Output File Description
=========================

After executing the ``magic-pdf`` command, several files unrelated to
markdown are generated in addition to the markdown-related output.
These files are introduced one by one below.
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../../_static/image/layout_example.png
   :alt: layout example

   layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../../_static/image/spans_example.png
   :alt: spans example

   spans example
some_pdf_model.json
~~~~~~~~~~~~~~~~~~~
Structure Definition
^^^^^^^^^^^^^^^^^^^^
.. code:: python

   from pydantic import BaseModel, Field
   from enum import IntEnum

   class CategoryType(IntEnum):
       title = 0              # Title
       plain_text = 1         # Text
       abandon = 2            # Includes headers, footers, page numbers, and page annotations
       figure = 3             # Image
       figure_caption = 4     # Image description
       table = 5              # Table
       table_caption = 6      # Table description
       table_footnote = 7     # Table footnote
       isolate_formula = 8    # Block formula
       formula_caption = 9    # Formula label

       embedding = 13         # Inline formula
       isolated = 14          # Block formula
       text = 15              # OCR recognition result

   class PageInfo(BaseModel):
       page_no: int = Field(description="Page number, the first page is 0", ge=0)
       height: int = Field(description="Page height", gt=0)
       width: int = Field(description="Page width", ge=0)

   class ObjectInferenceResult(BaseModel):
       category_id: CategoryType = Field(description="Category", ge=0)
       poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
       score: float = Field(description="Confidence of the inference result")
       latex: str | None = Field(description="LaTeX parsing result", default=None)
       html: str | None = Field(description="HTML parsing result", default=None)

   class PageInferenceResults(BaseModel):
       layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
       page_info: PageInfo = Field(description="Page metadata")

   # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
   inference_result: list[PageInferenceResults] = []
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
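Because ``poly`` always lists the corners in top-left, top-right, bottom-right, bottom-left order, an axis-aligned bounding box can be recovered from it directly. A minimal sketch; the helper name is ours, not part of the MinerU API:

```python
def poly_to_bbox(poly):
    """Convert [x0, y0, x1, y1, x2, y2, x3, y3] corner coordinates
    (top-left, top-right, bottom-right, bottom-left order) into an
    axis-aligned [xmin, ymin, xmax, ymax] bounding box."""
    xs = poly[0::2]  # x coordinates of the four corners
    ys = poly[1::2]  # y coordinates of the four corners
    return [min(xs), min(ys), max(xs), max(ys)]

# rounded coordinates of the first detection in the example below
print(poly_to_bbox([99.19, 100.31, 730.37, 100.31, 730.37, 245.81, 99.19, 245.81]))
# → [99.19, 100.31, 730.37, 245.81]
```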
example
^^^^^^^
.. code:: json

   [
       {
           "layout_dets": [
               {
                   "category_id": 2,
                   "poly": [99.1906967163086, 100.3119125366211, 730.3707885742188, 100.3119125366211, 730.3707885742188, 245.81326293945312, 99.1906967163086, 245.81326293945312],
                   "score": 0.9999997615814209
               }
           ],
           "page_info": {"page_no": 0, "height": 2339, "width": 1654}
       },
       {
           "layout_dets": [
               {
                   "category_id": 5,
                   "poly": [99.13092803955078, 2210.680419921875, 497.3183898925781, 2210.680419921875, 497.3183898925781, 2264.78076171875, 99.13092803955078, 2264.78076171875],
                   "score": 0.9999997019767761
               }
           ],
           "page_info": {"page_no": 1, "height": 2339, "width": 1654}
       }
   ]
some_pdf_middle.json
~~~~~~~~~~~~~~~~~~~~
============== ============================================================
Field Name     Description
============== ============================================================
pdf_info       list, each element is a dict representing the parsing
               result of one PDF page; see the table below for details
\_parse_type   ocr \| txt, used to indicate the mode used in this
               intermediate parsing state
\_version_name string, indicates the version of magic-pdf used in this
               parsing
============== ============================================================
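These top-level fields can be read directly once the file is loaded. A minimal sketch, using an inline stand-in for a (heavily trimmed) ``some_pdf_middle.json``:

```python
import json

# inline stand-in for the contents of a trimmed some_pdf_middle.json
middle = json.loads('''
{
    "pdf_info": [{"page_idx": 0, "page_size": [612.0, 792.0]}],
    "_parse_type": "txt",
    "_version_name": "0.6.1"
}
''')

print(middle["_parse_type"])    # parsing mode: "ocr" or "txt"
print(middle["_version_name"])  # magic-pdf version that produced the file
print(len(middle["pdf_info"]))  # number of parsed pages
```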
**pdf_info**
Field structure description
=================== ========================================================
Field Name          Description
=================== ========================================================
preproc_blocks      Intermediate result after PDF preprocessing, not yet
                    segmented
layout_bboxes       Layout segmentation results, containing the layout
                    direction (vertical, horizontal) and bbox, sorted by
                    reading order
page_idx            Page number, starting from 0
page_size           Page width and height
\_layout_tree       Layout tree structure
images              list, each element is a dict representing an img_block
tables              list, each element is a dict representing a table_block
interline_equations list, each element is a dict representing an
                    interline_equation_block
discarded_blocks    list, block information returned by the model that
                    needs to be dropped
para_blocks         Result after segmenting preproc_blocks
=================== ========================================================
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
========== ================================================================
Field Name Description
========== ================================================================
type       Block type (table \| image)
bbox       Block bounding box coordinates
blocks     list, each element is a dict representing a second-level block
========== ================================================================
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
========== ================================================================
Field Name Description
========== ================================================================
type       Block type
bbox       Block bounding box coordinates
lines      list, each element is a dict representing a line, used to
           describe the composition of a line of information
========== ================================================================
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
========== ================================================================
Field Name Description
========== ================================================================
bbox       Bounding box coordinates of the line
spans      list, each element is a dict representing a span, used to
           describe the composition of the smallest unit
========== ================================================================
**span**
=================== ========================================================
Field Name          Description
=================== ========================================================
bbox                Bounding box coordinates of the span
type                Type of the span
content \| img_path Text spans use content and chart spans use img_path to
                    store the actual text or the screenshot path
=================== ========================================================
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
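That hierarchy can be walked to collect every text span from a middle-JSON dict as described above. A minimal sketch; the function name is ours, not part of the MinerU API:

```python
def collect_text(pdf_info):
    """Walk page -> para_blocks -> (optional nested blocks) -> lines -> spans
    and gather the content of every span that carries text."""
    texts = []
    for page in pdf_info:
        for block in page.get("para_blocks", []):
            # first-level table/image blocks nest their real blocks one level down;
            # all other blocks are already second-level blocks
            for sub in block.get("blocks", [block]):
                for line in sub.get("lines", []):
                    for span in line.get("spans", []):
                        if "content" in span:
                            texts.append(span["content"])
    return texts

demo = [{"para_blocks": [{"type": "text",
                          "lines": [{"spans": [{"type": "text", "content": "hello"}]}]}]}]
print(collect_text(demo))  # → ['hello']
```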
.. _example-1:
example
^^^^^^^
.. code:: json

   {
       "pdf_info": [
           {
               "preproc_blocks": [
                   {
                       "type": "text",
                       "bbox": [52, 61.956024169921875, 294, 82.99800872802734],
                       "lines": [
                           {
                               "bbox": [52, 61.956024169921875, 294, 72.0000228881836],
                               "spans": [
                                   {
                                       "bbox": [54.0, 61.956024169921875, 296.2261657714844, 72.0000228881836],
                                       "content": "dependent on the service headway and the reliability of the departure ",
                                       "type": "text",
                                       "score": 1.0
                                   }
                               ]
                           }
                       ]
                   }
               ],
               "layout_bboxes": [
                   {
                       "layout_bbox": [52, 61, 294, 731],
                       "layout_label": "V",
                       "sub_layout": []
                   }
               ],
               "page_idx": 0,
               "page_size": [612.0, 792.0],
               "_layout_tree": [],
               "images": [],
               "tables": [],
               "interline_equations": [],
               "discarded_blocks": [],
               "para_blocks": [
                   {
                       "type": "text",
                       "bbox": [52, 61.956024169921875, 294, 82.99800872802734],
                       "lines": [
                           {
                               "bbox": [52, 61.956024169921875, 294, 72.0000228881836],
                               "spans": [
                                   {
                                       "bbox": [54.0, 61.956024169921875, 296.2261657714844, 72.0000228881836],
                                       "content": "dependent on the service headway and the reliability of the departure ",
                                       "type": "text",
                                       "score": 1.0
                                   }
                               ]
                           }
                       ]
                   }
               ]
           }
       ],
       "_parse_type": "txt",
       "_version_name": "0.6.1"
   }
.. |Poly Coordinate Diagram| image:: ../../_static/image/poly.png
next_docs/requirements.txt

@@ -5,7 +5,8 @@ Pillow==8.4.0
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 sphinx
-sphinx-argparse
-sphinx-book-theme
-sphinx-copybutton
-sphinx_rtd_theme
+sphinx-argparse>=0.5.2
+sphinx-book-theme>=1.1.3
+sphinx-copybutton>=0.5.2
+sphinx_rtd_theme>=3.0.1
+autodoc_pydantic>=2.2.0
\ No newline at end of file
next_docs/zh_cn/.readthedocs.yaml

@@ -10,7 +10,7 @@ formats:
 python:
   install:
-    - requirements: docs/requirements.txt
+    - requirements: next_docs/requirements.txt
 sphinx:
-  configuration: docs/zh_cn/conf.py
+  configuration: next_docs/zh_cn/conf.py
projects/web_demo/README.md

@@ -56,5 +56,5 @@ python3 app.py or python app.py
 ps:API documentation
 ```
-Open the mineru-web API mineru-web接口文档.html in the browser
+https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
 ```
projects/web_demo/README_zh-CN.md

@@ -55,5 +55,5 @@ python3 app.py or python app.py
 ps: API documentation
 ```
-Open mineru-web接口文档.html in the browser
+https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
 ```
projects/web_demo/mineru-web接口文档.html (deleted, mode 100644 → 0)
scripts/download_models.py (new file, mode 100644)

import json
import os

import requests
from modelscope import snapshot_download


def download_json(url):
    # Download the JSON file
    response = requests.get(url)
    response.raise_for_status()  # check that the request succeeded
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    if os.path.exists(local_filename):
        data = json.load(open(local_filename))
        config_version = data.get('config_version', '0.0.0')
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Modify the content
    for key, value in modifications.items():
        data[key] = value

    # Save the modified content
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
scripts/download_models_hf.py (new file, mode 100644)

import json
import os

import requests
from huggingface_hub import snapshot_download


def download_json(url):
    # Download the JSON file
    response = requests.get(url)
    response.raise_for_status()  # check that the request succeeded
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    if os.path.exists(local_filename):
        data = json.load(open(local_filename))
        config_version = data.get('config_version', '0.0.0')
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Modify the content
    for key, value in modifications.items():
        data[key] = value

    # Save the modified content
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
setup.py

@@ -43,8 +43,9 @@ if __name__ == '__main__':
 "paddleocr==2.7.3",  # 2.8.0 and 2.8.1 conflict with detectron2; pin 2.7.3
 "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # fixes the segmentation fault on Linux
 "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # 3.0.0b1 is slower on Windows; pin 2.6.1
-"pypandoc",  # table parsing: LaTeX to HTML
-"struct-eqtable==0.3.2",  # table parsing
+"struct-eqtable==0.1.0",  # table parsing
+"einops",  # struct-eqtable dependency
+"accelerate",  # struct-eqtable dependency
 "doclayout_yolo==0.0.2",  # doclayout_yolo
 "detectron2"
 ],
...
tests/test_data/data_reader_writer/test_multi_bucket_s3.py

@@ -41,8 +41,8 @@ def test_multi_bucket_s3_reader_writer():
         ),
     ]
-    reader = MultiBucketS3DataReader(default_bucket=bucket, s3_configs=s3configs)
-    writer = MultiBucketS3DataWriter(default_bucket=bucket, s3_configs=s3configs)
+    reader = MultiBucketS3DataReader(bucket, s3configs)
+    writer = MultiBucketS3DataWriter(bucket, s3configs)
     bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')

@@ -80,3 +80,81 @@ def test_multi_bucket_s3_reader_writer():
     assert '123'.encode() == reader.read(
         'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
     )
+
+
+@pytest.mark.skipif(
+    os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
+)
+def test_multi_bucket_s3_reader_writer_with_prefix():
+    """test multi bucket s3 reader writer must config s3 config in the
+    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
+    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
+
+    export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
+    """
+    bucket = os.getenv('S3_BUCKET', '')
+    ak = os.getenv('S3_ACCESS_KEY', '')
+    sk = os.getenv('S3_SECRET_KEY', '')
+    endpoint_url = os.getenv('S3_ENDPOINT', '')
+
+    bucket_2 = os.getenv('S3_BUCKET_2', '')
+    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
+    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
+    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
+
+    s3configs = [
+        S3Config(bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+    prefix = 'meta-index'
+    reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
+    writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)
+
+    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
+    assert bits == reader.read(
+        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
+    )
+
+    bits = reader.read(
+        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
+    )
+    docs = fitz.open('pdf', bits)
+    assert len(docs) == 10
+
+    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713')
+    assert bits == reader.read_at('scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713)
+    assert len(json.loads(bits)) > 0
+
+    writer.write_string('unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc')
+    assert 'abc'.encode() == reader.read('unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt')
+    assert 'abc'.encode() == reader.read(
+        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
+    )
+
+    writer.write(
+        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
+        '123'.encode(),
+    )
+    assert '123'.encode() == reader.read('unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt')
tests/test_data/data_reader_writer/test_s3.py

@@ -9,7 +9,7 @@ from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
 @pytest.mark.skipif(
     os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
 )
-def test_multi_bucket_s3_reader_writer():
+def test_s3_reader_writer():
     """test multi bucket s3 reader writer must config s3 config in the
     environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
     S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""

@@ -18,8 +18,8 @@ def test_multi_bucket_s3_reader_writer():
     sk = os.getenv('S3_SECRET_KEY', '')
     endpoint_url = os.getenv('S3_ENDPOINT', '')
-    reader = S3DataReader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
-    writer = S3DataWriter(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
+    reader = S3DataReader('', bucket, ak, sk, endpoint_url)
+    writer = S3DataWriter('', bucket, ak, sk, endpoint_url)
     bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')

@@ -51,3 +51,56 @@ def test_multi_bucket_s3_reader_writer():
     assert '123'.encode() == reader.read(
         'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
     )
+
+
+@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!')
+def test_s3_reader_writer_with_prefix():
+    """test multi bucket s3 reader writer must config s3 config in the
+    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
+    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
+    bucket = os.getenv('S3_BUCKET', '')
+    ak = os.getenv('S3_ACCESS_KEY', '')
+    sk = os.getenv('S3_SECRET_KEY', '')
+    endpoint_url = os.getenv('S3_ENDPOINT', '')
+    prefix = 'meta-index'
+    reader = S3DataReader(prefix, bucket, ak, sk, endpoint_url)
+    writer = S3DataWriter(prefix, bucket, ak, sk, endpoint_url)
+
+    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
+    assert bits == reader.read(
+        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
+    )
+
+    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713')
+    assert bits == reader.read_at('scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713)
+    assert len(json.loads(bits)) > 0
+
+    writer.write_string('unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc')
+    assert 'abc'.encode() == reader.read('unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt')
+    assert 'abc'.encode() == reader.read(
+        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
+    )
+
+    writer.write(
+        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
+        '123'.encode(),
+    )
+    assert '123'.encode() == reader.read('unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt')
tests/test_table/test_tablemaster.py

 import unittest

 from PIL import Image
+from lxml import etree

 from magic_pdf.model.ppTableModel import ppTableModel


 class TestppTableModel(unittest.TestCase):
...
@@ -10,8 +12,44 @@ class TestppTableModel(unittest.TestCase):
                   "model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"}
         table_model = ppTableModel(config)
         res = table_model.img2html(img)
-        true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""
-        # verify that the generated HTML matches the expected value
-        self.assertEqual(true_value, res)
+        parser = etree.HTMLParser()
+        tree = etree.fromstring(res, parser)
+
+        # check the HTML structure
+        assert tree.find('.//table') is not None, "HTML should contain a <table> element"
+        assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
+        assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
+        assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
+        assert tree.find('.//td') is not None, "HTML should contain a <td> element"
+
+        # check the specific table content
+        headers = tree.xpath('//thead/tr/td/b')
+        print(headers)  # Print headers for debugging
+        assert len(headers) == 5, "Thead should have 5 columns"
+        assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
+        assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
+        assert headers[2].text and headers[2].text.strip() == "P", "Third header should be 'P'"
+        assert headers[3].text and headers[3].text.strip() == "F", "Fourth header should be 'F'"
+        assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"
+
+        # check the first data row
+        first_row = tree.xpath('//tbody/tr[1]/td')
+        assert len(first_row) == 5, "First row should have 5 cells"
+        assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
+        assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
+        assert first_row[2].text and first_row[2].text.strip() == "86.0", "Third cell should be '86.0'"
+        assert first_row[3].text and first_row[3].text.strip() == "77.0", "Fourth cell should be '77.0'"
+        assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"
+
+        # check the second-to-last data row
+        second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td')
+        assert len(second_last_row) == 5, "second_last_row should have 5 cells"
+        assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
+        assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
+        assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
+        assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
+        assert second_last_row[4].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"

 if __name__ == "__main__":
...