".github/vscode:/vscode.git/clone" did not exist on "2673fa29d4b9188463898da7d1234a85a1685377"
Unverified Commit 069bcfe6 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #879 from opendatalab/release-0.9.1

Release 0.9.1
parents 8ee1da82 bff7bd93
Extract Content from Pdf
========================
.. code:: python
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
Convert To Markdown
========================
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
## args
model_list = []
pdf_file_name = "abc.pdf" # replace with the real pdf path
## prepare env
local_image_dir, local_md_dir = "output/images", "output"
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
) # create 00
image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown(
image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
if isinstance(md_content, list):
md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
md_writer.write_string(f"{pdf_file_name}.md", md_content)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
Tutorial
===========
From the beginning to the end, Show how to using mineru via a minimal project
.. toctree::
:maxdepth: 1
tutorial/output_file_description
\ No newline at end of file
Output File Description
=========================
After executing the ``magic-pdf`` command, in addition to outputting
files related to markdown, several other files unrelated to markdown
will also be generated. These files will be introduced one by one.
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../../_static/image/layout_example.png
:alt: layout example
layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../../_static/image/spans_example.png
:alt: spans example
spans example
some_pdf_model.json
~~~~~~~~~~~~~~~~~~~
Structure Definition
^^^^^^^^^^^^^^^^^^^^
.. code:: python
from pydantic import BaseModel, Field
from enum import IntEnum
class CategoryType(IntEnum):
title = 0 # Title
plain_text = 1 # Text
abandon = 2 # Includes headers, footers, page numbers, and page annotations
figure = 3 # Image
figure_caption = 4 # Image description
table = 5 # Table
table_caption = 6 # Table description
table_footnote = 7 # Table footnote
isolate_formula = 8 # Block formula
formula_caption = 9 # Formula label
embedding = 13 # Inline formula
isolated = 14 # Block formula
text = 15 # OCR recognition result
class PageInfo(BaseModel):
page_no: int = Field(description="Page number, the first page is 0", ge=0)
height: int = Field(description="Page height", gt=0)
width: int = Field(description="Page width", ge=0)
class ObjectInferenceResult(BaseModel):
category_id: CategoryType = Field(description="Category", ge=0)
poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
score: float = Field(description="Confidence of the inference result")
latex: str | None = Field(description="LaTeX parsing result", default=None)
html: str | None = Field(description="HTML parsing result", default=None)
class PageInferenceResults(BaseModel):
layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
page_info: PageInfo = Field(description="Page metadata")
# The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
inference_result: list[PageInferenceResults] = []
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
example
^^^^^^^
.. code:: json
[
{
"layout_dets": [
{
"category_id": 2,
"poly": [
99.1906967163086,
100.3119125366211,
730.3707885742188,
100.3119125366211,
730.3707885742188,
245.81326293945312,
99.1906967163086,
245.81326293945312
],
"score": 0.9999997615814209
}
],
"page_info": {
"page_no": 0,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 5,
"poly": [
99.13092803955078,
2210.680419921875,
497.3183898925781,
2210.680419921875,
497.3183898925781,
2264.78076171875,
99.13092803955078,
2264.78076171875
],
"score": 0.9999997019767761
}
],
"page_info": {
"page_no": 1,
"height": 2339,
"width": 1654
}
}
]
some_pdf_middle.json
~~~~~~~~~~~~~~~~~~~~
+-------+--------------------------------------------------------------+
| Field | Description |
| Name | |
+=======+==============================================================+
| pdf | list, each element is a dict representing the parsing result |
| _info | of each PDF page, see the table below for details |
+-------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse | intermediate parsing state |
| _type | |
+-------+--------------------------------------------------------------+
| \_ve | string, indicates the version of magic-pdf used in this |
| rsion | parsing |
| _name | |
+-------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+---------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========+============================================================+
| preproc | Intermediate result after PDF preprocessing, not yet |
| _blocks | segmented |
+---------+------------------------------------------------------------+
| layout | Layout segmentation results, containing layout direction |
| _bboxes | (vertical, horizontal), and bbox, sorted by reading order |
+---------+------------------------------------------------------------+
| p | Page number, starting from 0 |
| age_idx | |
+---------+------------------------------------------------------------+
| pa | Page width and height |
| ge_size | |
+---------+------------------------------------------------------------+
| \_layo | Layout tree structure |
| ut_tree | |
+---------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+---------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+---------+------------------------------------------------------------+
| inter | list, each element is a dict representing an |
| line_eq | interline_equation_block |
| uations | |
+---------+------------------------------------------------------------+
| di | List, block information returned by the model that needs |
| scarded | to be dropped |
| _blocks | |
+---------+------------------------------------------------------------+
| para | Result after segmenting preproc_blocks |
| _blocks | |
+---------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+---------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+=========+=============================================================+
| type | Block type (table|image) |
+---------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+---------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+---------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+-----+----------------------------------------------------------------+
| Fi | Description |
| eld | |
| N | |
| ame | |
+=====+================================================================+
| t | Block type |
| ype | |
+-----+----------------------------------------------------------------+
| b | Block bounding box coordinates |
| box | |
+-----+----------------------------------------------------------------+
| li | list, each element is a dict representing a line, used to |
| nes | describe the composition of a line of information |
+-----+----------------------------------------------------------------+
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
+-----+----------------------------------------------------------------+
| Fi | Description |
| eld | |
| N | |
| ame | |
+=====+================================================================+
| b | Bounding box coordinates of the line |
| box | |
+-----+----------------------------------------------------------------+
| sp | list, each element is a dict representing a span, used to |
| ans | describe the composition of the smallest unit |
+-----+----------------------------------------------------------------+
**span**
+----------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+==========+===========================================================+
| bbox | Bounding box coordinates of the span |
+----------+-----------------------------------------------------------+
| type | Type of the span |
+----------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+----------+-----------------------------------------------------------+
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
.. _example-1:
example
^^^^^^^
.. code:: json
{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
],
"layout_bboxes": [
{
"layout_bbox": [
52,
61,
294,
731
],
"layout_label": "V",
"sub_layout": []
}
],
"page_idx": 0,
"page_size": [
612.0,
792.0
],
"_layout_tree": [],
"images": [],
"tables": [],
"interline_equations": [],
"discarded_blocks": [],
"para_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
]
}
],
"_parse_type": "txt",
"_version_name": "0.6.1"
}
.. |Poly Coordinate Diagram| image:: ../../_static/image/poly.png
......@@ -5,7 +5,8 @@ Pillow==8.4.0
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
sphinx
sphinx-argparse
sphinx-book-theme
sphinx-copybutton
sphinx_rtd_theme
sphinx-argparse>=0.5.2
sphinx-book-theme>=1.1.3
sphinx-copybutton>=0.5.2
sphinx_rtd_theme>=3.0.1
autodoc_pydantic>=2.2.0
\ No newline at end of file
......@@ -10,7 +10,7 @@ formats:
python:
install:
- requirements: docs/requirements.txt
- requirements: next_docs/requirements.txt
sphinx:
configuration: docs/zh_cn/conf.py
configuration: next_docs/zh_cn/conf.py
......@@ -56,5 +56,5 @@ python3 app.py or python app.py
ps:API documentation
```
Open the mineru-web API mineru-web接口文档.html in the browser
https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
```
......@@ -55,5 +55,5 @@ python3 app.py 或者 python app.py
ps:接口文档
```
在浏览器打开 mineru-web接口文档.html
https://apifox.com/apidoc/shared-b8eda098-ab9c-4cb3-9432-62be9be9c6f7
```
This diff is collapsed.
import json
import os
import requests
from modelscope import snapshot_download
def download_json(url):
# 下载JSON文件
response = requests.get(url)
response.raise_for_status() # 检查请求是否成功
return response.json()
def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.0.0':
data = download_json(url)
else:
data = download_json(url)
# 修改内容
for key, value in modifications.items():
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
model_dir = model_dir + '/models'
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': model_dir,
'layoutreader-model-dir': layoutreader_model_dir,
}
download_and_modify_json(json_url, config_file, json_mods)
print(f'The configuration file has been configured successfully, the path is: {config_file}')
import json
import os
import requests
from huggingface_hub import snapshot_download
def download_json(url):
# 下载JSON文件
response = requests.get(url)
response.raise_for_status() # 检查请求是否成功
return response.json()
def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.0.0':
data = download_json(url)
else:
data = download_json(url)
# 修改内容
for key, value in modifications.items():
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
layoutreader_pattern = [
"*.json",
"*.safetensors",
]
layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
model_dir = model_dir + '/models'
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': model_dir,
'layoutreader-model-dir': layoutreader_model_dir,
}
download_and_modify_json(json_url, config_file, json_mods)
print(f'The configuration file has been configured successfully, the path is: {config_file}')
......@@ -43,8 +43,9 @@ if __name__ == '__main__':
"paddleocr==2.7.3", # 2.8.0及2.8.1版本与detectron2有冲突,需锁定2.7.3
"paddlepaddle==3.0.0b1;platform_system=='Linux'", # 解决linux的段异常问题
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'", # windows版本3.0.0b1效率下降,需锁定2.6.1
"pypandoc", # 表格解析latex转html
"struct-eqtable==0.1.0", # 表格解析
"struct-eqtable==0.3.2", # 表格解析
"einops", # struct-eqtable依赖
"accelerate", # struct-eqtable依赖
"doclayout_yolo==0.0.2", # doclayout_yolo
"detectron2"
],
......
......@@ -41,8 +41,8 @@ def test_multi_bucket_s3_reader_writer():
),
]
reader = MultiBucketS3DataReader(default_bucket=bucket, s3_configs=s3configs)
writer = MultiBucketS3DataWriter(default_bucket=bucket, s3_configs=s3configs)
reader = MultiBucketS3DataReader(bucket, s3configs)
writer = MultiBucketS3DataWriter(bucket, s3configs)
bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
......@@ -80,3 +80,81 @@ def test_multi_bucket_s3_reader_writer():
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer_with_prefix():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
"""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
bucket_2 = os.getenv('S3_BUCKET_2', '')
ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
sk_2 = os.getenv('S3_SECRET_KEY_2', '')
endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
prefix = 'meta-index'
reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)
bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
assert bits == reader.read(
f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits = reader.read(
f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
)
docs = fitz.open('pdf', bits)
assert len(docs) == 10
bits = reader.read(
'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert bits == reader.read_at(
'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
)
assert len(json.loads(bits)) > 0
writer.write_string(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
)
assert 'abc'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
assert 'abc'.encode() == reader.read(
f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer.write(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
'123'.encode(),
)
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
......@@ -9,7 +9,7 @@ from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer():
def test_s3_reader_writer():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
......@@ -18,8 +18,8 @@ def test_multi_bucket_s3_reader_writer():
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
reader = S3DataReader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
writer = S3DataWriter(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
reader = S3DataReader('', bucket, ak, sk, endpoint_url)
writer = S3DataWriter('', bucket, ak, sk, endpoint_url)
bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
......@@ -51,3 +51,56 @@ def test_multi_bucket_s3_reader_writer():
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
)
def test_s3_reader_writer_with_prefix():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
prefix = 'meta-index'
reader = S3DataReader(prefix, bucket, ak, sk, endpoint_url)
writer = S3DataWriter(prefix, bucket, ak, sk, endpoint_url)
bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
assert bits == reader.read(
f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits = reader.read(
'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert bits == reader.read_at(
'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
)
assert len(json.loads(bits)) > 0
writer.write_string(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
)
assert 'abc'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
assert 'abc'.encode() == reader.read(
f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer.write(
f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
'123'.encode(),
)
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
import unittest
from PIL import Image
from lxml import etree
from magic_pdf.model.ppTableModel import ppTableModel
class TestppTableModel(unittest.TestCase):
......@@ -10,8 +12,44 @@ class TestppTableModel(unittest.TestCase):
"model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"}
table_model = ppTableModel(config)
res = table_model.img2html(img)
true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""
self.assertEqual(true_value, res)
# 验证生成的 HTML 是否符合预期
parser = etree.HTMLParser()
tree = etree.fromstring(res, parser)
# 检查 HTML 结构
assert tree.find('.//table') is not None, "HTML should contain a <table> element"
assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
assert tree.find('.//td') is not None, "HTML should contain a <td> element"
# 检查具体的表格内容
headers = tree.xpath('//thead/tr/td/b')
print(headers) # Print headers for debugging
assert len(headers) == 5, "Thead should have 5 columns"
assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
assert headers[2].text and headers[2].text.strip() == "P", "Third header should be 'P'"
assert headers[3].text and headers[3].text.strip() == "F", "Fourth header should be 'F'"
assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"
# 检查第一行数据
first_row = tree.xpath('//tbody/tr[1]/td')
assert len(first_row) == 5, "First row should have 5 cells"
assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
assert first_row[2].text and first_row[2].text.strip() == "86.0", "Third cell should be '86.0'"
assert first_row[3].text and first_row[3].text.strip() == "77.0", "Fourth cell should be '77.0'"
assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"
# 检查倒数第二行数据
second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td')
assert len(second_last_row) == 5, "second_last_row should have 5 cells"
assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment