Unverified Commit 4bb54393 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1427 from opendatalab/release-1.0.0

Release 1.0.0
parents 04f084ac 1c9f9942
......@@ -73,118 +73,146 @@ S3DataReader is built on top of MultiBucketS3DataReader but supports only a single bucket. S3D
---------
.. code:: python
from magic_pdf.data.data_reader_writer import *
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# File-based
# Initialize the reader
file_based_reader1 = FileBasedDataReader('')
## will read file abc
file_based_reader1.read('abc')
## read local file abc
file_based_reader1.read('abc')
file_based_reader2 = FileBasedDataReader('/tmp')
## will read /tmp/abc
## read local file /tmp/abc
file_based_reader2.read('abc')
## will read /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## read local file /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# Initialize the multi-bucket s3 reader
bucket = "bucket"  # replace with a valid bucket
ak = "ak"  # replace with a valid access key
sk = "sk"  # replace with a valid secret key
endpoint_url = "endpoint_url"  # replace with a valid endpoint_url
bucket_2 = "bucket_2"  # replace with a valid bucket
ak_2 = "ak_2"  # replace with a valid access key
sk_2 = "sk_2"  # replace with a valid secret key
endpoint_url_2 = "endpoint_url_2"  # replace with a valid endpoint_url
# Multi-bucket S3
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will read s3://test_bucket1/test_prefix/abc
## read file s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_reader1.read('abc')
## will read s3://test_bucket1/efg
multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
## read file s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
## will read s3://test_bucket2/abc
multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
## read file s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
# S3
# Initialize the s3 reader
s3_reader1 = S3DataReader(
default_prefix_without_bucket = "test_prefix",
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## will read s3://test_bucket/test_prefix/abc
## read file s3://{bucket}/{test_prefix}/abc
s3_reader1.read('abc')
## will read s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## read file s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
Write Examples
--------------
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
from magic_pdf.data.schemas import S3Config
# Initialize the writer
file_based_writer1 = FileBasedDataWriter("")
## write data 123 to abc
file_based_writer1.write("abc", "123".encode())
## write data 123 to abc
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter("/tmp")
## write data 123 to /tmp/abc
file_based_writer2.write_string("abc", "123")
## write data 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# Initialize the multi-bucket s3 writer
bucket = "bucket"  # replace with a valid bucket
ak = "ak"  # replace with a valid access key
sk = "sk"  # replace with a valid secret key
endpoint_url = "endpoint_url"  # replace with a valid endpoint_url
bucket_2 = "bucket_2"  # replace with a valid bucket
ak_2 = "ak_2"  # replace with a valid access key
sk_2 = "sk_2"  # replace with a valid secret key
endpoint_url_2 = "endpoint_url_2"  # replace with a valid endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
],
)
# File-based
file_based_writer1 = FileBasedDataWriter('')
## will write 123 to abc
file_based_writer1.write('abc', '123'.encode())
## will write 123 to abc
file_based_writer1.write_string('abc', '123')
file_based_writer2 = FileBasedDataWriter('/tmp')
## will write 123 to /tmp/abc
file_based_writer2.write_string('abc', '123')
## will write 123 to /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
# Multi-bucket S3
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
## write data 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write('abc', '123'.encode())
## write data 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket1/efg
multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
## write data 123 to s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
## will write 123 to s3://test_bucket2/abc
multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
## write data 123 to s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
# S3
s3_writer1 = S3DataWriter(
default_prefix_without_bucket = "test_prefix",
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
)
# Initialize the s3 writer
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write('abc', '123'.encode())
## write data 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write_string('abc', '123')
## write data 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
## write data 123 to s3://{bucket}/efg
s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
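A quick way to validate a writer configuration is to write a small payload and read it back with the matching reader. A minimal sketch using only the file-based classes shown above (the /tmp path and the roundtrip.txt name are arbitrary examples, and read() is assumed to return the raw bytes that were written):

.. code:: python

   from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter

   writer = FileBasedDataWriter('/tmp')
   reader = FileBasedDataReader('/tmp')

   # write 123 to /tmp/roundtrip.txt, then read it back
   writer.write_string('roundtrip.txt', '123')
   assert reader.read('roundtrip.txt') == b'123'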
......@@ -15,13 +15,41 @@ read_jsonl
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# Read JSONL from the local machine
datasets = read_jsonl("tt.jsonl", None)
# read a local jsonl file
datasets = read_jsonl("tt.jsonl", None)  # replace with a valid file
# read a jsonl file from s3
bucket = "bucket_1"  # replace with a valid s3 bucket
ak = "access_key_1"  # replace with a valid s3 access key
sk = "secret_key_1"  # replace with a valid s3 secret key
endpoint_url = "endpoint_url_1"  # replace with a valid s3 endpoint url
bucket_2 = "bucket_2"  # replace with a valid s3 bucket
ak_2 = "access_key_2"  # replace with a valid s3 access key
sk_2 = "secret_key_2"  # replace with a valid s3 secret key
endpoint_url_2 = "endpoint_url_2"  # replace with a valid s3 endpoint url
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
s3_reader = MultiBucketS3DataReader(bucket, s3configs)
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)  # replace with a valid s3 jsonl file
# Read JSONL from remote S3
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
read_local_pdfs
^^^^^^^^^^^^^^^^
......@@ -30,13 +58,13 @@ read_local_pdfs
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# Read a PDF path
datasets = read_local_pdfs("tt.pdf")
datasets = read_local_pdfs("tt.pdf")  # replace with a valid file
# Read the PDF files under a directory
datasets = read_local_pdfs("pdfs/")
datasets = read_local_pdfs("pdfs/")  # replace with a valid directory
read_local_images
^^^^^^^^^^^^^^^^^^^
......@@ -45,10 +73,10 @@ read_local_images
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# Read from an image path
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png")  # replace with a valid file
# Read files under a directory that end with one of the suffixes in the suffixes list
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"])  # replace with a valid directory
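The datasets returned by these helpers feed directly into the inference pipeline introduced in this release. A minimal sketch, assuming the classify/doc_analyze/pipe_*_mode flow shown in the updated server and test code later in this commit:

.. code:: python

   from magic_pdf.data.read_api import read_local_pdfs
   from magic_pdf.data.data_reader_writer import FileBasedDataWriter
   from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
   from magic_pdf.config.enums import SupportedPdfParseMethod

   ds = read_local_pdfs("tt.pdf")[0]  # replace with a valid file
   image_writer = FileBasedDataWriter("output/images")

   # classify the PDF, run the matching analysis, then render markdown
   if ds.classify() == SupportedPdfParseMethod.OCR:
       pipe_result = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)
   else:
       pipe_result = ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer)
   md_content = pipe_result.get_markdown("images")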
......@@ -97,6 +97,7 @@ def replace_image_with_base64(markdown_text, image_dir_path):
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
file_path = to_pdf(file_path)
# Get the recognized md file and the archive file path
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
layout_mode, formula_enable, table_enable, language)
......@@ -159,7 +160,7 @@ devanagari_lang = [
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
all_lang = ['']
all_lang = ['', 'auto']
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
......@@ -189,37 +190,37 @@ if __name__ == '__main__':
with gr.Row():
with gr.Column(variant='panel', scale=5):
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
max_pages = gr.Slider(1, 10, 5, step=1, label='Max convert pages')
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Row():
layout_mode = gr.Dropdown(['layoutlmv3', 'doclayout_yolo'], label='Layout model', value='layoutlmv3')
language = gr.Dropdown(all_lang, label='Language', value='')
layout_mode = gr.Dropdown(['layoutlmv3', 'doclayout_yolo'], label='Layout model', value='doclayout_yolo')
language = gr.Dropdown(all_lang, label='Language', value='auto')
with gr.Row():
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=False)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
with gr.Row():
change_bu = gr.Button('Convert')
clear_bu = gr.ClearButton(value='Clear')
pdf_show = PDF(label='PDF preview', interactive=True, height=800)
pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
with gr.Accordion('Examples:'):
example_root = os.path.join(os.path.dirname(__file__), 'examples')
gr.Examples(
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
_.endswith('pdf')],
inputs=pdf_show
inputs=file
)
with gr.Column(variant='panel', scale=5):
output_file = gr.File(label='convert result', interactive=False)
with gr.Tabs():
with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=900, show_copy_button=True,
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True)
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
demo.launch(server_name='0.0.0.0')
......@@ -102,13 +102,24 @@
<!-- Homepage Link. -->
<span class="link-block">
<a href="https://opendatalab.com/" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<a href="https://mineru.org.cn/home?source=online" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 8px">
<i class="fas fa-globe" style="color: white"></i>
<i class="fas fa-home" style="color: white"></i>
</span>
<span style="color: white">Homepage</span>
</a>
</span>
<!-- Client Link. -->
<span class="link-block">
<a href="https://mineru.org.cn/client?source=online" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 8px">
<i class="fas fa-download" style="color: white"></i>
</span>
<span style="color: white">Download</span>
</a>
</span>
</div>
</div>
......
......@@ -9,10 +9,11 @@ from fastapi.responses import JSONResponse
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
model_config.__use_inside_model__ = True
......@@ -20,14 +21,15 @@ app = FastAPI()
def json_md_dump(
pipe,
model_json,
middle_json,
md_writer,
pdf_name,
content_list,
md_content,
):
# Write model results to model.json
orig_model_list = copy.deepcopy(pipe.model_list)
orig_model_list = copy.deepcopy(model_json)
md_writer.write_string(
f'{pdf_name}_model.json',
json.dumps(orig_model_list, ensure_ascii=False, indent=4),
......@@ -36,7 +38,7 @@ def json_md_dump(
# Write intermediate results to middle.json
md_writer.write_string(
f'{pdf_name}_middle.json',
json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
# Write text content results to content_list.json
......@@ -100,45 +102,49 @@ async def pdf_parse_main(
output_image_path
), FileBasedDataWriter(output_path)
ds = PymuDocDataset(pdf_bytes)
# Choose parsing method
if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
elif parse_method == 'txt':
pipe = TXTPipe(pdf_bytes, model_json, image_writer)
elif parse_method == 'ocr':
pipe = OCRPipe(pdf_bytes, model_json, image_writer)
else:
if ds.classify() == SupportedPdfParseMethod.OCR:
parse_method = 'ocr'
else:
parse_method = 'txt'
if parse_method not in ['txt', 'ocr']:
logger.error('Unknown parse method, only auto, ocr, txt allowed')
return JSONResponse(
content={'error': 'Invalid parse method'}, status_code=400
)
# Execute classification
pipe.pipe_classify()
# If no model data is provided, use built-in model for parsing
if not model_json:
if model_config.__use_inside_model__:
pipe.pipe_analyze() # Parse
if len(model_json) == 0:
if parse_method == 'ocr':
infer_result = ds.apply(doc_analyze, ocr=True)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
else:
infer_result = InferenceResult(model_json, ds)
if len(model_json) == 0 and not model_config.__use_inside_model__:
logger.error('Need model list input')
return JSONResponse(
content={'error': 'Model list input required'}, status_code=400
)
if parse_method == 'ocr':
pipe_res = infer_result.pipe_ocr_mode(image_writer)
else:
pipe_res = infer_result.pipe_txt_mode(image_writer)
# Execute parsing
pipe.pipe_parse()
# Save results in text and md format
content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none')
md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none')
content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')
if is_json_md_dump:
json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
json_md_dump(infer_result._infer_res, pipe_res._pipe_res, md_writer, pdf_name, content_list, md_content)
data = {
'layout': copy.deepcopy(pipe.model_list),
'info': pipe.pdf_mid_data,
'layout': copy.deepcopy(infer_result._infer_res),
'info': pipe_res._pipe_res,
'content_list': content_list,
'md_content': md_content,
}
......
......@@ -11,9 +11,12 @@ from flask import current_app, url_for
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult
from ..extentions import app, db
from .ext import find_file
......@@ -25,25 +28,28 @@ model_config.__use_inside_model__ = True
def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
try:
model_json = []  # pass an empty list as model_json to parse with the built-in model
image_writer = FileBasedDataWriter(image_dir)
logger.info(f'is_ocr: {is_ocr}')
parse_method = 'ocr'
ds = PymuDocDataset(pdf_bytes)
# Choose parsing method
if not is_ocr:
jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
image_writer = FileBasedDataWriter(image_dir)
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
pipe.pipe_classify()
else:
jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
image_writer = FileBasedDataWriter(image_dir)
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
"""如果没有传入有效的模型数据,则使用内置model解析"""
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
if ds.classify() == SupportedPdfParseMethod.OCR:
parse_method = 'ocr'
else:
logger.error('need model list input')
exit(1)
pipe.pipe_parse()
pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
parse_method = 'txt'
if parse_method == 'ocr':
infer_result = ds.apply(doc_analyze, ocr=True)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
if parse_method == 'ocr':
pipe_res = infer_result.pipe_ocr_mode(image_writer)
else:
pipe_res = infer_result.pipe_txt_mode(image_writer)
pdf_mid_data = pipe_res._pipe_res
pdf_info_list = pdf_mid_data['pdf_info']
md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
ensure_ascii=False)
......@@ -52,7 +58,6 @@ def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
except Exception as e: # noqa: F841
logger.error(traceback.format_exc())
def get_bbox_info(data):
bbox_info = []
for page in data:
......
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect==0.2.0
fast-langdetect>=0.2.3
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pydantic>=2.7.2,<2.8.0
pydantic>=2.7.2
PyMuPDF>=1.24.9
scikit-learn>=1.0.2
torch>=2.2.2
......
......@@ -16,7 +16,7 @@ def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.0.0':
if config_version < '1.1.0':
data = download_json(url)
else:
data = download_json(url)
......
......@@ -16,7 +16,7 @@ def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.0.0':
if config_version < '1.1.0':
data = download_json(url)
else:
data = download_json(url)
......
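Note: both copies of download_and_modify_json gate the config refresh on a plain string comparison, which orders these particular versions correctly but would misorder multi-digit components ('1.10.0' sorts below '1.9.0' as a string). A more robust sketch, assuming the third-party packaging library is acceptable:

from packaging.version import Version

# parse the versions so that '1.10.0' compares greater than '1.9.0'
if Version(config_version) < Version('1.1.0'):
    data = download_json(url)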
......@@ -36,7 +36,7 @@ if __name__ == '__main__':
"paddlepaddle==3.0.0b1;platform_system=='Linux'",
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
],
"full": ["unimernet==0.2.2", # unimernet升级0.2.2,移除torchtext的依赖
"full": ["unimernet==0.2.3", # unimernet升级0.2.3,移除torchtext/eva-decord的依赖
"torch>=2.2.2,<=2.3.1", # torch2.4.0及之后版本未测试,先卡住版本上限
"torchvision>=0.17.2,<=0.18.1", # torchvision 受torch版本约束
"matplotlib<=3.9.0;platform_system=='Windows'", # 3.9.1及之后不提供windows的预编译包,避免一些没有编译环境的windows设备安装失败
......@@ -50,8 +50,10 @@ if __name__ == '__main__':
"accelerate", # struct-eqtable依赖
"doclayout_yolo==0.0.2", # doclayout_yolo
"rapidocr-paddle", # rapidocr-paddle
"rapid_table", # rapid_table
"rapidocr_onnxruntime",
"rapid_table==0.3.0", # rapid_table
"PyYAML", # yaml
"openai", # openai SDK
"detectron2"
],
"old_linux":[
......
......@@ -6,7 +6,8 @@ retry_count=0
while true; do
# prepare env
#python -m pip install -r requirements-qa.txt
python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
#python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
pip install -e .
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
pip install modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py
......
......@@ -2,7 +2,9 @@ import os
conf = {
"code_path": os.environ.get('GITHUB_WORKSPACE'),
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
#"code_path": "/home/quyuan/ci/actions-runner/MinerU",
#"pdf_dev_path": "/home/quyuan/ci/actions-runner/MinerU/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp/magic-pdf",
"jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl",
"s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test_rearch_report.pdf"
}
\ No newline at end of file
}
## Overweight (maintained)
Industry: machinery and equipment
Current price (CNY): 82.42
## Securities Analyst
倪正洋
Qualification No.: S0120521020003
Email: nizy@tebon.com.cn
## Research Assistant
杨云道
Email: yangyx@tebon.com.cn
| vs. CSI 300 | 1M | 2M | 3M |
| :--- | ---: | ---: | ---: |
| Absolute change (%) | 7.18 | 32.88 | 80.86 |
| Relative change (%) | 8.10 | 25.93 | 78.39 |
Source: 德邦研究所, 聚源数据
## Related Research
1. "高测股份 (688556): PV diamond wire and wafer-slicing tolling businesses drove 22Q1 results far above expectations", 2022.4.29
2. "PV equipment: high-efficiency cell capacity expansion accelerates, and key equipment makers each take the lead", 2022.4.10
3. "高测股份 (688556.SH): another 10GW of wafer-slicing tolling capacity signed in Jianhu, reinforcing the tolling growth story", 2022.4.7
4. "高测股份 (688556.SH): CNY 220 million slicing-equipment contract signed with JA Solar in Qujing; positive on 2022 tolling earnings and HJT slicing progress", 2022.3.9
5. "高测股份 (688556.SH): 2021 preliminary results beat market expectations; watch the release of slicing tolling profits", 2022.1.24
# 高测股份 (688556.SH): 40 million km diamond wire expansion strengthens the three-pronged PV wafer-slicing layout
## Investment Highlights
- Event: the company intends to sign an investment agreement with the Huguan County government for a 120-million-km/year diamond wire project in Huguan. Phase one plans 40 million km/year of diamond wire capacity with total investment of about CNY 666 million; the follow-on 80-million-km/year project has not been specifically agreed and remains highly uncertain.
- Riding downstream demand expansion, diamond wire capacity is ramping quickly to secure both internal supply and external sales. Diamond wire demand rises in 2022 for two reasons: 1) PV industry sentiment is running high, with January-May installations up 24.4% YoY, lifting utilization across the chain; earlier wafer capacity expansions are coming online, releasing diamond wire demand; 2) with polysilicon prices persistently high, the trends toward finer wire and thinner wafers are accelerating, with wire diameters moving from 40 μm and 38 μm toward 36 μm and 35 μm and wire consumption per GW rising accordingly. A 36 μm wire now consumes about 500,000 km per GW of slicing, roughly 30% more than a 38 μm wire. After its 2021 "one machine, twelve lines" retrofit, the company produced 7.12 million km in 22Q1, an annualized capacity above 25 million km. Its roughly 47GW of slicing tolling capacity corresponds to long-run diamond wire demand above 23 million km. This expansion enlarges diamond wire capacity once more, reinforcing the internal-supply-plus-external-sales layout.
- Low-cost electricity in Huguan lifts diamond wire margins, while continued fine-wire development keeps saving silicon. Electricity purchased at the Changzhi, Shanxi plant is cheaper on average than at the Qingdao plant, so in 2020 the company moved its Qingdao diamond wire lines to the Huguan site in Changzhi; as output there grows, the average purchased power price keeps falling, from 0.8 CNY/kWh in 2019 to 0.39 CNY/kWh in 22Q1, and further expansion at Huguan should lower wire power costs further. The finer the wire, the smaller the kerf and the less silicon lost in slicing, so a single ingot yields more wafers and each wafer needs less silicon. Under the same slicing process, finer wire carries smaller diamond grit, causing less surface damage, better wafer surface quality, and better TTV and other quality metrics. Mother-wire diameter has fallen from 80 μm in 2016 to 36/38/40 μm in 1H22; higher line speed, flexibility, and intelligent operation are further directions of progress, and the company leads in thin wafers, fine wire, high line speed, and flexible, intelligent slicing, driving continuous process improvement.
- Sustained leadership in slicing technology is the core moat behind profit release. The company's three-pronged PV slicing layout covers wafer slicing and machining equipment, slicing consumables (diamond wire), and slicing tolling. Building on its equipment and consumables base, it launched tolling in 2021 and has announced 47GW of capacity (the 5GW Leshan demonstration base, the 20GW Leshan large-wafer project with supporting facilities, and Jianhu phases one and two at 10GW and 12GW), with customers including 通威, 京运通, 美科, and cell makers around Jianhu. By end-2022 it expects over 20GW of tolling capacity, with end customers mainly downstream cell companies. Customers choose tolling to ramp quickly on the company's specialized service while capturing more excess profit than by building their own slicing capacity or buying wafers; that excess profit rests on 高测股份's slicing lead, which captures more of the slicing gains and shares them with customers. As diamond wire capacity expands and slicing technology advances, tolling profit elasticity should keep being released.
- Earnings forecast and view: we expect 2022-2024 net profit attributable to the parent of CNY 470 million, 720 million, and 930 million, implying 30x, 20x, and 15x PE; maintain "Overweight".
- Risks: wafer capacity expansion falling short of expectations, profit volatility in the tolling business, and intensifying competition.
<table><thead><tr><th>Stock data</th><th></th></tr></thead><tr><td>Total shares (million):</td><td>227.92</td></tr><tr><td>Free-float A shares (million):</td><td>167.01</td></tr><tr><td>52-week price range (CNY):</td><td>21.60-97.40</td></tr><tr><td>Total market cap (CNY million):</td><td>18,785.44</td></tr><tr><td>Total assets (CNY million):</td><td>3,508.81</td></tr><tr><td>Net assets per share (CNY):</td><td>5.50</td></tr><tr><td>Source: company announcements</td><td></td></tr></table>
<table><thead><tr><th>Key financials and forecasts</th><th></th><th></th><th></th><th></th><th></th></tr></thead><tr><td></td><td>2020</td><td>2021</td><td>2022E</td><td>2023E</td><td>2024E</td></tr><tr><td>Revenue (CNY million)</td><td>746</td><td>1,567</td><td>3,684</td><td>5,056</td><td>5,752</td></tr><tr><td>(+/-)YoY(%)</td><td>4.5%</td><td>110.0%</td><td>135.1%</td><td>37.2%</td><td>13.8%</td></tr><tr><td>Net profit (CNY million)</td><td>59</td><td>173</td><td>471</td><td>717</td><td>933</td></tr><tr><td>(+/-)YoY(%)</td><td>83.8%</td><td>193.4%</td><td>172.8%</td><td>52.2%</td><td>30.1%</td></tr><tr><td>Fully diluted EPS (CNY)</td><td>0.43</td><td>1.07</td><td>2.91</td><td>4.43</td><td>5.77</td></tr><tr><td>Gross margin (%)</td><td>35.3%</td><td>33.7%</td><td>35.0%</td><td>36.0%</td><td>38.0%</td></tr><tr><td>ROE (%)</td><td>6.0%</td><td>15.0%</td><td>27.9%</td><td>28.8%</td><td>26.5%</td></tr></table>
Source: company annual reports (2020-2021), 德邦研究所
Note: net profit is net profit attributable to owners of the parent.
## Financial Statement Analysis and Forecast
| Key financial metrics | 2021 | 2022E | 2023E | 2024E |
| :--- | ---: | ---: | ---: | ---: |
| Per-share metrics (CNY) | | | | |
| EPS | 1.07 | 2.91 | 4.43 | 5.77 |
| Net assets per share | 7.13 | 10.43 | 15.39 | 21.76 |
| Operating cash flow per share | 0.47 | 1.27 | 4.07 | 5.02 |
| Dividend per share | 0.11 | 0.11 | 0.11 | 0.11 |
| Valuation (x) | | | | |
| P/E | 82.90 | 30.47 | 20.02 | 15.38 |
| P/B | 12.44 | 8.50 | 5.76 | 4.08 |
| P/S | 8.52 | 3.62 | 2.64 | 2.32 |
| EV/EBITDA | 49.85 | 24.12 | 15.68 | 11.46 |
| Dividend yield % | 0.1% | 0.1% | 0.1% | 0.1% |
| Profitability (%) | | | | |
| Gross margin | 33.7% | 35.0% | 36.0% | 38.0% |
| Net margin | 11.0% | 12.8% | 14.2% | 16.2% |
| ROE | 15.0% | 27.9% | 28.8% | 26.5% |
| ROA | 5.3% | 7.9% | 8.5% | 9.2% |
| ROIC | 15.3% | 25.9% | 24.6% | 23.7% |
| Earnings growth (%) | | | | |
| Revenue growth | 110.0% | 135.1% | 37.2% | 13.8% |
| EBIT growth | 233.7% | 150.7% | 52.3% | 31.9% |
| Net profit growth | 193.4% | 172.8% | 52.2% | 30.1% |
| Solvency | | | | |
| Debt-to-asset ratio | 64.3% | 71.5% | 70.6% | 65.3% |
| Current ratio | 1.2 | 1.2 | 1.3 | 1.4 |
| Quick ratio | 0.9 | 0.9 | 1.0 | 1.1 |
| Cash ratio | 0.2 | 0.1 | 0.2 | 0.3 |
| Operating efficiency | | | | |
| Receivables turnover days | 161.7 | 165.1 | 164.9 | 164.4 |
| Inventory turnover days | 196.1 | 170.0 | 180.0 | 190.0 |
| Total asset turnover | 0.5 | 0.6 | 0.6 | 0.6 |
| Fixed asset turnover | 4.2 | 8.6 | 10.3 | 11.1 |
| Cash flow statement (CNY million) | 2021 | 2022E | 2023E | 2024E |
| :--- | ---: | ---: | ---: | ---: |
| Net profit | 173 | 471 | 717 | 933 |
| Minority interest | 0 | 0 | 0 | 0 |
| Non-cash items | 107 | 114 | 133 | 147 |
| Non-operating items | 17 | 1 | 4 | 14 |
| Working capital change | -220 | -382 | -195 | -283 |
| Operating cash flow | 76 | 205 | 658 | 812 |
| Assets | -83 | -184 | -203 | -169 |
| Investments | 229 | 0 | 0 | 0 |
| Other | 6 | 9 | 13 | 14 |
| Investing cash flow | 151 | -175 | -190 | -155 |
| Debt financing | -80 | 39 | 321 | 64 |
| Equity financing | 0 | 0 | 0 | 0 |
| Other | -21 | -3 | -14 | -25 |
| Financing cash flow | -101 | 36 | 307 | 39 |
| Net change in cash | 127 | 66 | 775 | 696 |
Note: valuation metrics in the table are based on the closing price of July 19.
Source: company annual reports (2020-2021), 德邦研究所
| Income statement (CNY million) | 2021 | 2022E | 2023E | 2024E |
| :---: | :---: | :---: | :---: | :---: |
| Total revenue | 1,567 | 3,684 | 5,056 | 5,752 |
| Cost of goods sold | 1,038 | 2,394 | 3,236 | 3,567 |
| Gross margin % | 33.7% | 35.0% | 36.0% | 38.0% |
| Taxes and surcharges | 6 | 18 | 25 | 29 |
| Tax rate % | 0.4% | 0.5% | 0.5% | 0.5% |
| Selling expenses | 63 | 147 | 193 | 209 |
| Selling expense ratio % | 4.0% | 4.0% | 3.8% | 3.6% |
| G&A expenses | 131 | 313 | 409 | 444 |
| G&A expense ratio % | 8.4% | 8.5% | 8.1% | 7.7% |
| R&D expenses | 117 | 276 | 379 | 431 |
| R&D expense ratio % | 7.5% | 7.5% | 7.5% | 7.5% |
| EBIT | 213 | 534 | 814 | 1,074 |
| Finance costs | 7 | 1 | 11 | 19 |
| Finance cost ratio % | 0.4% | 0.0% | 0.2% | 0.3% |
| Asset impairment losses | -33 | -63 | -86 | -98 |
| Investment income | 5 | 9 | 13 | 14 |
| Operating profit | 212 | 531 | 800 | 1,040 |
| Non-operating income/expense | -25 | -8 | -3 | -3 |
| Profit before tax | 187 | 523 | 797 | 1,037 |
| EBITDA | 282 | 582 | 865 | 1,129 |
| Income tax | 14 | 52 | 80 | 104 |
| Effective tax rate % | 7.7% | 10.0% | 10.0% | 10.0% |
| Minority interest | 0 | 0 | 0 | 0 |
| Net profit attributable to parent | 173 | 471 | 717 | 933 |
| Balance sheet (CNY million) | 2021 | 2022E | 2023E | 2024E |
| :---: | :---: | :---: | :---: | :---: |
| Cash | 427 | 494 | 1,269 | 1,965 |
| Accounts and notes receivable | 1,173 | 2,806 | 3,798 | 4,344 |
| Inventory | 558 | 1,115 | 1,596 | 1,857 |
| Other current assets | 266 | 578 | 736 | 778 |
| Total current assets | 2,424 | 4,992 | 7,400 | 8,943 |
| Long-term equity investments | 0 | 0 | 0 | 0 |
| Fixed assets | 370 | 429 | 491 | 516 |
| Construction in progress | 169 | 183 | 205 | 226 |
| Intangible assets | 42 | 56 | 69 | 80 |
| Total non-current assets | 811 | 940 | 1,087 | 1,198 |
| Total assets | 3,235 | 5,932 | 8,487 | 10,141 |
| Short-term borrowings | 28 | 68 | 388 | 452 |
| Accounts and notes payable | 1,401 | 3,197 | 4,302 | 4,760 |
| Advances from customers | 0 | 0 | 0 | 0 |
| Other current liabilities | 560 | 887 | 1,214 | 1,314 |
| Total current liabilities | 1,989 | 4,152 | 5,904 | 6,527 |
| Long-term borrowings | 0 | 0 | 0 | 0 |
| Other long-term liabilities | 92 | 92 | 92 | 92 |
| Total non-current liabilities | 92 | 92 | 92 | 92 |
| Total liabilities | 2,081 | 4,243 | 5,996 | 6,619 |
| Paid-in capital | 162 | 162 | 162 | 162 |
| Common shareholders' equity | 1,154 | 1,688 | 2,491 | 3,522 |
| Minority interest | 0 | 0 | 0 | 0 |
| Total liabilities and equity | 3,235 | 5,932 | 8,487 | 10,141 |
## Information Disclosure
## Analyst and Research Assistant Profiles
倪正洋 joined 德邦证券 in 2021 as head of the research institute's advanced-manufacturing group and chief machinery analyst, with five years of machinery research experience and one year in the high-end equipment industry; he holds a bachelor's degree in materials science from Nanjing University and a master's in materials science from Shanghai Jiao Tong University. He was iFinD's most popular machinery analyst in 2020, and his teams ranked third in the 2019 New Fortune machinery category, second in 2017 New Fortune, second in the 2017 Golden Bull Awards, and fourth in 2016 New Fortune.
## Analyst Statement
I hold the securities investment consulting qualification granted by the Securities Association of China and have prepared this report with professional diligence, independence, and objectivity. The data and information used here come from public market sources, and I do not guarantee their accuracy or completeness. The analysis reflects my own professional understanding, clearly and accurately presents my research views, and its conclusions are not subject to direction or influence by any third party. Hereby declared.
## Investment Rating Definitions
1. Rating comparison and standards:
Ratings are benchmarked to market performance in the six months after publication: the change in the company's share price (or the industry index) over that period relative to the benchmark market index;
2. Benchmark market indexes:
For A shares, the SSE Composite or SZSE Component Index; for Hong Kong, the Hang Seng Index; for the US, the S&P 500 or the Nasdaq Composite.
<table>
<tr>
<td rowspan="11">1. Rating comparison and standards: market performance in the six months after publication is the benchmark; the change in the company's share price (or the industry index) over that period is compared with the benchmark market index over the same period.<br> 2. Benchmark market indexes: SSE Composite or SZSE Component for A shares; Hang Seng for Hong Kong; S&P 500 or Nasdaq Composite for the US.</td>
</tr>
<tr>
<td>Type</td>
<td>Rating</td>
<td>Definition</td>
</tr>
<tr>
<td rowspan="5">Stock rating</td>
</tr>
<tr>
<td>Buy</td>
<td>Outperforms the market by more than 20%;</td>
</tr>
<tr>
<td>Overweight</td>
<td>Outperforms the market by 5%-20%;</td>
</tr>
<tr>
<td>Neutral</td>
<td>Performs within -5% to +5% of the market;</td>
</tr>
<tr>
<td>Underweight</td>
<td>Underperforms the market by more than 5%.</td>
</tr>
<tr>
<td rowspan="4">Industry rating</td>
</tr>
<tr>
<td>Outperform</td>
<td>Expected industry return more than 10% above the benchmark index;</td>
</tr>
<tr>
<td>Neutral</td>
<td>Expected industry return within -10% to +10% of the benchmark index;</td>
</tr>
<tr>
<td>Underperform</td>
<td>Expected industry return more than 10% below the benchmark index.</td>
</tr>
</table>
## Legal Disclaimer
This report is intended solely for clients of 德邦证券股份有限公司 (the "Company"). The Company does not treat a recipient as a client merely because they receive this report. Under no circumstances do the information or opinions expressed here constitute investment advice to anyone, and under no circumstances does the Company accept liability for any loss arising from any use of the contents of this report.
The materials, opinions, and estimates in this report reflect the Company's judgment only as of the publication date. The prices, values, and returns of the securities or investment targets referred to may fluctuate, and the Company may at different times publish reports inconsistent with this one.
Markets carry risk; invest with caution. The information, materials, and conclusions in this report are provided for specific clients' reference only, do not constitute investment advice, and take no account of any individual client's objectives, financial situation, or needs. Clients should consider whether any opinion or recommendation here suits their particular circumstances. Where permitted by law, 德邦证券 and its affiliates may hold and trade securities issued by companies mentioned in this report and may provide investment banking or other services to those companies.
This report is distributed only to specific clients. Without written authorization from 德邦证券研究所, no part of it may be copied, reproduced, or redistributed in any form to any other person, or used in any other way that infringes the Company's copyright. All trademarks, service marks, and marks used here belong to the Company. To quote or reprint this content, contact 德邦证券研究所 for permission, cite 德邦证券研究所 as the source, and do not quote or alter it contrary to its original meaning.
Under the securities business license issued by the China Securities Regulatory Commission, the business scope of 德邦证券股份有限公司 includes securities investment consulting.
\ No newline at end of file
import pytest
import json
import os
import shutil
from conf import conf
import os
import json
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from lib import calculate_score
import shutil
pdf_res_path = conf.conf["pdf_res_path"]
code_path = conf.conf["code_path"]
pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
class TestCliCuda:
"""
test cli cuda
"""
"""test cli cuda."""
def test_pdf_sdk_cuda(self):
"""
pdf sdk cuda
"""
"""pdf sdk cuda."""
clean_magicpdf(pdf_res_path)
pdf_to_markdown()
fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
fr = open(os.path.join(pdf_dev_path, 'result.json'), 'r', encoding='utf-8')
lines = fr.readlines()
last_line = lines[-1].strip()
last_score = json.loads(last_line)
last_simscore = last_score["average_sim_score"]
last_editdistance = last_score["average_edit_distance"]
last_bleu = last_score["average_bleu_score"]
os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
last_simscore = last_score['average_sim_score']
last_editdistance = last_score['average_edit_distance']
last_bleu = last_score['average_bleu_score']
os.system(f'python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}')
now_score = get_score()
print ("now_score:", now_score)
if not os.path.exists(os.path.join(pdf_dev_path, "ci")):
os.makedirs(os.path.join(pdf_dev_path, "ci"), exist_ok=True)
fw = open(os.path.join(pdf_dev_path, "ci", "result.json"), "w+", encoding="utf-8")
fw.write(json.dumps(now_score) + "\n")
now_simscore = now_score["average_sim_score"]
now_editdistance = now_score["average_edit_distance"]
now_bleu = now_score["average_bleu_score"]
print ('now_score:', now_score)
if not os.path.exists(os.path.join(pdf_dev_path, 'ci')):
os.makedirs(os.path.join(pdf_dev_path, 'ci'), exist_ok=True)
fw = open(os.path.join(pdf_dev_path, 'ci', 'result.json'), 'w+', encoding='utf-8')
fw.write(json.dumps(now_score) + '\n')
now_simscore = now_score['average_sim_score']
now_editdistance = now_score['average_edit_distance']
now_bleu = now_score['average_bleu_score']
assert last_simscore <= now_simscore
assert last_editdistance <= now_editdistance
assert last_bleu <= now_bleu
def pdf_to_markdown():
"""
pdf to md
"""
"""pdf to md."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, "pdf")
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, "pdf", f"{demo_name}.pdf")
cmd = "magic-pdf pdf-command --pdf %s --inside_model true" % (pdf_path)
os.system(cmd)
dir_path = os.path.join(pdf_dev_path, "mineru")
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
cmd = 'magic-pdf pdf-command --pdf %s --inside_model true' % (pdf_path)
os.system(cmd)
dir_path = os.path.join(pdf_dev_path, 'mineru')
if not os.path.exists(dir_path):
os.makedirs(dir_path, exist_ok=True)
res_path = os.path.join(dir_path, f"{demo_name}.md")
src_path = os.path.join(pdf_res_path, demo_name, "auto", f"{demo_name}.md")
res_path = os.path.join(dir_path, f'{demo_name}.md')
src_path = os.path.join(pdf_res_path, demo_name, 'auto', f'{demo_name}.md')
shutil.copy(src_path, res_path)
def get_score():
"""
get score
"""
score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))
score.calculate_similarity_total("mineru", pdf_dev_path)
"""get score."""
score = calculate_score.Scoring(os.path.join(pdf_dev_path, 'result.json'))
score.calculate_similarity_total('mineru', pdf_dev_path)
res = score.summary_scores()
return res
def clean_magicpdf(pdf_res_path):
"""
clean magicpdf
"""
cmd = "rm -rf %s" % (pdf_res_path)
"""clean magicpdf."""
cmd = 'rm -rf %s' % (pdf_res_path)
os.system(cmd)
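clean_magicpdf above shells out to rm -rf, which is Unix-only and silently ignores failures; a portable sketch of the same cleanup (an alternative, not part of this change) using only the standard library:

import shutil

def clean_magicpdf_portable(pdf_res_path):
    """Remove the result directory tree without spawning a shell."""
    shutil.rmtree(pdf_res_path, ignore_errors=True)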
......@@ -6,13 +6,14 @@ from conf import conf
from lib import common
import time
import magic_pdf.model as model_config
from magic_pdf.pipe.UNIPipe import UNIPipe
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.read_api import read_local_images
from magic_pdf.data.read_api import read_local_office
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
model_config.__use_inside_model__ = True
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
......@@ -31,7 +32,7 @@ class TestCli:
yield
@pytest.mark.P0
def test_pdf_auto_sdk(self):
def test_pdf_local_sdk(self):
"""pdf sdk auto test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf')
......@@ -40,35 +41,52 @@ class TestCli:
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
print(pdf_path)
pdf_bytes = open(pdf_path, 'rb').read()
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = FileBasedDataWriter(local_image_dir)
model_json = list()
jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
else:
exit(1)
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
if not os.path.exists(dir_path):
os.makedirs(dir_path, exist_ok=True)
res_path = os.path.join(dir_path, f'{demo_name}.md')
common.delete_file(res_path)
with open(res_path, 'w+', encoding='utf-8') as f:
f.write(md_content)
common.sdk_count_folders_and_check_contents(res_path)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_path)
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
common.delete_file(dir_path)
### draw model result on each page
infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
### draw spans result on each page
pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
### dump markdown
md_content = pipe_result.get_markdown(image_dir)
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_ocr_sdk(self):
"""pdf sdk ocr test."""
time.sleep(2)
def test_pdf_s3_sdk(self):
"""pdf s3 sdk test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
......@@ -76,66 +94,97 @@ class TestCli:
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
print(pdf_path)
pdf_bytes = open(pdf_path, 'rb').read()
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = FileBasedDataWriter(local_image_dir)
model_json = list()
jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
else:
exit(1)
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
if not os.path.exists(dir_path):
os.makedirs(dir_path, exist_ok=True)
res_path = os.path.join(dir_path, f'{demo_name}.md')
common.delete_file(res_path)
with open(res_path, 'w+', encoding='utf-8') as f:
f.write(md_content)
common.sdk_count_folders_and_check_contents(res_path)
pass
@pytest.mark.P0
def test_pdf_txt_sdk(self):
"""pdf sdk txt test."""
time.sleep(2)
def test_pdf_local_ppt(self):
"""pdf sdk auto test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf')
pdf_path = os.path.join(pdf_dev_path, 'ppt')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
if pdf_file.endswith('.pptx'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
pdf_bytes = open(pdf_path, 'rb').read()
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
pdf_path = os.path.join(pdf_dev_path, 'ppt', f'{demo_name}.pptx')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = FileBasedDataWriter(local_image_dir)
model_json = list()
jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
else:
exit(1)
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
name_without_suff = os.path.basename(pdf_path).split(".pptx")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
if not os.path.exists(dir_path):
os.makedirs(dir_path, exist_ok=True)
res_path = os.path.join(dir_path, f'{demo_name}.md')
common.delete_file(res_path)
with open(res_path, 'w+', encoding='utf-8') as f:
f.write(md_content)
common.sdk_count_folders_and_check_contents(res_path)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
ds = read_local_office(pdf_path)[0]
common.delete_file(dir_path)
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_local_image(self):
"""pdf sdk auto test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'images')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.jpg'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'images', f'{demo_name}.jpg')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".jpg")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
common.delete_file(dir_path)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
ds = read_local_images(pdf_path)[0]
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir)
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_local_image_dir(self):
"""local image dir."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'images')
dir_path = os.path.join(pdf_dev_path, 'mineru')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
common.delete_file(dir_path)
dss = read_local_images(pdf_path, suffixes=['.png', '.jpg'])
count = 0
for ds in dss:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{count}.md", image_dir)
count += 1
common.sdk_count_folders_and_check_contents(dir_path)
def test_local_doc_parse(self):
"""
doc 解析
"""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'doc')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.docx'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'doc', f'{demo_name}.docx')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".docx")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
ds = read_local_office(pdf_path)[0]
common.delete_file(dir_path)
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_cli_auto(self):
"""magic_pdf cli test auto."""
......@@ -154,7 +203,7 @@ class TestCli:
os.system(cmd)
common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'auto'))
@pytest.mark.P0
def test_pdf_cli_txt(self):
"""magic_pdf cli test txt."""
......@@ -274,31 +323,6 @@ class TestCli:
logging.info(cmd)
os.system(cmd)
@pytest.mark.P1
def test_s3_sdk_suto(self):
"""
test s3 sdk auto.
"""
time.sleep(2)
pdf_ak = os.getenv('pdf_ak')
print (pdf_ak)
pdf_sk = os.environ.get('pdf_sk', "")
pdf_bucket = os.environ.get('bucket', "")
pdf_endpoint = os.environ.get('pdf_endpoint', "")
s3_pdf_path = conf.conf["s3_pdf_path"]
image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
prefix = "mineru/test/output"
reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
# = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
pdf_bytes = reader.read(s3_pdf_path)
model_list = []
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
assert len(md_content) > 0
@pytest.mark.P1
def test_local_magic_pdf_open_st_table(self):
......