Unverified Commit 0d5a1c74 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1292 from dt-yy/dev

parents 379cf150 905c4ae5
...@@ -2,6 +2,8 @@ import os ...@@ -2,6 +2,8 @@ import os
conf = { conf = {
"code_path": os.environ.get('GITHUB_WORKSPACE'), "code_path": os.environ.get('GITHUB_WORKSPACE'),
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev", "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
#"code_path": "/home/quyuan/ci/actions-runner/MinerU",
#"pdf_dev_path": "/home/quyuan/ci/actions-runner/MinerU/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp/magic-pdf", "pdf_res_path": "/tmp/magic-pdf",
"jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl", "jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl",
"s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test_rearch_report.pdf" "s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test_rearch_report.pdf"
......
...@@ -12,6 +12,10 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter ...@@ -12,6 +12,10 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# Enable the in-package (bundled) model path for the SDK pipelines below.
model_config.__use_inside_model__ = True
# Resolve frequently used paths once from the shared test configuration module.
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
...@@ -40,101 +44,29 @@ class TestCli: ...@@ -40,101 +44,29 @@ class TestCli:
demo_names.append(pdf_file.split('.')[0]) demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names: for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf') pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
print(pdf_path)
pdf_bytes = open(pdf_path, 'rb').read()
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images') local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
image_dir = str(os.path.basename(local_image_dir)) image_dir = str(os.path.basename(local_image_dir))
image_writer = FileBasedDataWriter(local_image_dir)
model_json = list()
jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
else:
exit(1)
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
dir_path = os.path.join(pdf_dev_path, 'mineru')
if not os.path.exists(dir_path):
os.makedirs(dir_path, exist_ok=True)
res_path = os.path.join(dir_path, f'{demo_name}.md')
common.delete_file(res_path)
with open(res_path, 'w+', encoding='utf-8') as f:
f.write(md_content)
common.sdk_count_folders_and_check_contents(res_path)
@pytest.mark.P0
def test_pdf_ocr_sdk(self):
    """Run the UNIPipe SDK in forced-OCR mode over every PDF in the dev set.

    For each ``*.pdf`` under ``<pdf_dev_path>/pdf`` the pipeline is classified,
    analyzed (using the inside model when enabled), parsed, and rendered to
    markdown, which is written to ``<pdf_dev_path>/mineru/<name>.md`` and then
    validated via ``common.sdk_count_folders_and_check_contents``.
    """
    time.sleep(2)
    demo_names = list()
    pdf_path = os.path.join(pdf_dev_path, 'pdf')
    for pdf_file in os.listdir(pdf_path):
        if pdf_file.endswith('.pdf'):
            demo_names.append(pdf_file.split('.')[0])
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
        print(pdf_path)
        # Use a context manager so the file handle is closed promptly
        # (the original `open(...).read()` leaked the handle).
        with open(pdf_path, 'rb') as pdf_file_obj:
            pdf_bytes = pdf_file_obj.read()
        local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        image_writer = FileBasedDataWriter(local_image_dir)
        model_json = list()
        # '_pdf_type': 'ocr' forces the OCR parsing branch of UNIPipe.
        jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()
        if len(model_json) == 0:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                # Fail this test instead of exit(1), which would abort the
                # entire pytest session.
                pytest.fail('no model output and inside model is disabled')
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        # exist_ok=True already tolerates a pre-existing directory; the
        # separate os.path.exists() check was redundant.
        os.makedirs(dir_path, exist_ok=True)
        res_path = os.path.join(dir_path, f'{demo_name}.md')
        common.delete_file(res_path)
        # 'w' suffices: the file is only written, never read back here.
        with open(res_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        common.sdk_count_folders_and_check_contents(res_path)
## pipeline
@pytest.mark.P0
def test_pdf_txt_sdk(self):
    """Run the UNIPipe SDK in txt (non-OCR) mode over every PDF in the dev set.

    Mirrors ``test_pdf_ocr_sdk`` but forces the text-extraction branch via
    ``'_pdf_type': 'txt'``. Output markdown goes to
    ``<pdf_dev_path>/mineru/<name>.md`` and is validated via
    ``common.sdk_count_folders_and_check_contents``.
    """
    time.sleep(2)
    demo_names = list()
    pdf_path = os.path.join(pdf_dev_path, 'pdf')
    for pdf_file in os.listdir(pdf_path):
        if pdf_file.endswith('.pdf'):
            demo_names.append(pdf_file.split('.')[0])
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
        # Use a context manager so the file handle is closed promptly
        # (the original `open(...).read()` leaked the handle).
        with open(pdf_path, 'rb') as pdf_file_obj:
            pdf_bytes = pdf_file_obj.read()
        local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        image_writer = FileBasedDataWriter(local_image_dir)
        model_json = list()
        # '_pdf_type': 'txt' forces the text-extraction branch of UNIPipe.
        jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()
        if len(model_json) == 0:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                # Fail this test instead of exit(1), which would abort the
                # entire pytest session.
                pytest.fail('no model output and inside model is disabled')
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        # exist_ok=True already tolerates a pre-existing directory; the
        # separate os.path.exists() check was redundant.
        os.makedirs(dir_path, exist_ok=True)
        res_path = os.path.join(dir_path, f'{demo_name}.md')
        common.delete_file(res_path)
        # 'w' suffices: the file is only written, never read back here.
        with open(res_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        common.sdk_count_folders_and_check_contents(res_path)
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_cli_auto(self): def test_pdf_cli_auto(self):
...@@ -274,8 +206,9 @@ class TestCli: ...@@ -274,8 +206,9 @@ class TestCli:
logging.info(cmd) logging.info(cmd)
os.system(cmd) os.system(cmd)
@pytest.mark.P1 @pytest.mark.P1
def test_s3_sdk_suto(self): def test_s3_sdk_auto(self):
""" """
test s3 sdk auto. test s3 sdk auto.
""" """
...@@ -289,16 +222,46 @@ class TestCli: ...@@ -289,16 +222,46 @@ class TestCli:
image_dir = "s3://" + pdf_bucket + "/mineru/test/output" image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
prefix = "mineru/test/output" prefix = "mineru/test/output"
reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint) reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
# = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint) # = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint) image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
pdf_bytes = reader.read(s3_pdf_path) local_dir = "output"
model_list = [] name_without_suff = os.path.basename(s3_pdf_path).split(".")[0]
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify() # read bytes
pipe.pipe_analyze() pdf_bytes = reader.read(s3_pdf_path) # read the pdf content
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") # proc
assert len(md_content) > 0 ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
### dump content list
pipe_result.dump_content_list(writer, f"{name_without_suff}_content_list.json", image_dir)
@pytest.mark.P1 @pytest.mark.P1
def test_local_magic_pdf_open_st_table(self): def test_local_magic_pdf_open_st_table(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment