Commit 8df8737e authored by quyuan's avatar quyuan
Browse files

fix: add magic-pdf-dev case

parent 20212a37
...@@ -659,3 +659,7 @@ specific requirements. ...@@ -659,3 +659,7 @@ specific requirements.
if any, to sign a "copyright disclaimer" for the program, if necessary. if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>. <https://www.gnu.org/licenses/>.
$^1$
\ No newline at end of file
import os
import shutil
def move_pdfs(root_folder, destination_folder):
    """Move every PDF found under ``root_folder`` into ``destination_folder``.

    The source tree is walked recursively and flattened: each PDF ends up
    directly inside ``destination_folder``, whatever sub-directory it came
    from. The extension check is case-insensitive.

    Args:
        root_folder: Directory whose tree is searched for PDF files.
        destination_folder: Directory receiving the files. It is created
            when it does not exist yet.

    Returns:
        None. One ``Moved ...`` line is printed per moved file.
    """
    os.makedirs(destination_folder, exist_ok=True)
    destination_real = os.path.realpath(destination_folder)

    for root, dirs, files in os.walk(root_folder):
        # When the destination lives inside the source tree, do not walk into
        # it: its files are already where they belong.
        dirs[:] = [
            name for name in dirs
            if os.path.realpath(os.path.join(root, name)) != destination_real
        ]
        if os.path.realpath(root) == destination_real:
            continue

        for file in files:
            if not file.lower().endswith('.pdf'):
                continue
            src_path = os.path.join(root, file)
            dst_path = os.path.join(destination_folder, file)

            # Flattening can produce name clashes (same file name in two
            # sub-directories). Pick a free "<stem>_<n><ext>" name instead of
            # overwriting the file that was moved earlier.
            stem, ext = os.path.splitext(file)
            counter = 1
            while os.path.exists(dst_path):
                dst_path = os.path.join(
                    destination_folder, f'{stem}_{counter}{ext}')
                counter += 1

            shutil.move(src_path, dst_path)
            print(f'Moved {file} to {destination_folder}')
# Example usage
# Source folder whose tree is scanned for PDFs
root_folder = r'D:\mineru\datasets\datasets'
# Target folder the PDFs are moved into
destination_folder = r'D:\mineru\datasets\pdf'

# Create the target folder first when it is not there yet
destination_present = os.path.exists(destination_folder)
if not destination_present:
    os.makedirs(destination_folder)

move_pdfs(root_folder, destination_folder)
\ No newline at end of file
File added
...@@ -8,7 +8,8 @@ while true; do ...@@ -8,7 +8,8 @@ while true; do
# prepare env # prepare env
source activate MinerU source activate MinerU
pip install -r requirements-qa.txt pip install -r requirements-qa.txt
pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple pip uninstall magic-pdf
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
exit_code=$? exit_code=$?
if [ $exit_code -eq 0 ]; then if [ $exit_code -eq 0 ]; then
......
...@@ -2,6 +2,6 @@ import os ...@@ -2,6 +2,6 @@ import os
conf = { conf = {
"code_path": os.environ.get('GITHUB_WORKSPACE'), "code_path": os.environ.get('GITHUB_WORKSPACE'),
"pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev", "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
"pdf_res_path": "/tmp/magic-pdf" "pdf_res_path": "/tmp/magic-pdf",
} "jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl"
}
\ No newline at end of file
This diff is collapsed.
...@@ -178,6 +178,56 @@ class TestCli: ...@@ -178,6 +178,56 @@ class TestCli:
common.cli_count_folders_and_check_contents( common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'ocr')) os.path.join(res_path, demo_name, 'ocr'))
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_txt(self):
    """Run ``magic-pdf-dev`` on the local ``line1.jsonl`` with method ``txt``.

    The test fails when the command exits with a non-zero status.
    """
    jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
    cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, "txt")
    logging.info(cmd)
    # os.system returns the command's exit status; without checking it the
    # test would pass even if the CLI failed.
    exit_code = os.system(cmd)
    assert exit_code == 0, 'command exited with status %s: %s' % (exit_code, cmd)
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_ocr(self):
    """Run ``magic-pdf-dev`` on the local ``line1.jsonl`` with method ``ocr``.

    The test fails when the command exits with a non-zero status.
    """
    jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
    cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'ocr')
    logging.info(cmd)
    # os.system returns the command's exit status; without checking it the
    # test would pass even if the CLI failed.
    exit_code = os.system(cmd)
    assert exit_code == 0, 'command exited with status %s: %s' % (exit_code, cmd)
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_auto(self):
    """Run ``magic-pdf-dev`` on the local ``line1.jsonl`` with method ``auto``.

    The test fails when the command exits with a non-zero status.
    """
    jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
    cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'auto')
    logging.info(cmd)
    # os.system returns the command's exit status; without checking it the
    # test would pass even if the CLI failed.
    exit_code = os.system(cmd)
    assert exit_code == 0, 'command exited with status %s: %s' % (exit_code, cmd)
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_txt(self):
    """Run ``magic-pdf-dev`` with method ``txt`` for the s3 case.

    The test fails when the command exits with a non-zero status.
    """
    # NOTE(review): despite the "s3" name this passes the same local
    # line1.jsonl path as the local tests - confirm whether an S3 JSONL
    # location was meant to be used here.
    jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
    cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, "txt")
    logging.info(cmd)
    # os.system returns the command's exit status; without checking it the
    # test would pass even if the CLI failed.
    exit_code = os.system(cmd)
    assert exit_code == 0, 'command exited with status %s: %s' % (exit_code, cmd)
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_ocr(self):
    """Run ``magic-pdf-dev`` with method ``ocr`` for the s3 case.

    The test fails when the command exits with a non-zero status.
    """
    # NOTE(review): despite the "s3" name this passes the same local
    # line1.jsonl path as the local tests - confirm whether an S3 JSONL
    # location was meant to be used here.
    jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
    cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'ocr')
    logging.info(cmd)
    # os.system returns the command's exit status; without checking it the
    # test would pass even if the CLI failed.
    exit_code = os.system(cmd)
    assert exit_code == 0, 'command exited with status %s: %s' % (exit_code, cmd)
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_auto(self):
    """Run ``magic-pdf-dev`` with method ``auto`` for the s3 case.

    The test fails when the command exits with a non-zero status.
    """
    # NOTE(review): despite the "s3" name this passes the same local
    # line1.jsonl path as the local tests - confirm whether an S3 JSONL
    # location was meant to be used here.
    jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
    # This case spells the option as --method where the sibling tests use -m.
    cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
    logging.info(cmd)
    # os.system returns the command's exit status; without checking it the
    # test would pass even if the CLI failed.
    exit_code = os.system(cmd)
    assert exit_code == 0, 'command exited with status %s: %s' % (exit_code, cmd)
if __name__ == '__main__': if __name__ == '__main__':
pytest.main() pytest.main()
""" """
bench test performance
""" """
import os import os
import shutil import shutil
...@@ -12,34 +12,16 @@ code_path = os.environ.get('GITHUB_WORKSPACE') ...@@ -12,34 +12,16 @@ code_path = os.environ.get('GITHUB_WORKSPACE')
pdf_dev_path = conf.conf["pdf_dev_path"] pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf["pdf_res_path"] pdf_res_path = conf.conf["pdf_res_path"]
class TestBench(): class TestTable():
""" """
test bench test table
""" """
def test_ci_ben(self): def test_perf_close_table(self):
""" """
ci benchmark test perf when close table
""" """
fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
lines = fr.readlines()
last_line = lines[-1].strip()
last_score = json.loads(last_line)
last_simscore = last_score["average_sim_score"]
last_editdistance = last_score["average_edit_distance"]
last_bleu = last_score["average_bleu_score"]
os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
now_score = get_score()
print ("now_score:", now_score)
if not os.path.exists(os.path.join(pdf_dev_path, "ci")):
os.makedirs(os.path.join(pdf_dev_path, "ci"), exist_ok=True)
fw = open(os.path.join(pdf_dev_path, "ci", "result.json"), "w+", encoding="utf-8")
fw.write(json.dumps(now_score) + "\n")
now_simscore = now_score["average_sim_score"]
now_editdistance = now_score["average_edit_distance"]
now_bleu = now_score["average_bleu_score"]
assert last_simscore <= now_simscore
assert last_editdistance <= now_editdistance
assert last_bleu <= now_bleu
def get_score(): def get_score():
......
"""
test table case
"""
import os
import shutil
import json
from lib import calculate_score
import pytest
from conf import conf
# Value of the GITHUB_WORKSPACE environment variable (None when it is unset).
code_path = os.getenv('GITHUB_WORKSPACE')
# Both paths are read from the shared test configuration mapping.
pdf_dev_path, pdf_res_path = (
    conf.conf[key] for key in ("pdf_dev_path", "pdf_res_path")
)
class TestTable:
    """Table test cases.

    Every method below is a placeholder: it contains only a docstring and
    performs no checks yet.
    """

    def test_paddle_table_master_cuda(self):
        """Table model: paddle table master; mode: cuda."""

    def test_paddle_table_master_cpu(self):
        """Table model: paddle table master; mode: cpu."""

    def test_st_table_cuda(self):
        """Table model: ST; mode: cuda."""

    def test_st_table_cpu(self):
        """Table model: ST; mode: cpu."""

    def test_close_table_cuda(self):
        """Table recognition closed; mode: cuda."""
def get_score():
    """Compute and return the score summary for the "mineru" tool.

    A ``calculate_score.Scoring`` object is built on ``result.json`` inside
    ``pdf_dev_path``, the similarity totals are calculated for "mineru", and
    the value of ``summary_scores()`` is returned.
    """
    result_file = os.path.join(pdf_dev_path, "result.json")
    scorer = calculate_score.Scoring(result_file)
    scorer.calculate_similarity_total("mineru", pdf_dev_path)
    return scorer.summary_scores()
...@@ -4,10 +4,10 @@ from magic_pdf.model.ppTableModel import ppTableModel ...@@ -4,10 +4,10 @@ from magic_pdf.model.ppTableModel import ppTableModel
class TestppTableModel(unittest.TestCase): class TestppTableModel(unittest.TestCase):
def test_image2html(self): def test_image2html(self):
img = Image.open("tests/test_table/assets/table.jpg") img = Image.open("tests/unittest/test_table/assets/table.jpg")
# 修改table模型路径 # 修改table模型路径
config = {"device": "cuda", config = {"device": "cuda",
"model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"} "model_dir": "/home/quyuan/PDF-Extract-Kit/models/TabRec/TableMaster"}
table_model = ppTableModel(config) table_model = ppTableModel(config)
res = table_model.img2html(img) res = table_model.img2html(img)
true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n""" true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 
</td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment