Merge pull request #3026 from Sidney233/dev

Dev

Merge pull request #3026 from Sidney233/dev
Dev
85a4750d · Xiaomeng Zhao · GitHub · 206ed770 · a7e75dc0 · 206ed770
Unverified Commit 85a4750d authored Jul 16, 2025 by Xiaomeng Zhao Committed by GitHub Jul 16, 2025
20 changed files
--- a/tests/test_cli/pdf_dev/test_model.json
+++ b/tests/test_cli/pdf_dev/test_model.json
--- a/tests/test_cli/test_bench.py
+++ b/tests/test_cli/test_bench.py
-"""
-bench
-"""
-import os
-import shutil
-import json
-from lib import calculate_score
-import pytest
-from conf import conf
-
-code_path = os.environ.get('GITHUB_WORKSPACE')
-pdf_dev_path = conf.conf["pdf_dev_path"]
-pdf_res_path = conf.conf["pdf_res_path"]
-
-class TestBench():
-    """
-    test bench
-    """
-    def test_ci_ben(self):
-        """
-        ci benchmark
-        """
-        fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
-        lines = fr.readlines()
-        last_line = lines[-1].strip()
-        last_score = json.loads(last_line)
-        last_simscore = last_score["average_sim_score"]
-        last_editdistance = last_score["average_edit_distance"]
-        last_bleu = last_score["average_bleu_score"]
-        os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
-        now_score = get_score()
-        print ("now_score:", now_score)
-        if not os.path.exists(os.path.join(pdf_dev_path, "ci")):
-            os.makedirs(os.path.join(pdf_dev_path, "ci"), exist_ok=True)
-        fw = open(os.path.join(pdf_dev_path, "ci", "result.json"), "w+", encoding="utf-8")
-        fw.write(json.dumps(now_score) + "\n")
-        now_simscore = now_score["average_sim_score"]
-        now_editdistance = now_score["average_edit_distance"]
-        now_bleu = now_score["average_bleu_score"]
-        assert last_simscore <= now_simscore
-        assert last_editdistance <= now_editdistance
-        assert last_bleu <= now_bleu
-
-
-def get_score():
-    """
-    get score
-    """
-    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))
-    score.calculate_similarity_total("mineru", pdf_dev_path)
-    res = score.summary_scores()
-    return res
-
-
--- a/tests/test_cli/test_bench_gpu.py
+++ b/tests/test_cli/test_bench_gpu.py
-
-import json
-import os
-import shutil
-
-from conf import conf
-from lib import calculate_score
-
-pdf_res_path = conf.conf['pdf_res_path']
-code_path = conf.conf['code_path']
-pdf_dev_path = conf.conf['pdf_dev_path']
-class TestCliCuda:
-    """test cli cuda."""
-    def test_pdf_sdk_cuda(self):
-        """pdf sdk cuda."""
-        clean_magicpdf(pdf_res_path)
-        pdf_to_markdown()
-        fr = open(os.path.join(pdf_dev_path, 'result.json'), 'r', encoding='utf-8')
-        lines = fr.readlines()
-        last_line = lines[-1].strip()
-        last_score = json.loads(last_line)
-        last_simscore = last_score['average_sim_score']
-        last_editdistance = last_score['average_edit_distance']
-        last_bleu = last_score['average_bleu_score']
-        os.system(f'python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}')
-        now_score = get_score()
-        print ('now_score:', now_score)
-        if not os.path.exists(os.path.join(pdf_dev_path, 'ci')):
-            os.makedirs(os.path.join(pdf_dev_path, 'ci'), exist_ok=True)
-        fw = open(os.path.join(pdf_dev_path, 'ci', 'result.json'), 'w+', encoding='utf-8')
-        fw.write(json.dumps(now_score) + '\n')
-        now_simscore = now_score['average_sim_score']
-        now_editdistance = now_score['average_edit_distance']
-        now_bleu = now_score['average_bleu_score']
-        assert last_simscore <= now_simscore
-        assert last_editdistance <= now_editdistance
-        assert last_bleu <= now_bleu
-
-def pdf_to_markdown():
-    """pdf to md."""
-    demo_names = list()
-    pdf_path = os.path.join(pdf_dev_path, 'pdf')
-    for pdf_file in os.listdir(pdf_path):
-        if pdf_file.endswith('.pdf'):
-            demo_names.append(pdf_file.split('.')[0])
-    for demo_name in demo_names:
-        pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
-        cmd = 'magic-pdf pdf-command --pdf %s --inside_model true' % (pdf_path)
-        os.system(cmd)
-        dir_path = os.path.join(pdf_dev_path, 'mineru')
-        if not os.path.exists(dir_path):
-            os.makedirs(dir_path, exist_ok=True)
-        res_path = os.path.join(dir_path, f'{demo_name}.md')
-        src_path = os.path.join(pdf_res_path, demo_name, 'auto', f'{demo_name}.md')
-        shutil.copy(src_path, res_path)
-
-
-
-def get_score():
-    """get score."""
-    score = calculate_score.Scoring(os.path.join(pdf_dev_path, 'result.json'))
-    score.calculate_similarity_total('mineru', pdf_dev_path)
-    res = score.summary_scores()
-    return res
-
-
-def clean_magicpdf(pdf_res_path):
-    """clean magicpdf."""
-    cmd = 'rm -rf %s' % (pdf_res_path)
-    os.system(cmd)
--- a/tests/test_cli/test_cli_sdk.py
+++ b/tests/test_cli/test_cli_sdk.py
-"""test cli and sdk."""
-import logging
-import os
-import pytest
-from conf import conf
-from lib import common
-import time
-import magic_pdf.model as model_config
-from magic_pdf.data.read_api import read_local_images
-from magic_pdf.data.read_api import read_local_office
-from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.config.enums import SupportedPdfParseMethod
-pdf_res_path = conf.conf['pdf_res_path']
-code_path = conf.conf['code_path']
-pdf_dev_path = conf.conf['pdf_dev_path']
-magic_pdf_config = "/home/quyuan/magic-pdf.json"
-
-class TestCli:
-    """test cli."""
-    @pytest.fixture(autouse=True)
-    def setup(self):
-        """
-        init
-        """
-        common.clear_gpu_memory()
-        common.update_config_file(magic_pdf_config, "device-mode", "cuda")
-        # 这里可以添加任何前置操作
-        yield
-
-    @pytest.mark.P0
-    def test_pdf_local_sdk(self):
-        """pdf sdk auto test."""
-        demo_names = list()
-        pdf_path = os.path.join(pdf_dev_path, 'pdf')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.pdf'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
-            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
-            image_dir = str(os.path.basename(local_image_dir))
-            name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
-            dir_path = os.path.join(pdf_dev_path, 'mineru')
-            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
-            reader1 = FileBasedDataReader("")
-            pdf_bytes = reader1.read(pdf_path)
-            ds = PymuDocDataset(pdf_bytes)
-            ## inference
-            if ds.classify() == SupportedPdfParseMethod.OCR:
-                infer_result = ds.apply(doc_analyze, ocr=True)
-                ## pipeline
-                pipe_result = infer_result.pipe_ocr_mode(image_writer)
-            else:
-                infer_result = ds.apply(doc_analyze, ocr=False)
-                ## pipeline
-                pipe_result = infer_result.pipe_txt_mode(image_writer)
-            common.delete_file(dir_path)
-            ### draw model result on each page
-            infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
-
-            ### get model inference result
-            model_inference_result = infer_result.get_infer_res()
-
-            ### draw layout result on each page
-            pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
-
-            ### draw spans result on each page
-            pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
-
-            ### dump markdown
-            md_content = pipe_result.get_markdown(image_dir)
-            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-            ### get content list content
-            content_list_content = pipe_result.get_content_list(image_dir)
-            pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
-            
-            ### get middle json
-            middle_json_content = pipe_result.get_middle_json()
-            ### dump middle json
-            pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
-            common.sdk_count_folders_and_check_contents(dir_path)
-
-    @pytest.mark.P0
-    def test_pdf_s3_sdk(self):
-        """pdf s3 sdk test."""
-        demo_names = list()
-        pdf_path = os.path.join(pdf_dev_path, 'pdf')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.pdf'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
-            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
-            image_dir = str(os.path.basename(local_image_dir))
-            name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
-            dir_path = os.path.join(pdf_dev_path, 'mineru')
-            pass
-
-    @pytest.mark.P0
-    def test_pdf_local_ppt(self):
-        """pdf sdk auto test."""
-        demo_names = list()
-        pdf_path = os.path.join(pdf_dev_path, 'ppt')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.pptx'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            pdf_path = os.path.join(pdf_dev_path, 'ppt', f'{demo_name}.pptx')
-            local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
-            image_dir = str(os.path.basename(local_image_dir))
-            name_without_suff = os.path.basename(pdf_path).split(".pptx")[0]
-            dir_path = os.path.join(pdf_dev_path, 'mineru')
-            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
-            ds = read_local_office(pdf_path)[0]
-            common.delete_file(dir_path)
-            
-            ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)          
-            common.sdk_count_folders_and_check_contents(dir_path)
-
-
-
-    @pytest.mark.P0
-    def test_pdf_local_image(self):
-        """pdf sdk auto test."""
-        demo_names = list()
-        pdf_path = os.path.join(pdf_dev_path, 'images')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.jpg'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            pdf_path = os.path.join(pdf_dev_path, 'images', f'{demo_name}.jpg')
-            local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
-            image_dir = str(os.path.basename(local_image_dir))
-            name_without_suff = os.path.basename(pdf_path).split(".jpg")[0]
-            dir_path = os.path.join(pdf_dev_path, 'mineru')
-            common.delete_file(dir_path)
-            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
-            ds = read_local_images(pdf_path)[0]
-            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-            md_writer, f"{name_without_suff}.md", image_dir)
-            common.sdk_count_folders_and_check_contents(dir_path)
-
-
-    @pytest.mark.P0
-    def test_local_image_dir(self):
-        """local image dir."""
-        demo_names = list()
-        pdf_path = os.path.join(pdf_dev_path, 'images')
-        dir_path = os.path.join(pdf_dev_path, 'mineru')
-        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
-        image_dir = str(os.path.basename(local_image_dir))
-        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
-        common.delete_file(dir_path)
-        dss = read_local_images(pdf_path, suffixes=['.png', '.jpg'])
-        count = 0
-        for ds in dss:
-            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{count}.md", image_dir)
-            count += 1
-        common.sdk_count_folders_and_check_contents(dir_path)
-
-    def test_local_doc_parse(self):
-        """
-        doc 解析
-        """
-        demo_names = list()
-        pdf_path = os.path.join(pdf_dev_path, 'doc')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.docx'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            pdf_path = os.path.join(pdf_dev_path, 'doc', f'{demo_name}.docx')
-            local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
-            image_dir = str(os.path.basename(local_image_dir))
-            name_without_suff = os.path.basename(pdf_path).split(".docx")[0]
-            dir_path = os.path.join(pdf_dev_path, 'mineru')
-            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
-            ds = read_local_office(pdf_path)[0]
-            common.delete_file(dir_path)
-            
-            ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)          
-            common.sdk_count_folders_and_check_contents(dir_path)
-
-
-    @pytest.mark.P0
-    def test_pdf_cli_auto(self):
-        """magic_pdf cli test auto."""
-        time.sleep(2)
-        demo_names = []
-        pdf_path = os.path.join(pdf_dev_path, 'pdf')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.pdf'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            res_path = os.path.join(pdf_dev_path, 'mineru')
-            common.delete_file(res_path)
-            cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
-                pdf_path, f'{demo_name}.pdf'), res_path, 'auto')
-            logging.info(cmd)
-            os.system(cmd)
-            common.cli_count_folders_and_check_contents(
-                os.path.join(res_path, demo_name, 'auto'))
-  
-    @pytest.mark.P0
-    def test_pdf_cli_txt(self):
-        """magic_pdf cli test txt."""
-        time.sleep(2)
-        demo_names = []
-        pdf_path = os.path.join(pdf_dev_path, 'pdf')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.pdf'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            res_path = os.path.join(pdf_dev_path, 'mineru')
-            common.delete_file(res_path)
-            cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
-                pdf_path, f'{demo_name}.pdf'), res_path, 'txt')
-            logging.info(cmd)
-            os.system(cmd)
-            common.cli_count_folders_and_check_contents(
-                os.path.join(res_path, demo_name, 'txt'))
-   
-    @pytest.mark.P0
-    def test_pdf_cli_ocr(self):
-        """magic_pdf cli test ocr."""
-        time.sleep(2)
-        demo_names = []
-        pdf_path = os.path.join(pdf_dev_path, 'pdf')
-        for pdf_file in os.listdir(pdf_path):
-            if pdf_file.endswith('.pdf'):
-                demo_names.append(pdf_file.split('.')[0])
-        for demo_name in demo_names:
-            res_path = os.path.join(pdf_dev_path, 'mineru')
-            common.delete_file(res_path)
-            cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
-                pdf_path, f'{demo_name}.pdf'), res_path, 'ocr')
-            logging.info(cmd)
-            os.system(cmd)
-            common.cli_count_folders_and_check_contents(
-                os.path.join(res_path, demo_name, 'ocr'))
-    
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_local_jsonl_txt(self):
-        """magic_pdf_dev cli local txt."""
-        time.sleep(2)
-        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
-        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
-        logging.info(cmd)
-        os.system(cmd)
-
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_local_jsonl_ocr(self):
-        """magic_pdf_dev cli local ocr."""
-        time.sleep(2)
-        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
-        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
-        logging.info(cmd)
-        os.system(cmd)
-
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_local_jsonl_auto(self):
-        """magic_pdf_dev cli local auto."""
-        time.sleep(2)
-        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
-        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
-        logging.info(cmd)
-        os.system(cmd)
-    
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_s3_jsonl_txt(self):
-        """magic_pdf_dev cli s3 txt."""
-        time.sleep(2)
-        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
-        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
-        logging.info(cmd)
-        os.system(cmd)
-
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_s3_jsonl_ocr(self):
-        """magic_pdf_dev cli s3 ocr."""
-        time.sleep(2)
-        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
-        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
-        logging.info(cmd)
-        os.system(cmd)
-
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_s3_jsonl_auto(self):
-        """magic_pdf_dev cli s3 auto."""
-        time.sleep(2)
-        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
-        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
-        logging.info(cmd)
-        os.system(cmd)
-
-    @pytest.mark.P1
-    def test_pdf_dev_cli_pdf_json_auto(self):
-        """magic_pdf_dev cli pdf+json auto."""
-        time.sleep(2)
-        json_path = os.path.join(pdf_dev_path, 'test_model.json')
-        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
-        cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
-        logging.info(cmd)
-        os.system(cmd)
-   
-    @pytest.mark.skip(reason='out-of-date api')
-    @pytest.mark.P1
-    def test_pdf_dev_cli_pdf_json_ocr(self):
-        """magic_pdf_dev cli pdf+json ocr."""
-        time.sleep(2)
-        json_path = os.path.join(pdf_dev_path, 'test_model.json')
-        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
-        cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
-        logging.info(cmd)
-        os.system(cmd)
-    
-    @pytest.mark.P1
-    def test_local_magic_pdf_open_rapidai_table(self):
-        """magic pdf cli open rapid ai table."""
-        time.sleep(2)
-        #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
-        #os.system(pre_cmd)
-        value = {
-        "model": "rapid_table",
-        "enable": True,
-        "sub_model": "slanet_plus",
-        "max_time": 400
-        }   
-        common.update_config_file(magic_pdf_config, "table-config", value)
-        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
-        common.delete_file(pdf_res_path)
-        cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
-        os.system(cli_cmd)
-        res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
-        assert res is True
-    
-    
-    @pytest.mark.P1
-    def test_local_magic_pdf_doclayout_yolo(self):
-        """magic pdf cli open doclyaout yolo."""
-        time.sleep(2)
-        #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
-        #os.system(pre_cmd)
-        value = {
-        "model": "doclayout_yolo"
-        }   
-        common.update_config_file(magic_pdf_config, "layout-config", value)
-        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
-        common.delete_file(pdf_res_path)
-        cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
-        os.system(cli_cmd)
-        common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
-
-    @pytest.mark.skip(reason="layoutlmv3废弃")
-    @pytest.mark.P1
-    def test_local_magic_pdf_layoutlmv3_yolo(self):
-        """magic pdf cli open layoutlmv3."""
-        time.sleep(2)
-        value = {
-        "model": "layoutlmv3"
-        }   
-        common.update_config_file(magic_pdf_config, "layout-config", value)
-        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
-        common.delete_file(pdf_res_path)
-        cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
-        os.system(cli_cmd)
-        common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
-        #res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
-
-    @pytest.mark.P1
-    def test_magic_pdf_cpu(self):
-        """magic pdf cli cpu mode."""
-        time.sleep(2)
-        #pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
-        #os.system(pre_cmd)
-        value = {
-        "model": "rapid_table",
-        "enable": True,
-        "sub_model": "slanet_plus",
-        "max_time": 400
-        }   
-        common.update_config_file(magic_pdf_config, "table-config", value)
-        common.update_config_file(magic_pdf_config, "device-mode", "cpu")
-        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
-        common.delete_file(pdf_res_path)
-        cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
-        os.system(cli_cmd)
-        common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
-
-
-    @pytest.mark.P1
-    def test_local_magic_pdf_close_html_table(self):
-        """magic pdf cli close table."""
-        time.sleep(2)
-        #pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
-        #os.system(pre_cmd)
-        value = {
-        "model": "rapid_table",
-        "enable": False,
-        "sub_model": "slanet_plus",
-        "max_time": 400
-        }   
-        common.update_config_file(magic_pdf_config, "table-config", value)
-        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
-        common.delete_file(pdf_res_path)
-        cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
-        os.system(cli_cmd)
-        res = common.check_close_tables(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
-        assert res is True
-    
-
- 
-if __name__ == '__main__':
-    pytest.main()
-
--- a/tests/unittest/pdfs/test.pdf
+++ b/tests/unittest/pdfs/test.pdf
--- a/tests/unittest/test_data/__init__.py
+++ b/tests/unittest/test_data/__init__.py
--- a/tests/unittest/test_data/assets/jsonl/test_01.jsonl
+++ b/tests/unittest/test_data/assets/jsonl/test_01.jsonl
-{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
--- a/tests/unittest/test_data/assets/jsonl/test_02.jsonl
+++ b/tests/unittest/test_data/assets/jsonl/test_02.jsonl
-{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/unittest/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
--- a/tests/unittest/test_data/assets/pdfs/test_01.pdf
+++ b/tests/unittest/test_data/assets/pdfs/test_01.pdf
--- a/tests/unittest/test_data/assets/pdfs/test_02.pdf
+++ b/tests/unittest/test_data/assets/pdfs/test_02.pdf
--- a/tests/unittest/test_data/assets/pngs/test_01.png
+++ b/tests/unittest/test_data/assets/pngs/test_01.png
--- a/tests/unittest/test_data/assets/pngs/test_02.png
+++ b/tests/unittest/test_data/assets/pngs/test_02.png
--- a/tests/unittest/test_data/data_reader_writer/__init__.py
+++ b/tests/unittest/test_data/data_reader_writer/__init__.py
--- a/tests/unittest/test_data/data_reader_writer/test_filebase.py
+++ b/tests/unittest/test_data/data_reader_writer/test_filebase.py
-import os
-import shutil
-
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
-
-
-def test_filebased_reader_writer():
-
-    unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
-    sub_dir = os.path.join(unitest_dir, 'sub')
-    abs_fn = os.path.join(unitest_dir, 'abspath.txt')
-
-    os.makedirs(sub_dir, exist_ok=True)
-
-    writer = FileBasedDataWriter(sub_dir)
-    reader = FileBasedDataReader(sub_dir)
-
-    writer.write('test.txt', b'hello world')
-    assert reader.read('test.txt') == b'hello world'
-
-    writer.write(abs_fn, b'hello world')
-    assert reader.read(abs_fn) == b'hello world'
-    shutil.rmtree(unitest_dir)
--- a/tests/unittest/test_data/data_reader_writer/test_multi_bucket_s3.py
+++ b/tests/unittest/test_data/data_reader_writer/test_multi_bucket_s3.py
-import json
-import os
-
-import fitz
-import pytest
-
-from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
-                                               MultiBucketS3DataWriter)
-from magic_pdf.data.schemas import S3Config
-
-
-@pytest.mark.skipif(
-    os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
-)
-def test_multi_bucket_s3_reader_writer():
-    """test multi bucket s3 reader writer must config s3 config in the
-    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
-    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
-
-    export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
-    """
-    bucket = os.getenv('S3_BUCKET', '')
-    ak = os.getenv('S3_ACCESS_KEY', '')
-    sk = os.getenv('S3_SECRET_KEY', '')
-    endpoint_url = os.getenv('S3_ENDPOINT', '')
-
-    bucket_2 = os.getenv('S3_BUCKET_2', '')
-    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
-    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
-    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
-
-    s3configs = [
-        S3Config(
-            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        ),
-    ]
-
-    reader = MultiBucketS3DataReader(bucket, s3configs)
-    writer = MultiBucketS3DataWriter(bucket, s3configs)
-
-    bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
-
-    assert bits == reader.read(
-        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
-    )
-
-    bits = reader.read(
-        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
-    )
-    docs = fitz.open('pdf', bits)
-    assert len(docs) == 10
-
-    bits = reader.read(
-        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
-    )
-    assert bits == reader.read_at(
-        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
-    )
-    assert len(json.loads(bits)) > 0
-
-    writer.write_string(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
-    )
-
-    assert 'abc'.encode() == reader.read(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
-    )
-
-    writer.write(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
-        '123'.encode(),
-    )
-
-    assert '123'.encode() == reader.read(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
-    )
-
-
-@pytest.mark.skipif(
-    os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
-)
-def test_multi_bucket_s3_reader_writer_with_prefix():
-    """test multi bucket s3 reader writer must config s3 config in the
-    environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
-    S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
-
-    export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
-    """
-    bucket = os.getenv('S3_BUCKET', '')
-    ak = os.getenv('S3_ACCESS_KEY', '')
-    sk = os.getenv('S3_SECRET_KEY', '')
-    endpoint_url = os.getenv('S3_ENDPOINT', '')
-
-    bucket_2 = os.getenv('S3_BUCKET_2', '')
-    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
-    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
-    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
-
-    s3configs = [
-        S3Config(
-            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        ),
-    ]
-
-    prefix = 'meta-index'
-    reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
-    writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)
-
-    bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
-
-    assert bits == reader.read(
-        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
-    )
-
-    bits = reader.read(
-        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
-    )
-    docs = fitz.open('pdf', bits)
-    assert len(docs) == 10
-
-    bits = reader.read(
-        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
-    )
-    assert bits == reader.read_at(
-        'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
-    )
-    assert len(json.loads(bits)) > 0
-
-    writer.write_string(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
-    )
-
-    assert 'abc'.encode() == reader.read(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
-    )
-
-    assert 'abc'.encode() == reader.read(
-        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
-    )
-
-    writer.write(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
-        '123'.encode(),
-    )
-
-    assert '123'.encode() == reader.read(
-        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
-    )
--- a/tests/unittest/test_data/data_reader_writer/test_s3.py
+++ b/tests/unittest/test_data/data_reader_writer/test_s3.py
-import json
-import os
-
-import pytest
-
-from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-
-
-@pytest.mark.skipif(
-    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
-)
-def test_s3_reader_writer():
-    """Round-trip data through S3DataReader / S3DataWriter built with an empty prefix.
-
-    The test is skipped unless S3_ACCESS_KEY is set. It reads S3_BUCKET,
-    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT from the environment.
-    """
-    bucket = os.getenv('S3_BUCKET', '')
-    access_key = os.getenv('S3_ACCESS_KEY', '')
-    secret_key = os.getenv('S3_SECRET_KEY', '')
-    endpoint = os.getenv('S3_ENDPOINT', '')
-
-    reader = S3DataReader('', bucket, access_key, secret_key, endpoint)
-    writer = S3DataWriter('', bucket, access_key, secret_key, endpoint)
-
-    # Object keys that are addressed more than once below.
-    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
-    txt01_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
-    txt02_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
-
-    # Reading by bare key and by full s3:// URI must give equal results.
-    by_key = reader.read(jsonl_key)
-    by_uri = reader.read(
-        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
-    )
-    assert by_key == by_uri
-
-    # A "?bytes=566,713" suffix must give the same result as read_at(key, 566, 713).
-    ranged = reader.read(
-        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
-    )
-    assert ranged == reader.read_at(jsonl_key, 566, 713)
-    assert len(json.loads(ranged)) > 0
-
-    # Text written with write_string is read back as encoded bytes.
-    writer.write_string(txt01_key, 'abc')
-    assert 'abc'.encode() == reader.read(txt01_key)
-
-    # The second object is written through a bucket-qualified path and read
-    # back through the bare key.
-    writer.write(
-        f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
-        '123'.encode(),
-    )
-    assert '123'.encode() == reader.read(txt02_key)
-
-
-@pytest.mark.skipif(
-    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
-)
-def test_s3_reader_writer_with_prefix():
-    """Round-trip data through S3DataReader / S3DataWriter built with the 'meta-index' prefix.
-
-    The test is skipped unless S3_ACCESS_KEY is set. It reads S3_BUCKET,
-    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT from the environment.
-    """
-    prefix = 'meta-index'
-    bucket = os.getenv('S3_BUCKET', '')
-    access_key = os.getenv('S3_ACCESS_KEY', '')
-    secret_key = os.getenv('S3_SECRET_KEY', '')
-    endpoint = os.getenv('S3_ENDPOINT', '')
-
-    reader = S3DataReader(prefix, bucket, access_key, secret_key, endpoint)
-    writer = S3DataWriter(prefix, bucket, access_key, secret_key, endpoint)
-
-    # Keys below are written relative to the configured prefix.
-    jsonl_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'
-    txt01_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
-    txt02_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
-
-    # Prefix-relative key and fully qualified URI must give equal results.
-    by_key = reader.read(jsonl_key)
-    by_uri = reader.read(
-        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
-    )
-    assert by_key == by_uri
-
-    # A "?bytes=566,713" suffix must give the same result as read_at(key, 566, 713).
-    ranged = reader.read(
-        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
-    )
-    assert ranged == reader.read_at(jsonl_key, 566, 713)
-    assert len(json.loads(ranged)) > 0
-
-    # Text written with write_string is visible through both addressing forms.
-    writer.write_string(txt01_key, 'abc')
-    assert 'abc'.encode() == reader.read(txt01_key)
-    assert 'abc'.encode() == reader.read(
-        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
-    )
-
-    # The second object is written through a bucket- and prefix-qualified
-    # path and read back through the prefix-relative key.
-    writer.write(
-        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
-        '123'.encode(),
-    )
-    assert '123'.encode() == reader.read(txt02_key)
--- a/tests/unittest/test_data/io/__init__.py
+++ b/tests/unittest/test_data/io/__init__.py
--- a/tests/unittest/test_data/io/test_s3.py
+++ b/tests/unittest/test_data/io/test_s3.py
-import json
-import os
-
-import pytest
-
-from magic_pdf.data.io.s3 import S3Reader, S3Writer
-
-
-@pytest.mark.skipif(
-    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
-)
-def test_s3_reader():
-    """Read a whole object and a byte range through S3Reader.
-
-    The test is skipped unless S3_ACCESS_KEY is set. It reads S3_BUCKET,
-    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT from the environment.
-    """
-    s3_kwargs = {
-        'bucket': os.getenv('S3_BUCKET', ''),
-        'ak': os.getenv('S3_ACCESS_KEY', ''),
-        'sk': os.getenv('S3_SECRET_KEY', ''),
-        'endpoint_url': os.getenv('S3_ENDPOINT', ''),
-    }
-    reader = S3Reader(**s3_kwargs)
-    object_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
-
-    # A full read must return something non-empty.
-    whole = reader.read(object_key)
-    assert len(whole) > 0
-
-    # The range requested with (566, 713) must parse as non-empty JSON.
-    fragment = reader.read_at(object_key, 566, 713)
-    assert len(json.loads(fragment)) > 0
-
-
-@pytest.mark.skipif(
-    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
-)
-def test_s3_writer():
-    """Write an object with S3Writer and read it back with S3Reader.
-
-    The test is skipped unless S3_ACCESS_KEY is set. It reads S3_BUCKET,
-    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT from the environment.
-    """
-    s3_kwargs = {
-        'bucket': os.getenv('S3_BUCKET', ''),
-        'ak': os.getenv('S3_ACCESS_KEY', ''),
-        'sk': os.getenv('S3_SECRET_KEY', ''),
-        'endpoint_url': os.getenv('S3_ENDPOINT', ''),
-    }
-    writer = S3Writer(**s3_kwargs)
-    object_key = 'unittest/io/test.jsonl'
-    writer.write(object_key, '123'.encode())
-
-    # A separate reader instance must see what the writer stored.
-    stored = S3Reader(**s3_kwargs).read(object_key)
-    assert stored.decode() == '123'
--- a/tests/unittest/test_data/test_dataset.py
+++ b/tests/unittest/test_data/test_dataset.py
-
-from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
-
-
-def test_pymudataset():
-    """Load a sample PDF into PymuDocDataset and sanity-check its first page."""
-    pdf_path = 'tests/unittest/test_data/assets/pdfs/test_01.pdf'
-    with open(pdf_path, 'rb') as fh:
-        pdf_bytes = fh.read()
-
-    dataset = PymuDocDataset(pdf_bytes)
-    assert len(dataset) > 0
-
-    # Only a loose lower bound on the page-info `h` attribute is checked.
-    first_page_info = dataset.get_page(0).get_page_info()
-    assert first_page_info.h > 100
-
-
-def test_imagedataset():
-    """Load a sample PNG into ImageDataset and sanity-check its single page."""
-    png_path = 'tests/unittest/test_data/assets/pngs/test_01.png'
-    with open(png_path, 'rb') as fh:
-        png_bytes = fh.read()
-
-    dataset = ImageDataset(png_bytes)
-    # The sample image is expected to produce a dataset of length one.
-    assert len(dataset) == 1
-
-    # Only a loose lower bound on the page-info `w` attribute is checked.
-    first_page_info = dataset.get_page(0).get_page_info()
-    assert first_page_info.w > 100
--- a/tests/unittest/test_data/test_json_compressor.py
+++ b/tests/unittest/test_data/test_json_compressor.py
-import pytest
-import json
-from magic_pdf.libs.json_compressor import JsonCompressor
-
-# Fixtures supplying sample JSON payloads
-@pytest.fixture
-def test_cases():
-    """Return a list of JSON-compatible payloads covering common shapes."""
-    flat_dict = {"name": "John", "age": 30}
-
-    nested_dict = {
-        "person": {
-            "name": "Alice",
-            "address": {"street": "123 Main St", "city": "New York"},
-        }
-    }
-
-    list_of_dicts = [
-        {"id": 1, "value": "first"},
-        {"id": 2, "value": "second"},
-    ]
-
-    # One value of each JSON-representable Python type.
-    mixed_types = {
-        "string": "hello",
-        "integer": 42,
-        "float": 3.14,
-        "boolean": True,
-        "null": None,
-        "array": [1, 2, 3],
-        "nested": {"key": "value"},
-    }
-
-    # Empty containers, alone and nested inside a dict.
-    empty_dict = {}
-    empty_list = []
-    dict_of_empties = {"empty_list": [], "empty_dict": {}}
-
-    return [
-        flat_dict,
-        nested_dict,
-        list_of_dicts,
-        mixed_types,
-        empty_dict,
-        empty_list,
-        dict_of_empties,
-    ]
-
-@pytest.fixture
-def large_data():
-    """Return a highly repetitive payload: 100 copies of a 400-character string."""
-    repeated_chunk = "test" * 100
-    return {"data": [repeated_chunk] * 100}
-
-def test_compression_decompression_cycle(test_cases):
-    """Each sample payload must survive a compress/decompress round trip unchanged."""
-    for original in test_cases:
-        packed = JsonCompressor.compress_json(original)
-
-        # The compressed form must be a non-empty str.
-        assert isinstance(packed, str)
-        assert len(packed) > 0
-
-        restored = JsonCompressor.decompress_json(packed)
-        assert original == restored
-
-def test_compression_reduces_size(large_data):
-    """For a large repetitive payload the compressed text must be shorter than the plain JSON."""
-    plain_length = len(json.dumps(large_data))
-    packed_length = len(JsonCompressor.compress_json(large_data))
-
-    # Lengths are compared in characters of the respective strings.
-    assert packed_length < plain_length
-
-def test_invalid_json_serializable():
-    """compress_json must raise TypeError for input that is not JSON serializable."""
-    with pytest.raises(TypeError):
-        # A set has no JSON representation; a set literal avoids building a
-        # throwaway list just to construct it.
-        JsonCompressor.compress_json({1, 2, 3})
-
-def test_invalid_compressed_string():
-    """decompress_json must raise when given text that is not valid compressed output."""
-    bogus_payload = "invalid_base64_string"
-    # NOTE(review): the concrete exception type is not visible from this test,
-    # so any Exception subclass is accepted.
-    with pytest.raises(Exception):
-        JsonCompressor.decompress_json(bogus_payload)
-
-def test_empty_string_input():
-    """decompress_json must raise when given an empty string."""
-    empty_payload = ""
-    # NOTE(review): the concrete exception type is not visible from this test,
-    # so any Exception subclass is accepted.
-    with pytest.raises(Exception):
-        JsonCompressor.decompress_json(empty_payload)
-
-def test_special_characters():
-    """Punctuation and non-ASCII text must survive a compress/decompress round trip."""
-    payload = {
-        "special": "!@#$%^&*()_+-=[]{}|;:,.<>?",
-        "unicode": "Hello 世界 🌍",
-    }
-
-    packed = JsonCompressor.compress_json(payload)
-    restored = JsonCompressor.decompress_json(packed)
-    assert payload == restored
-
-# One test invocation per sample payload
-@pytest.mark.parametrize(
-    "test_input",
-    [
-        {"simple": "value"},
-        [1, 2, 3],
-        {"nested": {"key": "value"}},
-        ["mixed", 1, True, None],
-        {"unicode": "🌍"},
-    ],
-)
-def test_various_input_types(test_input):
-    """Each parametrized payload must equal itself after compress then decompress."""
-    packed = JsonCompressor.compress_json(test_input)
-    restored = JsonCompressor.decompress_json(packed)
-    assert test_input == restored
\ No newline at end of file