add ci

80e7a50e · quyuan · 2e79da59 · 80e7a50e · 2e79da59 · 2e79da59
Commit 80e7a50e authored Jul 13, 2024 by quyuan
20 changed files
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -35,6 +35,5 @@ jobs:
    - name: get-benchmark-result
      run: |
        echo "start test"
-        cd $GITHUB_WORKSPACE/tests/benchmark/ 
+        cd $GITHUB_WORKSPACE &&  pytest -s -v tests/test_cli/test_ben.py
-        tree
--- a/tests/benchmark/benchmark.py
+++ b/tests/benchmark/benchmark.py
-"""
-bench
-"""
-import os
-import shutil
-import json
-import calculate_score
-code_path = os.environ.get('GITHUB_WORKSPACE')
-#评测集存放路径
-pdf_dev_path = "datasets/"
-#magicpdf跑测结果
-pdf_res_path = "/tmp/magic-pdf"
-def test_cli():
-    """
-    test pdf-command cli
-    """
-    rm_cmd = f"rm -rf {pdf_res_path}"
-    os.system(rm_cmd)
-    os.makedirs(pdf_res_path)
-    cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
-    os.system(cmd)
-    for root, dirs, files in os.walk(pdf_res_path):
-         for magic_file in files:
-            target_dir = os.path.join(pdf_dev_path, "mineru")
-            if magic_file.endswith(".md"):
-                source_file = os.path.join(root, magic_file)
-                target_file = os.path.join(pdf_dev_path, "mineru", magic_file)
-                if not os.path.exists(target_dir):
-                    os.makedirs(target_dir) 
-                shutil.copy(source_file, target_file)
-def get_score():
-    """
-    get score
-    """
-    data_path = os.path.join(pdf_dev_path, "ci")
-    score = calculate_score.Scoring(os.path.join(data_path, "result.json"))
-    score.calculate_similarity_total("mineru", data_path)
-    res = score.summary_scores()
-    return res
-def ci_ben():
-    """
-    ci benchmark
-    """
-    try:
-        fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
-        lines = fr.readlines()
-        last_line = lines[-1].strip()
-        last_score = json.loads(last_line)
-        print ("last_score:", last_score)
-        last_simscore = last_score["average_sim_score"]
-        last_editdistance = last_score["average_edit_distance"]
-        last_bleu = last_score["average_bleu_score"]
-    except IOError:
-        print ("result.json not exist")
-    test_cli()
-    os.system(f"python pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
-    now_score = get_score()
-    print ("now_score:", now_score)
-    now_simscore = now_score["average_sim_score"]
-    now_editdistance = now_score["average_edit_distance"]
-    now_bleu = now_score["average_bleu_score"]
-    assert last_simscore <= now_simscore
-    assert last_editdistance <= now_editdistance
-    assert last_bleu <= now_bleu
-if __name__ == "__main__":
-    os.system("sh env.sh")
-    ci_ben()
--- a/tests/benchmark/calculate_score.py
+++ b/tests/benchmark/calculate_score.py
-"""
-calculate_score
-"""
-import os
-import re
-import json
-import scoring
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-from nltk.tokenize import word_tokenize
-from Levenshtein import distance
-class Scoring:
-    """
-    calculate_score 
-    """
-    def __init__(self, result_path):
-        """
-        init
-        """
-        self.edit_distances = []
-        self.bleu_scores = []
-        self.sim_scores = []
-        self.filenames = []
-        self.score_dict = {}
-        self.anntion_cnt = 0
-        self.fw = open(result_path, "w+", encoding='utf-8')
-    def simple_bleu_score(self, candidate, reference):
-        """
-        get bleu score
-        """
-        candidate_tokens = word_tokenize(candidate)
-        reference_tokens = word_tokenize(reference)
-        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
-    def preprocess_string(self, s):
-        """
-        preprocess_string
-        """
-        sub_enter = re.sub(r'\n+', '\n', s)
-        return re.sub(r'  ', ' ', sub_enter)
-    def calculate_similarity(self, annotion, actual, tool_type):
-        """
-        calculate_similarity
-        """
-        class_dict = {}
-        edit_distances = []
-        bleu_scores = []
-        sim_scores = list()
-        total_file = 0
-        for filename in os.listdir(annotion):
-            if filename.endswith('.md') and not filename.startswith('.'):
-                total_file = total_file + 1
-                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
-                    content_a = file_a.read()
-                self.anntion_cnt = self.anntion_cnt + 1
-                filepath_b = os.path.join(actual, filename)
-                if os.path.exists(filepath_b):
-                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
-                        content_b = file_b.read()
-                        self.filenames.append(filename)
-                        edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
-                        self.edit_distances.append(edit_dist)
-                        edit_distances.append(edit_dist)
-                        bleu_score = self.simple_bleu_score(content_b, content_a)
-                        bleu_scores.append(bleu_score)
-                        self.bleu_scores.append(bleu_score)
-                        score = scoring.score_text(content_b, content_a)
-                        sim_scores.append(score)
-                        self.sim_scores.append(score)
-                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                else:  
-                    print(f"File {filename} not found in actual directory.")
-        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
-        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
-        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
-        ratio = len(class_dict)/total_file
-        self.fw.write(f"{tool_type} extract ratio:  {ratio}" + "\n")
-        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
-        print (f"{tool_type} extract ratio: {ratio}")
-        print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
-        print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
-        print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
-        return self.score_dict
-    def summary_scores(self):
-        """
-        calculate the average of edit distance, bleu score and sim score
-        """
-        over_all_dict = dict()
-        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0  
-        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0  
-        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
-        over_all_dict["average_edit_distance"] = average_edit_distance
-        over_all_dict["average_bleu_score"] = average_bleu_score
-        over_all_dict["average_sim_score"] = average_sim_score
-        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
-        return over_all_dict
-    def calculate_similarity_total(self, tool_type, download_dir):
-        """
-        calculate the average of edit distance, bleu score and sim score
-        """
-        annotion = os.path.join(download_dir, "annotations", "cleaned")
-        actual = os.path.join(download_dir, tool_type, "cleaned")
-        score = self.calculate_similarity(annotion, actual, tool_type)
-        return score
--- a/tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_0b2c9c91f5232541a7ace8984df306b2.md
+++ b/tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_0b2c9c91f5232541a7ace8984df306b2.md
--- a/tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_f7904bc37cc2e25c1e3e412978854b10.md
+++ b/tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_f7904bc37cc2e25c1e3e412978854b10.md
--- a/tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_fbdb99151e811688574c0c4c67341074.md
+++ b/tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_fbdb99151e811688574c0c4c67341074.md
--- a/tests/benchmark/datasets/annotations/cleaned/cleaned_ordinary_textbook_1d9a847603a5e37e379738316820850d.md
+++ b/tests/benchmark/datasets/annotations/cleaned/cleaned_ordinary_textbook_1d9a847603a5e37e379738316820850d.md
-# 数学新星问题征解 
-第十五期 (2016.06)
-主持: 牟晓生
-第一题. 设 $z_{1}, z_{2}, z_{3}$ 是单位复数. 证明存在单位复数 $z$ 使得:
-$$
-\frac{1}{\left|z-z_{1}\right|^{2}}+\frac{1}{\left|z-z_{2}\right|^{2}}+\frac{1}{\left|z-z_{3}\right|^{2}} \leq \frac{9}{4}
-$$
-(湖北武钢三中学生 王逸轩, 上海大学冷岗松 供题)
-第二题. 如图, $D$ 是正三角形 $A B C$ 的边 $B C$ 上一点, $B D>C D$. 记 $O_{1}, I_{1}$ 为 $\triangle A B D$ 的外心与内心, $O_{2}, I_{2}$ 为 $\triangle A C D$ 的外心与内心. 圆 $I_{1}$ 与圆 $I_{2}$ 除 $B C$外的另一条外公切线交 $A B, A C$ 于 $P, Q$. 设直线 $P I_{1}$与 $Q I_{2}$ 交于 $R$, 而直线 $O_{1} I_{1}$ 与 $O_{2} I_{2}$ 交于 $T$. 证明: $A T^{2}=A R^{2}+A D \cdot B C$.
-(广西钦州 卢圣 供题)
-第三题. 给定正整数 $m, n$, 考虑在 $m \times n$ 白棋盘上先将一些格染成黑色. 在之后的每一时刻, 若存在一个白格至少与两个黑格相邻, 则可将它也染成黑色. 求最初至少要染多少个黑色格才能在某一时刻染黑整个棋盘?
-(哈佛大学 牟晓生 供题)
-第四题. $A B C$ 是一个三角形, 而 $P, Q, R$ 分别是 $B C, C A, A B$ 上的点。证明 $\triangle P Q R$ 的周长不小于 $\triangle A Q R, \triangle B R P, \triangle C P Q$ 周长的最小值.
-(哈佛大学 牟晓生 供题)
--- a/tests/benchmark/datasets/annotations/cleaned/cleaned_research_report_1f978cd81fb7260c8f7644039ec2c054.md
+++ b/tests/benchmark/datasets/annotations/cleaned/cleaned_research_report_1f978cd81fb7260c8f7644039ec2c054.md
-## 增持（维持）
-所属行业：机械设备
-当前价格(元): 82.42
-## 证券分析师
-倪正洋
-资格编号：S0120521020003
-邮箱: nizy@tebon.com.cn
-## 研究助理
-杨云道
-邮箱: yangyx@tebon.com.cn
-| 沪深 300 对比 | $1 \mathrm{M}$ | $2 \mathrm{M}$ | $3 \mathrm{M}$ |
-| :--- | ---: | ---: | ---: |
-| 绝对涨幅(\%) | 7.18 | 32.88 | 80.86 |
-| 相对涨幅(\%) | 8.10 | 25.93 | 78.39 |
-资料来源: 德邦研究所, 聚源数据
-## 相关研究
-1.《高测股份 (688556): 光伏金刚线及硅片切割代工业务推动公司 22Q1 业绩大超预期》, 2022.4.29
-2.《光伏设备: 光伏高效电池扩产提速,关键设备商各领风骚》, 2022.4.10 3. 《高测股份 (688556.SH): 再签建湖 10GW 硅片切割代工产能，强化代工业务成长逻辑》, 2022.4.7
-3.《高测股份 (688556.SH): 签订晶澳曲靖 2.2 亿元切割设备合同，看好 22 年代工业绩释放+HJT 切割工艺进步》, 2022.3.9
-4.《高测股份 (688556.SH): 21 年业绩预告超市场预期，关注切片代工利润释放》, 2022.1.24
-# 高测股份 $(688556.5 H):$ 扩产 4000 万公里金刚线，强化光伏碰片切割三元布局
-## 投资要点
- 事件：公司拟与蓝关县人民政府签署的《壶关年产 12000 万千米金刚线项目投资协议书》，项目一期计划建设年产 4,000万千米金刚线产能，预计一期总投资额约 6.66 亿元; 后续年产 8,000 万千米金刚线项目尚未具体约定，存在较大不确定性。
- 顺应下游需求扩张, 金刚线产能快速扩产, 保证公司内供+外销。光伏金刚线需求 22 年提升源于两方面：1）2022 年光伏产业链景气度高涨，1-5 月光伏装机同比 $+24.4 \%$, 带动产业链各环节开工率提升, 硅片前期扩产产能逐步落地, 金刚线需求释放；2）由于多晶硅料价格持续维持高位，细线化、薄片化趋势加速，其中细线化要求金刚线线径由 40 线、 38 线向 36 线、 35 线进步, 带动单 GW 切割线耗不断提升。目前 36 线单 GW 切割线耗约 50 万公里, 较 38 线提升约 $30 \%$ 。公司于 2021 年对金刚线进行 “ 1 机 12 线” 技改，技改完成后，公司 22 年 1 季度产能 712 万公里, 年化产能超 2500 万公里。公司目前切片代工产能约 47GW, 对应远期金刚线产能超 2300 万公里。本次扩产再一次扩充公司金刚线产能, 强化金刚线产能内供+外销布局。
- 依托萦关低成本电价提升金刚线盈利能力, 顺应硅料节约持续推动细线化布局。公司在山西长治金刚线生产厂区采购电力的平均单价较青岛金刚线生产厂区采购电力的平均单价低, 2020 年度公司陆续将青岛的金刚线生产线搬迁到山西长治並关厂区，随着山西长治金刚线生产厂区金刚线产量增加，公司采购电力的平均单价呈下降趋势。目前公司电力采购单价从 2019 年 0.8 元/kwh 降低到 2022 年 Q1 的 0.39 元/kwh，並关后续拓展有望进一步降低公司金刚线电价成本。金刚线线径越细，锯㖓越小，切割时产生的锯㖓硅料损失越少，同样一根硅棒可切割加工出的硅片数量越多，制造硅片所需的硅材料越少。相同切割工艺下，金刚线越细，固结在钢线基体上的金刚石微粉颗粒越小，切割加工时对硅片的表面损伤越小，硅片表面质量越好，砝片 TTV 等质量指标表现也就越好。金刚线母线直径已由 2016 年的 80um 降至 2022 年上半年的 36、38、40um，此外高线速、柔性化和智能化等均是金刚线及切片技术进步方向, 公司在薄片、细线化、高线速、柔性智能化方面均有领先布局, 推动切割工艺持续进步。
- 切割工艺的持续进步领先, 是保障公司利润释放的核心壁垒。公司光伏硅片切割三元布局包括硅片切割及机加工设备、砝片切割耗材 (金刚线) 以及切割代工业务。公司 2021 年依托前期设备+耗材布局切割代工业务, 目前已公布 47GW 产能 (乐山5GW 示范基地、乐山 20GW 大硅片及配套项目、建湖一期 10GW 项目,建湖二期 $12 \mathrm{GW}$ 项目), 客户包括通威、京运通、美科及建湖周边电池企业。22 年底公司有望实现超 20GW 切割代工产能, 且当前终端客户主要为下游电池企业。客户选择切割代工模式的核心在于凭借高测的专业化服务实现快速上产, 同时可获得较自建硅片切割产能或购买硅片更多的超额利润。超额利润的核心在于高测股份的切割代工技术领先, 可实现更多的硅片切割红利, 并与客户共享。未来随着金刚线扩产和切割技术进步, 公司光伏硅片切割代工利润弹性有望持续释放。
- 盈利预测与投资建议：预计公司 2022-2024 年归母净利润 4.7、7.2、9.3 亿元,对应 PE 30、20、15 倍，维持 “增持” 评级。
- 风险提示：硅片扩产不及预期，公司代工业务利润波动风险，市场竞争加剧。
-<table><thead><tr><th>股票数据</th><th></th></tr></thead><tr><td>总股本(百万股):</td><td>227.92</td></tr><tr><td>流通 A 股(百万股):</td><td>167.01</td></tr><tr><td>52 周内股价区间(元):</td><td>21.60-97.40</td></tr><tr><td>总市值(百万元):</td><td>18,785.44</td></tr><tr><td>总资产(百万元):</td><td>3,508.81</td></tr><tr><td>每股净资产(元):</td><td>5.50</td></tr><tr><td>咨料来源，公司公告</td><td></td></tr></table>
-<table><thead><tr><th>主要财务数据及预测</th><th></th><th></th><th></th><th></th><th></th></tr></thead><tr><td></td><td>2020</td><td>2021</td><td>2022E</td><td>2023E</td><td>2024E</td></tr><tr><td>营业收入(百万元)</td><td>746</td><td>1,567</td><td>3,684</td><td>5,056</td><td>5,752</td></tr><tr><td>(+/-)YOY(%)</td><td>4.5\%</td><td>110.0\%</td><td>135.1\%</td><td>37.2\%</td><td>13.8\%</td></tr><tr><td>净利润(百万元)</td><td>59</td><td>173</td><td>471</td><td>717</td><td>933</td></tr><tr><td>(+/-)YOY(%)</td><td>83.8\%</td><td>193.4\%</td><td>172.8\%</td><td>52.2\%</td><td>30.1\%</td></tr><tr><td>全面摊薄 EPS(元)</td><td>0.43</td><td>1.07</td><td>2.91</td><td>4.43</td><td>5.77</td></tr><tr><td>毛利率(\%)</td><td>35.3\%</td><td>33.7\%</td><td>35.0\%</td><td>36.0\%</td><td>38.0\%</td></tr><tr><td>净资产收益率(\%)</td><td>6.0\%</td><td>15.0\%</td><td>27.9\%</td><td>28.8\%</td><td>26.5\%</td></tr></table>
-资料来源: 公司年报 (2020-2021)，德邦研究所
-备注: 净利润为归属母公司所有者的净利润
-## 财务报表分析和预测
-| 主要财务指标 | 2021 | $2022 E$ | $2023 E$ | $2024 E$ |
-| :--- | ---: | ---: | ---: | ---: |
-| 每股指标(元) |  |  |  |  |
-| 每股收益 | 1.07 | 2.91 | 4.43 | 5.77 |
-| 每股净资产 | 7.13 | 10.43 | 15.39 | 21.76 |
-| 每股经营现金流 | 0.47 | 1.27 | 4.07 | 5.02 |
-| 每股股利 | 0.11 | 0.11 | 0.11 | 0.11 |
-| 价值评估(倍) |  |  |  |  |
-| P/E | 82.90 | 30.47 | 20.02 | 15.38 |
-| P/B | 12.44 | 8.50 | 5.76 | 4.08 |
-| P/S | 8.52 | 3.62 | 2.64 | 2.32 |
-| EV/EBITDA | 49.85 | 24.12 | 15.68 | 11.46 |
-| 股息率\% | $0.1 \%$ | $0.1 \%$ | $0.1 \%$ | $0.1 \%$ |
-| 盈利能力指标(\%) |  |  |  |  |
-| 毛利率 | $33.7 \%$ | $35.0 \%$ | $36.0 \%$ | $38.0 \%$ |
-| 净利润率 | $11.0 \%$ | $12.8 \%$ | $14.2 \%$ | $16.2 \%$ |
-| 净资产收益率 | $15.0 \%$ | $27.9 \%$ | $28.8 \%$ | $26.5 \%$ |
-| 资产回报率 | $5.3 \%$ | $7.9 \%$ | $8.5 \%$ | $9.2 \%$ |
-| 投资回报率 | $15.3 \%$ | $25.9 \%$ | $24.6 \%$ | $23.7 \%$ |
-| 盈利增长(\%) |  |  |  |  |
-| 营业收入增长率 | $110.0 \%$ | $135.1 \%$ | $37.2 \%$ | $13.8 \%$ |
-| EBIT 增长率 | $233.7 \%$ | $150.7 \%$ | $52.3 \%$ | $31.9 \%$ |
-| 净利润增长率 | $193.4 \%$ | $172.8 \%$ | $52.2 \%$ | $30.1 \%$ |
-| 偿倩能力指标 |  |  |  |  |
-| 资产负债率 | $64.3 \%$ | $71.5 \%$ | $70.6 \%$ | $65.3 \%$ |
-| 流动比率 | 1.2 | 1.2 | 1.3 | 1.4 |
-| 速动比率 | 0.9 | 0.9 | 1.0 | 1.1 |
-| 现金比率 | 0.2 | 0.1 | 0.2 | 0.3 |
-| 经营效率指标 |  |  |  |  |
-| 应收怅款周转天数 | 161.7 | 165.1 | 164.9 | 164.4 |
-| 存货周转天数 | 196.1 | 170.0 | 180.0 | 190.0 |
-| 总资产周转率 | 0.5 | 0.6 | 0.6 | 0.6 |
-| 固定资产周转率 | 4.2 | 8.6 | 10.3 | 11.1 |
-| 现金流量表(百万元) | 2021 | $2022 E$ | 2023E | 2024E |
-| :--- | ---: | ---: | ---: | ---: |
-| 净利润 | 173 | 471 | 717 | 933 |
-| 少数股东损益 | 0 | 0 | 0 | 0 |
-| 非现金支出 | 107 | 114 | 133 | 147 |
-| 非经营收益 | 17 | 1 | 4 | 14 |
-| 营运资金变动 | -220 | -382 | -195 | -283 |
-| 经营活动现金流 | 76 | 205 | 658 | 812 |
-| 资产 | -83 | -184 | -203 | -169 |
-| 投资 | 229 | 0 | 0 | 0 |
-| 其他 | 6 | 9 | 13 | 14 |
-| 投资活动现金流 | 151 | -175 | -190 | -155 |
-| 债权募资 | -80 | 39 | 321 | 64 |
-| 股权募资 | 0 | 0 | 0 | 0 |
-| 其他活 | -21 | -3 | -14 | -25 |
-| 融资活动现金流 | -101 | 36 | 307 | 39 |
-| 现金净流量 | 127 | 66 | 775 | 696 |
-备注: 表中计算估值指标的收盘价日期为 7 月 19 日
-资料来源: 公司年报 (2020-2021), 德邦研究所
-| 利润表(百万元) | 2021 | 2022E | 2023E | 2024E |
-| :---: | :---: | :---: | :---: | :---: |
-| 营业总收入 | 1,567 | 3,684 | 5,056 | 5,752 |
-| 营业成本 | 1,038 | 2,394 | 3,236 | 3,567 |
-| 毛利率\% | $33.7 \%$ | $35.0 \%$ | $36.0 \%$ | $38.0 \%$ |
-| 营业税金及附加 | 6 | 18 | 25 | 29 |
-| 营业税金率\% | $0.4 \%$ | $0.5 \%$ | $0.5 \%$ | $0.5 \%$ |
-| 营业费用 | 63 | 147 | 193 | 209 |
-| 营业费用率\% | $4.0 \%$ | $4.0 \%$ | $3.8 \%$ | $3.6 \%$ |
-| 管理费用 | 131 | 313 | 409 | 444 |
-| 管理费用率\% | $8.4 \%$ | $8.5 \%$ | $8.1 \%$ | $7.7 \%$ |
-| 研发费用 | 117 | 276 | 379 | 431 |
-| 研发费用率\% | $7.5 \%$ | $7.5 \%$ | $7.5 \%$ | $7.5 \%$ |
-| EBIT | 213 | 534 | 814 | 1,074 |
-| 财务费用 | 7 | 1 | 11 | 19 |
-| 财务费用率\% | $0.4 \%$ | $0.0 \%$ | $0.2 \%$ | $0.3 \%$ |
-| 资产减值损失 | -33 | -63 | -86 | -98 |
-| 投资收益 | 5 | 9 | 13 | 14 |
-| 营业利润 | 212 | 531 | 800 | 1,040 |
-| 营业外收支 | -25 | -8 | -3 | -3 |
-| 利润总额 | 187 | 523 | 797 | 1,037 |
-| EBITDA | 282 | 582 | 865 | 1,129 |
-| 所得税 | 14 | 52 | 80 | 104 |
-| 有效所得税率\% | $7.7 \%$ | $10.0 \%$ | $10.0 \%$ | $10.0 \%$ |
-| 少数股东损益 | 0 | 0 | 0 | $\mathbf{0}-1-2$ |
-| 归属母公司所有者净利润 | 173 | 471 | 717 | 933 |
-| 资产负债表(百万元) | 2021 | 2022E | 2023E | $2024 E$ |
-| :---: | :---: | :---: | :---: | :---: |
-| 货币资金 | 427 | 494 | 1,269 | 1,965 |
-| 应收账款及应收票据 | 1,173 | 2,806 | 3,798 | 4,344 |
-| 存货 | 558 | 1,115 | 1,596 | 1,857 |
-| 其它流动资产 | 266 | 578 | 736 | 778 |
-| 流动资产合计 | 2,424 | 4,992 | 7,400 | 8,943 |
-| 长期股权投资 | 0 | 0 | 0 | 0 |
-| 固定资产 | 370 | 429 | 491 | 516 |
-| 在建工程 | 169 | 183 | 205 | 226 |
-| 无形资产 | 42 | 56 | 69 | 80 |
-| 非流动资产合计 | 811 | 940 | 1,087 | 1,198 |
-| 资产总计 | 3,235 | 5,932 | 8,487 | 10,141 |
-| 短期借款 | 28 | 68 | 388 | 452 |
-| 应付票据及应付账款 | 1,401 | 3,197 | 4,302 | 4,760 |
-| 预收账款 | 0 | 0 | 0 | 0 |
-| 其它流动负债 | 560 | 887 | 1,214 | 1,314 |
-| 流动负债合计 | 1,989 | 4,152 | 5,904 | 6,527 |
-| 长期借款 | 0 | 0 | 0 | 0 |
-| 其它长期负债 | 92 | 92 | 92 | 92 |
-| 非流动负债合计 | 92 | 92 | 92 | 92 |
-| 负债总计 | 2,081 | 4,243 | 5,996 | 6,619 |
-| 实收资本 | 162 | 162 | 162 | 162 |
-| 普通股股东权益 | 1,154 | 1,688 | 2,491 | 3,522 |
-| 少数股东权益 | 0 | 0 | 0 | 0 |
-| 负债和所有者权益合计 | 3,235 | 5,932 | 8,487 | 10,141 |
-## 信息披露
-## 分析师与研究助理简介
-倪正洋，2021 年加入德邦证券，任研究所大制造组组长、机械行业首席分析师，拥有 5 年机械研究经验，1 年高端装备产业经验，南京大学材料学学士、上海交通大学材料学硕士。2020 年获得 iFinD 机械行业最具人气分析师, 所在团队曾获机械行业 2019 年新财富第三名，2017 年新财富第二名，2017 年金牛奖第二名，2016 年新财富第四名。
-## 分析师声明
-本人具有中国证券业协会授予的证券投资咨询执业资格，以勤勉的职业态度，独立、客观地出具本报告。本报告所采用的数据和信息均来自市场公开信息, 本人不保证该等信息的准确性或完整性。分析逻辑基于作者的职业理解，清晰准确地反映了作者的研究观点，结论不受任何第三方的授意或影响，特此声明。
-## 投资评级说明
-1.投资评级的比较和评级标准:
-以报告发布后的 6 个月内的市场表现为比较标准，报告发布日后 6 个月内的公司股价（或行业指数）的张跌幅相对同期市场基准指数的涨跌幅;
-2.市场基准指数的比较标准:
-A 股市场以上证综指或深证成指为基准；香港市场以恒生指数为基准；美国市场以标普 500 或纳斯达克综合指数为基准。
-<table>
-    <tr>
-        <td rowspan="11">1. 投资评级的比较和评级标准: 以报告发布后的 6 个月内的市场表 现为比较标准，报告发布日后 6 个 月内的公司股价(或行业指数)的 涨跌幅相对同期市场基准指数的涨 跌幅：<br> 2. 市场基准指数的比较标准: A股市场以上证综指或深证成指为基 准; 香港市场以恒生指数为基准; 美 国市场以标普500或纳斯达克综合指 数为基准。</td>
-    </tr>
-    <tr>
-        <td>类型</td>
-        <td>评级</td>
-        <td>说明</td>
-    </tr>
-        <td rowspan="5">股票评级</td>
-    </tr>
-    <tr>
-        <td>买入</td>
-        <td>相对强于市场表现 20%以上;</td>
-    </tr>
-    <tr>
-        <td>增持</td>
-        <td>相对强于市场表现 5% 20%;</td>
-    </tr>
-    <tr>
-        <td>中性</td>
-        <td>相对市场表现在-5% +5%之间波动；</td>
-    </tr>
-    <tr>
-        <td>减持</td>
-        <td>相对弱于市场表现 5%以下。</td>
-    </tr>
-    <tr>
-        <td rowspan="4">行业投资评级</td>
-    </tr>
-    <tr>
-        <td>优于大市</td>
-        <td>预期行业整体回报高于基准指数整体水平10%以上;</td>
-    </tr>
-    <tr>
-        <td>中性</td>
-        <td>预期行业整体回报介于基准指数整体水平-10%与 10%之间;</td>
-    </tr>
-    <tr>
-        <td>弱于大市</td>
-        <td>预期行业整体回报低于基准指数整体水平 10%以下。</td>
-    </tr>
-    <tr>
-</table>
-## 法律声明
-本报告仅供德邦证券股份有限公司（以下简称 “本公司”）的客户使用。本公司不会因接收人收到本报告而视其为客户。在任何情况下，本报告中的信息或所表述的意见并不构成对任何人的投资建议。在任何情况下，本公司不对任何人因使用本报告中的任何内容所引致的任何损失负任何责任。
-本报告所载的资料、意见及推测仅反映本公司于发布本报告当日的判断，本报告所指的证券或投资标的的价格、价值及投资收入可能会波动。在不同时期，本公司可发出与本报告所载资料、意见及推测不一致的报告。
-市场有风险，投资需谨慎。本报告所载的信息、材料及结论只提供特定客户作参考，不构成投资建议，也没有考虑到个别客户特殊的投资目标、财务状况或需要。客户应考虑本报告中的任何意见或建议是否符合其特定状况。在法律许可的情况下，德邦证券及其所属关联机构可能会持有报告中提到的公司所发行的证券并进行交易，还可能为这些公司提供投资银行服务或其他服务。
-本报告仅向特定客户传送，未经德邦证券研究所书面授权，本研究报告的任何部分均不得以任何方式制作任何形式的拷贝、复印件或复制品，或再次分发给任何其他人，或以任何侵犯本公司版权的其他方式使用。所有本报告中使用的商标、服务标记及标记均为本公司的商标、服务标记及标记。如欲引用或转载本文内容, 务必联络德邦证券研究所并获得许可, 并需注明出处为德邦证券研究所，且不得对本文进行有悖原意的引用和删改。
-根据中国证监会核发的经营证券业务许可，德邦证券股份有限公司的经营范围包括证券投资咨询业务。
\ No newline at end of file
--- a/tests/benchmark/datasets/pdf/academic_literature_0b2c9c91f5232541a7ace8984df306b2.pdf
+++ b/tests/benchmark/datasets/pdf/academic_literature_0b2c9c91f5232541a7ace8984df306b2.pdf
--- a/tests/benchmark/datasets/pdf/academic_literature_f7904bc37cc2e25c1e3e412978854b10.pdf
+++ b/tests/benchmark/datasets/pdf/academic_literature_f7904bc37cc2e25c1e3e412978854b10.pdf
--- a/tests/benchmark/datasets/pdf/academic_literature_fbdb99151e811688574c0c4c67341074.pdf
+++ b/tests/benchmark/datasets/pdf/academic_literature_fbdb99151e811688574c0c4c67341074.pdf
--- a/tests/benchmark/datasets/pdf/ordinary_textbook_1d9a847603a5e37e379738316820850d.pdf
+++ b/tests/benchmark/datasets/pdf/ordinary_textbook_1d9a847603a5e37e379738316820850d.pdf
--- a/tests/benchmark/datasets/pdf/research_report_1f978cd81fb7260c8f7644039ec2c054.pdf
+++ b/tests/benchmark/datasets/pdf/research_report_1f978cd81fb7260c8f7644039ec2c054.pdf
--- a/tests/benchmark/env.sh
+++ b/tests/benchmark/env.sh
-conda create -n MinerU python=3.10
-conda activate MinerU
-pip install magic-pdf
-#cp magic-pdf.template.json ~/magic-pdf.json
\ No newline at end of file
--- a/tests/benchmark/magic-pdf.json
+++ b/tests/benchmark/magic-pdf.json
-{
-    "bucket_info":{
-        "bucket-name-1":["ak", "sk", "endpoint"],
-        "bucket-name-2":["ak", "sk", "endpoint"]
-    },
-    "temp-output-dir":"/tmp",
-    "models-dir":"/tmp/models",
-    "device-mode":"cpu"
-}
\ No newline at end of file
--- a/tests/benchmark/pre_clean.py
+++ b/tests/benchmark/pre_clean.py
-"""
-clean data
-"""
-import argparse
-import os
-import re
-import htmltabletomd # type: ignore
-import pypandoc
-import argparse
-parser = argparse.ArgumentParser(description="get tool type")
-parser.add_argument(
-    "--tool_name",
-    type=str,
-    required=True,
-    help="input tool name",
-)
-parser.add_argument(
-    "--download_dir",
-    type=str,
-    required=True,
-    help="input download dir",
-)
-args = parser.parse_args()
-def clean_markdown_images(content):
-    """
-    clean markdown images
-    """
-    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)  
-    cleaned_content = pattern.sub('', content)   
-    return cleaned_content
-def clean_ocrmath_photo(content):
-    """
-    clean ocrmath photo
-    """
-    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)  
-    cleaned_content = pattern.sub('', content)   
-    return cleaned_content
-def convert_html_table_to_md(html_table):
-    """
-    convert html table to markdown table
-    """
-    lines = html_table.strip().split('\n')  
-    md_table = ''  
-    if lines and '<tr>' in lines[0]:  
-        in_thead = True  
-        for line in lines:  
-            if '<th>' in line:  
-                cells = re.findall(r'<th>(.*?)</th>', line)  
-                md_table += '| ' + ' | '.join(cells) + ' |\n'  
-                in_thead = False  
-            elif '<td>' in line and not in_thead:  
-                cells = re.findall(r'<td>(.*?)</td>', line)  
-                md_table += '| ' + ' | '.join(cells) + ' |\n'  
-        md_table = md_table.rstrip() + '\n'    
-    return md_table  
-def convert_latext_to_md(content):
-    """
-    convert latex table to markdown table
-    """
-    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)  
-    placeholders = []  
-    for table in tables:  
-        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
-        replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
-        content = content.replace(replace_str, placeholder)  
-        try:
-            pypandoc.convert_text(replace_str,  format="latex", to="md", outputfile="output.md", encoding="utf-8")
-        except:
-            markdown_string = replace_str
-        else: 
-            markdown_string = open('output.md', 'r', encoding='utf-8').read()
-        placeholders.append((placeholder, markdown_string)) 
-    new_content = content  
-    for placeholder, md_table in placeholders:  
-        new_content = new_content.replace(placeholder, md_table)  
-        # 写入文件  
-    return new_content
-def convert_htmltale_to_md(content):
-    """
-    convert html table to markdown table
-    """
-    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)  
-    placeholders = []
-    for table in tables:
-        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
-        content = content.replace(f"<table>{table}</table>", placeholder)  
-        try:
-            convert_table = htmltabletomd.convert_table(table)
-        except:
-            convert_table = table
-        placeholders.append((placeholder,convert_table)) 
-    new_content = content  
-    for placeholder, md_table in placeholders:  
-        new_content = new_content.replace(placeholder, md_table)  
-        # 写入文件  
-    return new_content
-def clean_data(prod_type, download_dir):
-    """
-    clean data
-    """
-    tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
-    if not os.path.exists(tgt_dir):  
-        os.makedirs(tgt_dir) 
-    source_dir = os.path.join(download_dir, prod_type)
-    filenames = os.listdir(source_dir)
-    for filename in filenames:
-        if filename.endswith('.md'):
-            input_file = os.path.join(source_dir, filename)
-            output_file = os.path.join(tgt_dir, "cleaned_" + filename)
-            with open(input_file, 'r', encoding='utf-8') as fr:
-                content = fr.read()
-                new_content = clean_markdown_images(content)
-                new_content = convert_html_table_to_md(new_content)
-                new_content = convert_latext_to_md(new_content)
-                new_content = convert_htmltale_to_md(new_content)
-                with open(output_file, 'w', encoding='utf-8') as fw:
-                    fw.write(new_content)
-if __name__ == '__main__':
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    clean_data(tool_type, download_dir)
--- a/tests/benchmark/result.json
+++ b/tests/benchmark/result.json
-{"average_sim_score":0, "average_edit_distance":0, "average_bleu_score": 0}
\ No newline at end of file
--- a/tests/benchmark/scoring.py
+++ b/tests/benchmark/scoring.py
-import math
-from rapidfuzz import fuzz
-import re
-import regex
-from statistics import mean
-CHUNK_MIN_CHARS = 25
-def chunk_text(text, chunk_len=500):
-    chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
-    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
-    return chunks
-def overlap_score(hypothesis_chunks, reference_chunks):
-    if len(reference_chunks) > 0:
-        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
-    else:
-        length_modifier = 0
-    search_distance = max(len(reference_chunks) // 5, 10)
-    chunk_scores = []
-    for i, hyp_chunk in enumerate(hypothesis_chunks):
-        max_score = 0
-        total_len = 0
-        i_offset = int(i * length_modifier)
-        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
-        for j in chunk_range:
-            ref_chunk = reference_chunks[j]
-            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
-            if score > max_score:
-                max_score = score
-                total_len = len(ref_chunk)
-        chunk_scores.append(max_score)
-    return chunk_scores
-def score_text(hypothesis, reference):
-    # Returns a 0-1 alignment score
-    hypothesis_chunks = chunk_text(hypothesis)
-    reference_chunks = chunk_text(reference)
-    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
-    if len(chunk_scores) > 0:
-        mean_score = mean(chunk_scores)
-        return mean_score
-    else:
-        return 0
-    #return mean(chunk_scores)
\ No newline at end of file
--- a/tests/test_cli/conf/conf.py
+++ b/tests/test_cli/conf/conf.py
@@ -2,6 +2,6 @@ import os
 conf = {
 "code_path": os.environ.get('GITHUB_WORKSPACE'),
 "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
-"pdf_res_path": "/tmp"
+"pdf_res_path": "/tmp/magic-pdf"
 }
--- a/tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.json
+++ b/tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.json