Unverified commit b912797a authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #913 from DTwz/dev

Modify the test directory
parents 1e37e199 a09d9e8c
{
"temp-output-dir": "/tmp/"
}
import argparse
import json
from collections import Counter

import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from pandas import isnull
from sklearn import metrics
from sklearn.metrics import classification_report

def indicator_cal(json_standard, json_test):
    json_standard = pd.DataFrame(json_standard)
    json_test = pd.DataFrame(json_test)

    '''Overall dataset metrics'''
    a = json_test[['id', 'mid_json']]
    b = json_standard[['id', 'mid_json', 'pass_label']]
    outer_merge = pd.merge(a, b, on='id', how='outer')
    # pd.merge keeps the left frame's columns first, so the test column
    # precedes the standard one
    outer_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']
    standard_exist = outer_merge.standard_mid_json.apply(lambda x: not isnull(x))
    test_exist = outer_merge.test_mid_json.apply(lambda x: not isnull(x))
    # treat "does this id exist on each side" as a binary classification
    overall_report = {}
    overall_report['accuracy'] = metrics.accuracy_score(standard_exist, test_exist)
    overall_report['precision'] = metrics.precision_score(standard_exist, test_exist)
    overall_report['recall'] = metrics.recall_score(standard_exist, test_exist)
    overall_report['f1_score'] = metrics.f1_score(standard_exist, test_exist)

    inner_merge = pd.merge(a, b, on='id', how='inner')
    inner_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']
    json_standard = inner_merge['standard_mid_json']  # check that both sides are aligned
    json_test = inner_merge['test_mid_json']
    '''Batch-read the intermediate JSON generated for every PDF'''
    test_inline_equations = []
    test_interline_equations = []
    test_inline_equations_bboxs = []
    test_interline_equations_bboxs = []
    test_dropped_text_bboxes = []
    test_dropped_text_tag = []
    test_dropped_image_bboxes = []
    test_dropped_table_bboxes = []
    test_preproc_num = []  # reading order
    test_para_num = []
    test_para_text = []
    for i in json_test:
        mid_json = pd.DataFrame(i)
        mid_json = mid_json.iloc[:, :-1]
        for j1 in mid_json.loc['inline_equations', :]:
            page_in_text = []
            page_in_bbox = []
            for k1 in j1:
                page_in_text.append(k1['latex_text'])
                page_in_bbox.append(k1['bbox'])
            test_inline_equations.append(page_in_text)
            test_inline_equations_bboxs.append(page_in_bbox)
        for j2 in mid_json.loc['interline_equations', :]:
            page_in_text = []
            page_in_bbox = []
            for k2 in j2:
                page_in_text.append(k2['latex_text'])
                page_in_bbox.append(k2['bbox'])
            test_interline_equations.append(page_in_text)
            test_interline_equations_bboxs.append(page_in_bbox)
        for j3 in mid_json.loc['droped_text_block', :]:
            page_in_bbox = []
            page_in_tag = []
            for k3 in j3:
                page_in_bbox.append(k3['bbox'])
                # use a 'None' placeholder when k3 has no 'tag' key
                if 'tag' in k3.keys():
                    page_in_tag.append(k3['tag'])
                else:
                    page_in_tag.append('None')
            test_dropped_text_tag.append(page_in_tag)
            test_dropped_text_bboxes.append(page_in_bbox)
        for j4 in mid_json.loc['droped_image_block', :]:
            test_dropped_image_bboxes.append(j4)
        for j5 in mid_json.loc['droped_table_block', :]:
            test_dropped_table_bboxes.append(j5)
        for j6 in mid_json.loc['preproc_blocks', :]:
            page_in = []
            for k6 in j6:
                page_in.append(k6['number'])
            test_preproc_num.append(page_in)
        test_pdf_text = []
        for j7 in mid_json.loc['para_blocks', :]:
            test_para_num.append(len(j7))
            for k7 in j7:
                test_pdf_text.append(k7['text'])
        test_para_text.append(test_pdf_text)
    standard_inline_equations = []
    standard_interline_equations = []
    standard_inline_equations_bboxs = []
    standard_interline_equations_bboxs = []
    standard_dropped_text_bboxes = []
    standard_dropped_text_tag = []
    standard_dropped_image_bboxes = []
    standard_dropped_table_bboxes = []
    standard_preproc_num = []  # reading order
    standard_para_num = []
    standard_para_text = []
    for i in json_standard:
        mid_json = pd.DataFrame(i)
        mid_json = mid_json.iloc[:, :-1]
        for j1 in mid_json.loc['inline_equations', :]:
            page_in_text = []
            page_in_bbox = []
            for k1 in j1:
                page_in_text.append(k1['latex_text'])
                page_in_bbox.append(k1['bbox'])
            standard_inline_equations.append(page_in_text)
            standard_inline_equations_bboxs.append(page_in_bbox)
        for j2 in mid_json.loc['interline_equations', :]:
            page_in_text = []
            page_in_bbox = []
            for k2 in j2:
                page_in_text.append(k2['latex_text'])
                page_in_bbox.append(k2['bbox'])
            standard_interline_equations.append(page_in_text)
            standard_interline_equations_bboxs.append(page_in_bbox)
        for j3 in mid_json.loc['droped_text_block', :]:
            page_in_bbox = []
            page_in_tag = []
            for k3 in j3:
                page_in_bbox.append(k3['bbox'])
                if 'tag' in k3.keys():
                    page_in_tag.append(k3['tag'])
                else:
                    page_in_tag.append('None')
            standard_dropped_text_bboxes.append(page_in_bbox)
            standard_dropped_text_tag.append(page_in_tag)
        for j4 in mid_json.loc['droped_image_block', :]:
            standard_dropped_image_bboxes.append(j4)
        for j5 in mid_json.loc['droped_table_block', :]:
            standard_dropped_table_bboxes.append(j5)
        for j6 in mid_json.loc['preproc_blocks', :]:
            page_in = []
            for k6 in j6:
                page_in.append(k6['number'])
            standard_preproc_num.append(page_in)
        standard_pdf_text = []
        for j7 in mid_json.loc['para_blocks', :]:
            standard_para_num.append(len(j7))
            for k7 in j7:
                standard_pdf_text.append(k7['text'])
        standard_para_text.append(standard_pdf_text)
"""
在计算指标之前最好先确认基本统计信息是否一致
"""
'''
计算pdf之间的总体编辑距离和bleu
这里只计算正例的pdf
'''
test_para_text=np.asarray(test_para_text, dtype = object)[inner_merge['pass_label']=='yes']
standard_para_text=np.asarray(standard_para_text, dtype = object)[inner_merge['pass_label']=='yes']
pdf_dis=[]
pdf_bleu=[]
for a,b in zip(test_para_text,standard_para_text):
a1=[ ''.join(i) for i in a]
b1=[ ''.join(i) for i in b]
pdf_dis.append(Levenshtein_Distance(a1,b1))
pdf_bleu.append(sentence_bleu([a1],b1))
overall_report['pdf间的平均编辑距离']=np.mean(pdf_dis)
overall_report['pdf间的平均bleu']=np.mean(pdf_bleu)
    '''Edit distance and BLEU of inline equations'''
    dis1 = []
    bleu1 = []
    test_inline_equations = [''.join(i) for i in test_inline_equations]
    standard_inline_equations = [''.join(i) for i in standard_inline_equations]
    for a, b in zip(test_inline_equations, standard_inline_equations):
        if len(a) == 0 and len(b) == 0:
            continue
        if a == b:
            dis1.append(0)
            bleu1.append(1)
        else:
            dis1.append(Levenshtein_Distance(a, b))
            bleu1.append(sentence_bleu([a], b))
    inline_equations_edit = np.mean(dis1)
    inline_equations_bleu = np.mean(bleu1)

    '''Bbox matching metrics of inline equations'''
    inline_equations_bbox_report = bbox_match_indicator(test_inline_equations_bboxs, standard_inline_equations_bboxs)
    '''Edit distance and BLEU of interline equations'''
    dis2 = []
    bleu2 = []
    test_interline_equations = [''.join(i) for i in test_interline_equations]
    standard_interline_equations = [''.join(i) for i in standard_interline_equations]
    for a, b in zip(test_interline_equations, standard_interline_equations):
        if len(a) == 0 and len(b) == 0:
            continue
        if a == b:
            dis2.append(0)
            bleu2.append(1)
        else:
            dis2.append(Levenshtein_Distance(a, b))
            bleu2.append(sentence_bleu([a], b))
    interline_equations_edit = np.mean(dis2)
    interline_equations_bleu = np.mean(bleu2)

    '''Bbox matching metrics of interline equations'''
    interline_equations_bbox_report = bbox_match_indicator(test_interline_equations_bboxs, standard_interline_equations_bboxs)
    '''Optionally verify first that the page and bbox counts agree'''
    '''Bbox matching metrics of droped_text_block'''
    test_text_bbox = []
    standard_text_bbox = []
    test_tag = []
    standard_tag = []
    index = 0
    for a, b in zip(test_dropped_text_bboxes, standard_dropped_text_bboxes):
        test_page_tag = []
        standard_page_tag = []
        test_page_bbox = []
        standard_page_bbox = []
        if not (len(a) == 0 and len(b) == 0):
            for i in range(len(b)):
                judge = 0
                standard_page_tag.append(standard_dropped_text_tag[index][i])
                standard_page_bbox.append(1)
                for j in range(len(a)):
                    if bbox_offset(b[i], a[j]):
                        judge = 1
                        test_page_tag.append(test_dropped_text_tag[index][j])
                        test_page_bbox.append(1)
                        break
                if judge == 0:
                    test_page_tag.append('None')
                    test_page_bbox.append(0)
            # the test side dropped more blocks than the standard side
            if len(test_dropped_text_tag[index]) + test_page_tag.count('None') > len(standard_dropped_text_tag[index]):
                test_page_tag1 = test_page_tag.copy()
                if 'None' in test_page_tag1:
                    # list.remove() mutates in place and returns None,
                    # so its result must not be assigned back
                    test_page_tag1.remove('None')
                diff = list((Counter(test_dropped_text_tag[index]) - Counter(test_page_tag1)).elements())
                test_page_tag.extend(diff)
                standard_page_tag.extend(['None'] * len(diff))
                test_page_bbox.extend([1] * len(diff))
                standard_page_bbox.extend([0] * len(diff))
            test_tag.extend(test_page_tag)
            standard_tag.extend(standard_page_tag)
            test_text_bbox.extend(test_page_bbox)
            standard_text_bbox.extend(standard_page_bbox)
        index += 1
    text_block_report = {}
    text_block_report['accuracy'] = metrics.accuracy_score(standard_text_bbox, test_text_bbox)
    text_block_report['precision'] = metrics.precision_score(standard_text_bbox, test_text_bbox)
    text_block_report['recall'] = metrics.recall_score(standard_text_bbox, test_text_bbox)
    text_block_report['f1_score'] = metrics.f1_score(standard_text_bbox, test_text_bbox)

    '''Precision, recall and F1-score of the dropped text blocks' tags'''
    text_block_tag_report = classification_report(y_true=standard_tag, y_pred=test_tag, output_dict=True)
    del text_block_tag_report['None']
    del text_block_tag_report['macro avg']
    del text_block_tag_report['weighted avg']
    '''Bbox matching metrics of droped_image_block'''
    '''Note: the data formats are not fully consistent here'''
    image_block_report = bbox_match_indicator(test_dropped_image_bboxes, standard_dropped_image_bboxes)

    '''Bbox matching metrics of droped_table_block'''
    table_block_report = bbox_match_indicator(test_dropped_table_bboxes, standard_dropped_table_bboxes)

    '''Mean edit distance of the reading order'''
    preproc_num_dis = []
    for a, b in zip(test_preproc_num, standard_preproc_num):
        preproc_num_dis.append(Levenshtein_Distance(a, b))
    preproc_num_edit = np.mean(preproc_num_dis)

    '''Paragraph-segmentation accuracy'''
    test_para_num = np.array(test_para_num)
    standard_para_num = np.array(standard_para_num)
    acc_para = np.mean(test_para_num == standard_para_num)
    output = pd.DataFrame()
    output['overall metrics'] = [overall_report]
    output['mean edit distance of inline equations'] = [inline_equations_edit]
    output['mean edit distance of interline equations'] = [interline_equations_edit]
    output['mean BLEU of inline equations'] = [inline_equations_bleu]
    output['mean BLEU of interline equations'] = [interline_equations_bleu]
    output['inline equation recognition metrics'] = [inline_equations_bbox_report]
    output['interline equation recognition metrics'] = [interline_equations_bbox_report]
    output['mean edit distance of reading order'] = [preproc_num_edit]
    output['paragraph segmentation accuracy'] = [acc_para]
    output['dropped text block metrics'] = [text_block_report]
    output['dropped image block metrics'] = [image_block_report]
    output['dropped table block metrics'] = [table_block_report]
    output['dropped text block tag metrics'] = [text_block_tag_report]
    return output
"""
计算编辑距离
"""
def Levenshtein_Distance(str1, str2):
matrix = [[ i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
for i in range(1, len(str1)+1):
for j in range(1, len(str2)+1):
if(str1[i-1] == str2[j-1]):
d = 0
else:
d = 1
matrix[i][j] = min(matrix[i-1][j]+1, matrix[i][j-1]+1, matrix[i-1][j-1]+d)
return matrix[len(str1)][len(str2)]
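
# A quick sanity check of the distance function (illustrative only, not part
# of the original script); the classic "kitten" -> "sitting" pair needs three
# edits, and the function also works on lists, not just strings:
#     assert Levenshtein_Distance("kitten", "sitting") == 3
#     assert Levenshtein_Distance(["a", "b"], ["a", "c"]) == 1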

'''
Check whether the offset between two bboxes is within the accepted tolerance.
'''
def bbox_offset(b_t, b_s):
    '''b_t is the bbox from the test doc; b_s is the bbox from the standard doc'''
    x1_t, y1_t, x2_t, y2_t = b_t
    x1_s, y1_s, x2_s, y2_s = b_s
    x1 = max(x1_t, x1_s)
    x2 = min(x2_t, x2_s)
    y1 = max(y1_t, y1_s)
    y2 = min(y2_t, y2_s)
    if x2 <= x1 or y2 <= y1:
        # disjoint boxes: without this guard, two negative extents would
        # multiply into a spurious positive overlap area
        return False
    area_overlap = (x2 - x1) * (y2 - y1)
    area_union = (x2_t - x1_t) * (y2_t - y1_t) + (x2_s - x1_s) * (y2_s - y1_s) - area_overlap
    # the boxes match when the overlap dominates the non-overlapping remainder
    if area_union - area_overlap == 0 or area_overlap / (area_union - area_overlap) > 0.95:
        return True
    return False
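
# Worked example (illustrative only): a box match requires the overlap to
# exceed roughly 0.95x the non-overlapping remainder, so a one-unit shift of
# a 100-unit box still passes while a 60-unit shift does not:
#     assert bbox_offset((0, 0, 100, 100), (0, 0, 100, 100)) is True    # identical
#     assert bbox_offset((1, 0, 100, 100), (0, 0, 100, 100)) is True    # tiny offset
#     assert bbox_offset((60, 0, 160, 100), (0, 0, 100, 100)) is False  # large offset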

'''
Match and align bboxes, then output the related metrics.
The input is a list of bboxes grouped by page.
'''
def bbox_match_indicator(test_bbox_list, standard_bbox_list):
    test_bbox = []
    standard_bbox = []
    for a, b in zip(test_bbox_list, standard_bbox_list):
        test_page_bbox = []
        standard_page_bbox = []
        if not (len(a) == 0 and len(b) == 0):
            for i in b:
                # skip malformed bboxes that do not have exactly four coordinates
                if len(i) != 4:
                    continue
                judge = 0
                standard_page_bbox.append(1)
                for j in a:
                    if bbox_offset(i, j):
                        judge = 1
                        test_page_bbox.append(1)
                        break
                if judge == 0:
                    test_page_bbox.append(0)
            # the test side has more boxes than the standard side
            diff_num = len(a) + test_page_bbox.count(0) - len(b)
            if diff_num > 0:
                test_page_bbox.extend([1] * diff_num)
                standard_page_bbox.extend([0] * diff_num)
        test_bbox.extend(test_page_bbox)
        standard_bbox.extend(standard_page_bbox)
    block_report = {}
    block_report['accuracy'] = metrics.accuracy_score(standard_bbox, test_bbox)
    block_report['precision'] = metrics.precision_score(standard_bbox, test_bbox)
    block_report['recall'] = metrics.recall_score(standard_bbox, test_bbox)
    block_report['f1_score'] = metrics.f1_score(standard_bbox, test_bbox)
    return block_report
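
# Hedged usage sketch (the boxes are illustrative): per-page bbox lists for a
# two-page document. Page 1 matches exactly, page 2 misses one standard box,
# so precision stays at 1.0 while recall drops to 0.5:
#     test = [[(0, 0, 10, 10)], []]
#     standard = [[(0, 0, 10, 10)], [(5, 5, 20, 20)]]
#     report = bbox_match_indicator(test, standard)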

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test', type=str)
    parser.add_argument('--standard', type=str)
    args = parser.parse_args()
    pdf_json_test = [json.loads(line)
                     for line in open(args.test, 'r', encoding='utf-8')]
    pdf_json_standard = [json.loads(line)
                         for line in open(args.standard, 'r', encoding='utf-8')]
    overall_indicator = indicator_cal(pdf_json_standard, pdf_json_test)
    '''Write the computed metrics to overall_indicator_output.json'''
    overall_indicator.to_json('overall_indicator_output.json', orient='records', lines=True, force_ascii=False)
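
# Hypothetical invocation (script and file names are illustrative): both
# inputs are JSON-lines files with one record per PDF, each carrying 'id' and
# 'mid_json', plus 'pass_label' on the standard side:
#     python indicator_cal.py --test test_mid.jsonl --standard standard_mid.jsonl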
# "scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622" 新的特殊扫描版,需要更新检测规则
# demo_meta_scan(book_name="zlib/zlib_21814955")
# demo_meta_scan(book_name="zlib/zlib_22115997") # meta_scan
# demo_classify_by_type("scihub/scihub_04600000/libgen.scimag04690000-04690999.zip_10.1016/s0378-4347(98)00269-2") # classify
# demo_parse_pdf("scihub/scihub_28400000/libgen.scimag28413000-28413999.zip_10.2307/1316224")
# demo_parse_pdf(book_name="scihub/scihub_65300000/libgen.scimag65336000-65336999.zip_10.1021/acs.jcim.7b00151") # parse_pdf
# demo_parse_pdf(book_name="scihub/scihub_76500000/libgen.scimag76506000-76506999.zip_10.1016/j.nanoen.2019.103943") # parse_pdf
# demo_parse_pdf(book_name="zlib/zlib_22115997") # parse_pdf
# demo_parse_pdf(book_name="zlib/zlib_21814957") # parse_pdf
# demo_parse_pdf(book_name="zlib/zlib_21929367",start_page_id=48) # parse_pdf
# demo_save_tables(book_name="scihub/scihub_17200000/libgen.scimag17236000-17236999.zip_10.1016/s0002-9440(10)65013-4") # parse_pdf
# demo_parse_pdf(book_name="scihub/scihub_50200000/libgen.scimag50226000-50226999.zip_10.0000/cyberleninka.ru/article/n/chislennoe-modelirovanie-udarnogo-vozdeystviya-vysokoskorostnoy-strui-na-tverdoe-telo")
# infinite loop that blows up memory
# demo_parse_pdf(book_name="scihub/scihub_10400000/libgen.scimag10461000-10461999.zip_10.1038/4671055a")
# demo_parse_pdf(book_name="scihub/scihub_36400000/libgen.scimag36418000-36418999.zip_10.1038/scientificamerican0115-82b")
# 10000-page PDF; the intermediate dict grows too large
# demo_parse_pdf(book_name="zlib/zlib_17498382")
# demo_parse_pdf(book_name="zlib/zlib_22006221")
# footnote
# demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09782000-09782999.zip_10.1111/j.1540-627x.2006.00176.x")
# demo_parse_pdf(book_name="scihub/scihub_17400000/libgen.scimag17488000-17488999.zip_10.1016/s0043-1354(02)00581-x")
# demo_parse_pdf(book_name="scihub/scihub_17000000/libgen.scimag17000000-17000999.zip_10.1016/j.pain.2004.06.005")
# demo_parse_pdf(book_name="scihub/scihub_46700000/libgen.scimag46727000-46727999.zip_10.2174/157341210791202627")
# demo_parse_pdf(book_name="scihub/scihub_68900000/libgen.scimag68948000-68948999.zip_10.1002/uog.18760")
# demo_parse_pdf(book_name="scihub/scihub_64600000/libgen.scimag64628000-64628999.zip_10.3892/mmr.2017.6343")
# demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")
# demo_parse_pdf(book_name="scihub/scihub_30400000/libgen.scimag30438000-30438999.zip_10.1021/ja048851k")
# demo_parse_pdf(book_name="scihub/scihub_43600000/libgen.scimag43628000-43628999.zip_10.1093/toxsci/kfi111")
# demo_parse_pdf(book_name="scihub/scihub_14000000/libgen.scimag14081000-14081999.zip_10.1016/s0923-4748(00)00034-5")
# demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")
demo_parse_pdf(book_name="scihub/scihub_43200000/libgen.scimag43208000-43208999.zip_10.1645/0022-3395(2000)086[0275:podsri]2.0.co;2")
# demo_parse_pdf(book_name="scihub/scihub_55700000/libgen.scimag55717000-55717999.zip_10.1007/s10067-016-3303-0")
# missing content
# demo_parse_pdf(book_name="scihub/scihub_76900000/libgen.scimag76990000-76990999.zip_10.1145/3314111.3319829")
# demo_parse_pdf(book_name="scihub/scihub_84600000/libgen.scimag84652000-84652999.zip_10.1163/1876312x-00001010")
# block lines out of order
# demo_parse_pdf(book_name="scihub/scihub_04800000/libgen.scimag04807000-04807999.zip_10.1016/s0927-7765(97)00029-5")
# demo_parse_pdf(book_name="scihub/scihub_37000000/libgen.scimag37007000-37007999.zip_10.0000/docserver.ingentaconnect.com/generic-24bf2a7237e7")
# demo_parse_pdf(book_name="scihub/scihub_44700000/libgen.scimag44733000-44733999.zip_10.1063/1.3631048")
# demo_parse_pdf(book_name="scihub/scihub_21100000/libgen.scimag21124000-21124999.zip_10.1080/10447318.2002.9669130")
# demo_parse_pdf(book_name="scihub/scihub_80100000/libgen.scimag80185000-80185999.zip_10.1353/sib.0.0003")
# footnote
demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09782000-09782999.zip_10.1111/j.1540-627x.2006.00176.x")
demo_parse_pdf(book_name="scihub/scihub_17400000/libgen.scimag17488000-17488999.zip_10.1016/s0043-1354(02)00581-x")
demo_parse_pdf(book_name="scihub/scihub_17000000/libgen.scimag17000000-17000999.zip_10.1016/j.pain.2004.06.005")
demo_parse_pdf(book_name="scihub/scihub_46700000/libgen.scimag46727000-46727999.zip_10.2174/157341210791202627")
demo_parse_pdf(book_name="scihub/scihub_64600000/libgen.scimag64628000-64628999.zip_10.3892/mmr.2017.6343")
demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")
demo_parse_pdf(book_name="scihub/scihub_43600000/libgen.scimag43628000-43628999.zip_10.1093/toxsci/kfi111")
demo_parse_pdf(book_name="scihub/scihub_14000000/libgen.scimag14081000-14081999.zip_10.1016/s0923-4748(00)00034-5")
demo_parse_pdf(book_name="scihub/scihub_55700000/libgen.scimag55717000-55717999.zip_10.1007/s10067-016-3303-0")
demo_parse_pdf(book_name="scihub/scihub_86500000/libgen.scimag86560000-86560999.zip_10.1007/s10995-021-03207-2")
demo_parse_pdf(book_name="scihub/scihub_42100000/libgen.scimag42162000-42162999.zip_10.1093/notesj/gjm116")
demo_parse_pdf(book_name="scihub/scihub_07500000/libgen.scimag07500000-07500999.zip_10.1007/s00412-005-0007-7")
demo_parse_pdf(book_name="scihub/scihub_50800000/libgen.scimag50827000-50827999.zip_10.1210/jc.2015-4251")
demo_parse_pdf(book_name="scihub/scihub_07500000/libgen.scimag07537000-07537999.zip_10.1007/s004320050323")
demo_parse_pdf(book_name="scihub/scihub_24600000/libgen.scimag24665000-24665999.zip_10.1016/S0387-7604(89)80007-5")
demo_parse_pdf(book_name="scihub/scihub_76200000/libgen.scimag76297000-76297999.zip_10.4018/jehmc.2011040101")
demo_parse_pdf(book_name="scihub/scihub_29400000/libgen.scimag29456000-29456999.zip_10.1177/0883911505049656")
demo_parse_pdf(book_name="scihub/scihub_30200000/libgen.scimag30263000-30263999.zip_10.1081/scc-200036639")
demo_parse_pdf(book_name="scihub/scihub_71200000/libgen.scimag71224000-71224999.zip_10.1038/s41396-018-0231-9")
demo_parse_pdf(book_name="scihub/scihub_30100000/libgen.scimag30175000-30175999.zip_10.1300/j035v17n04_03")
demo_parse_pdf(book_name="scihub/scihub_18900000/libgen.scimag18981000-18981999.zip_10.1016/j.neuroimage.2006.06.030")
demo_parse_pdf(book_name="scihub/scihub_53100000/libgen.scimag53119000-53119999.zip_10.1097/01.npt.0000282350.63993.7a")
demo_parse_pdf(book_name="scihub/scihub_10000000/libgen.scimag10071000-10071999.zip_10.1111/j.1750-8606.2011.00190.x")
demo_parse_pdf(book_name="scihub/scihub_64000000/libgen.scimag64030000-64030999.zip_10.1080/1612197X.2017.1292302")
demo_parse_pdf(book_name="scihub/scihub_86000000/libgen.scimag86085000-86085999.zip_10.1016/j.enzmictec.2020.109742")
demo_parse_pdf(book_name="scihub/scihub_51000000/libgen.scimag51049000-51049999.zip_10.1117/12.2227997")
demo_parse_pdf(book_name="scihub/scihub_43700000/libgen.scimag43768000-43768999.zip_10.1063/1.4895640")
demo_parse_pdf(book_name="scihub/scihub_05000000/libgen.scimag05036000-05036999.zip_10.1046/j.1365-2036.2000.00699.x")
# text inside colored blocks
# demo_parse_pdf(book_name="scihub/scihub_87200000/libgen.scimag87202000-87202999.zip_10.1080/10220461.2021.1894971")
# 0226
demo_parse_pdf(book_name="scihub/scihub_76100000/libgen.scimag76174000-76174999.zip_10.1016/j.annemergmed.2019.01.040")
demo_parse_pdf(book_name="scihub/scihub_37400000/libgen.scimag37409000-37409999.zip_10.1177/0961203307085251")
demo_parse_pdf(book_name="scihub/scihub_26100000/libgen.scimag26107000-26107999.zip_10.1002/ajim.22195")
demo_parse_pdf(book_name="scihub/scihub_13100000/libgen.scimag13186000-13186999.zip_10.1016/j.brainres.2006.02.013")
demo_parse_pdf(book_name="scihub/scihub_75200000/libgen.scimag75297000-75297999.zip_10.1177/00393207160461-203")
demo_parse_pdf(book_name="scihub/scihub_34200000/libgen.scimag34255000-34255999.zip_10.1093/ojls/gqi025")
demo_parse_pdf(book_name="scihub/scihub_11300000/libgen.scimag11331000-11331999.zip_10.1208/s12249-011-9638-6")
import io
import json
import os

import boto3
from botocore.config import Config
from loguru import logger

from magic_pdf.libs.commons import fitz
from magic_pdf.libs.commons import join_path, json_dump_path, parse_bucket_key, read_file
from magic_pdf.libs.config_reader import get_s3_config_dict

test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"


def get_test_pdf_json(book_name):
    json_path = join_path(json_dump_path, book_name + ".json")
    s3_config = get_s3_config_dict(json_path)
    file_content = read_file(json_path, s3_config)
    json_str = file_content.decode('utf-8')
    json_object = json.loads(json_str)
    return json_object

def read_test_file(book_name):
    test_pdf_path = join_path(test_pdf_dir_path, book_name + ".pdf")
    s3_config = get_s3_config_dict(test_pdf_path)
    try:
        file_content = read_file(test_pdf_path, s3_config)
        return file_content
    except Exception as e:
        if "NoSuchKey" in str(e):
            logger.warning("File not found in test_pdf_path. Downloading from orig_s3_pdf_path.")
            try:
                json_object = get_test_pdf_json(book_name)
                orig_s3_pdf_path = json_object.get('file_location')
                s3_config = get_s3_config_dict(orig_s3_pdf_path)
                file_content = read_file(orig_s3_pdf_path, s3_config)
                # cache the downloaded PDF back into the test bucket
                s3_client = get_s3_client(test_pdf_path)
                bucket_name, bucket_key = parse_bucket_key(test_pdf_path)
                file_obj = io.BytesIO(file_content)
                s3_client.upload_fileobj(file_obj, bucket_name, bucket_key)
                return file_content
            except Exception as e:
                logger.exception(e)
        else:
            logger.exception(e)

def get_docs_from_test_pdf(book_name):
    file_content = read_test_file(book_name)
    return fitz.open("pdf", file_content)


def get_test_json_data(directory_path, json_file_name):
    with open(os.path.join(directory_path, json_file_name), "r", encoding='utf-8') as f:
        test_data = json.load(f)
    return test_data

def get_s3_client(path):
    s3_config = get_s3_config_dict(path)
    try:
        return boto3.client(
            "s3",
            aws_access_key_id=s3_config["ak"],
            aws_secret_access_key=s3_config["sk"],
            endpoint_url=s3_config["endpoint"],
            config=Config(s3={"addressing_style": "path"}, retries={"max_attempts": 8, "mode": "standard"}),
        )
    except Exception:
        # older boto3 versions do not support the retries.mode param
        return boto3.client(
            "s3",
            aws_access_key_id=s3_config["ak"],
            aws_secret_access_key=s3_config["sk"],
            endpoint_url=s3_config["endpoint"],
            config=Config(s3={"addressing_style": "path"}, retries={"max_attempts": 8}),
        )
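
# Hedged usage sketch (the book_name below is illustrative): open a cached
# test PDF as a fitz document, falling back to the original S3 location on a
# cache miss:
#     docs = get_docs_from_test_pdf("scihub/scihub_00000000/example")
#     print(len(docs))  # number of pages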
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil

from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               FileBasedDataWriter)


def test_filebased_reader_writer():
    unittest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
    sub_dir = os.path.join(unittest_dir, 'sub')
    abs_fn = os.path.join(unittest_dir, 'abspath.txt')
    os.makedirs(sub_dir, exist_ok=True)

    writer = FileBasedDataWriter(sub_dir)
    reader = FileBasedDataReader(sub_dir)

    # relative paths resolve against the reader/writer root directory
    writer.write('test.txt', b'hello world')
    assert reader.read('test.txt') == b'hello world'

    # absolute paths are used as-is
    writer.write(abs_fn, b'hello world')
    assert reader.read(abs_fn) == b'hello world'

    shutil.rmtree(unittest_dir)
import json
import os

import pytest

from magic_pdf.data.io.s3 import S3Reader, S3Writer


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='s3 config not found'
)
def test_s3_reader():
    """Test the S3 reader.

    The S3 config must be set in the environment:
    export S3_BUCKET=xxx; export S3_ACCESS_KEY=xxx;
    export S3_SECRET_KEY=xxx; export S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')
    reader = S3Reader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    bits = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert len(bits) > 0
    # read a byte range that holds exactly one JSON record
    bits = reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        566,
        713,
    )
    assert len(json.loads(bits)) > 0

@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='s3 config not found'
)
def test_s3_writer():
    """Test the S3 writer.

    The S3 config must be set in the environment:
    export S3_BUCKET=xxx; export S3_ACCESS_KEY=xxx;
    export S3_SECRET_KEY=xxx; export S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')
    writer = S3Writer(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    test_fn = 'unittest/io/test.jsonl'
    writer.write(test_fn, '123'.encode())
    # read the object back to verify the round trip
    reader = S3Reader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
    bits = reader.read(test_fn)
    assert bits.decode() == '123'