Unverified Commit b912797a authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #913 from DTwz/dev

Modify the test directory
parents 1e37e199 a09d9e8c
{
"temp-output-dir": "/tmp/"
}
import json
import pandas as pd
import numpy as np
import re
from nltk.translate.bleu_score import sentence_bleu
import time
import argparse
import os
from sklearn.metrics import classification_report,confusion_matrix
from collections import Counter
from sklearn import metrics
from pandas import isnull
def _collect_page_features(mid_json_docs):
    """Flatten per-PDF intermediate JSON documents into per-page feature lists.

    Args:
        mid_json_docs: iterable of intermediate JSON objects, one per PDF. Each
            is turned into a DataFrame whose columns are pages and whose index
            holds the field names read below.

    Returns:
        dict of lists. Every list has one entry per page (accumulated across
        all PDFs in order), except ``para_text`` which has one entry per PDF.
    """
    features = {
        'inline_equations': [],
        'interline_equations': [],
        'inline_equations_bboxes': [],
        'interline_equations_bboxes': [],
        'dropped_text_bboxes': [],
        'dropped_text_tags': [],
        'dropped_image_bboxes': [],
        'dropped_table_bboxes': [],
        'preproc_num': [],  # reading order
        'para_num': [],
        'para_text': [],
    }
    for doc in mid_json_docs:
        pages = pd.DataFrame(doc)
        # The last column is discarded before any field is read.
        # NOTE(review): presumably it is a non-page entry - confirm against
        # the code that produces mid_json.
        pages = pages.iloc[:, :-1]

        for equations in pages.loc['inline_equations', :]:
            features['inline_equations'].append(
                [eq['latex_text'] for eq in equations])
            features['inline_equations_bboxes'].append(
                [eq['bbox'] for eq in equations])

        for equations in pages.loc['interline_equations', :]:
            features['interline_equations'].append(
                [eq['latex_text'] for eq in equations])
            # Bboxes are collected for both sides; without them the
            # interline bbox report compares against empty pages.
            features['interline_equations_bboxes'].append(
                [eq['bbox'] for eq in equations])

        for blocks in pages.loc['droped_text_block', :]:
            features['dropped_text_bboxes'].append(
                [block['bbox'] for block in blocks])
            # Blocks without a 'tag' key are labelled with the string 'None'.
            features['dropped_text_tags'].append(
                [block.get('tag', 'None') for block in blocks])

        features['dropped_image_bboxes'].extend(
            pages.loc['droped_image_block', :])
        features['dropped_table_bboxes'].extend(
            pages.loc['droped_table_block', :])

        for blocks in pages.loc['preproc_blocks', :]:
            features['preproc_num'].append(
                [block['number'] for block in blocks])

        doc_text = []
        for paragraphs in pages.loc['para_blocks', :]:
            features['para_num'].append(len(paragraphs))
            doc_text.extend(paragraph['text'] for paragraph in paragraphs)
        features['para_text'].append(doc_text)
    return features


def _equation_text_metrics(test_pages, standard_pages):
    """Return (mean edit distance, mean BLEU) over per-page equation text.

    Each page's LaTeX strings are concatenated. Pages where both sides are
    empty are skipped; identical pages score distance 0 and BLEU 1 directly.
    """
    distances = []
    bleu_scores = []
    for test_page, standard_page in zip(test_pages, standard_pages):
        test_text = ''.join(test_page)
        standard_text = ''.join(standard_page)
        if len(test_text) == 0 and len(standard_text) == 0:
            continue
        if test_text == standard_text:
            distances.append(0)
            bleu_scores.append(1)
        else:
            distances.append(Levenshtein_Distance(test_text, standard_text))
            # sentence_bleu takes (references, hypothesis): the standard is
            # the reference, the test output is the hypothesis.
            bleu_scores.append(sentence_bleu([standard_text], test_text))
    return np.mean(distances), np.mean(bleu_scores)


def _match_dropped_text_blocks(test_bboxes, test_tags,
                               standard_bboxes, standard_tags):
    """Align dropped text blocks page by page.

    For every standard block the first test block accepted by ``bbox_offset``
    is taken as its match. Unmatched standard blocks get a 'None' tag and a 0
    flag on the test side; surplus test blocks get a 'None' tag and a 0 flag
    on the standard side.

    Returns:
        (test_flags, standard_flags, test_tag_labels, standard_tag_labels),
        four flat lists of equal length.
    """
    test_flags = []
    standard_flags = []
    test_labels = []
    standard_labels = []
    for page_idx, (test_page, standard_page) in enumerate(
            zip(test_bboxes, standard_bboxes)):
        if len(test_page) == 0 and len(standard_page) == 0:
            continue
        page_test_labels = []
        page_standard_labels = []
        page_test_flags = []
        page_standard_flags = []
        # Tags taken from real matches only, kept apart from the 'None'
        # placeholders so the surplus computation below is not distorted.
        matched_tags = []
        for i, standard_box in enumerate(standard_page):
            page_standard_labels.append(standard_tags[page_idx][i])
            page_standard_flags.append(1)
            for j, test_box in enumerate(test_page):
                if bbox_offset(standard_box, test_box):
                    tag = test_tags[page_idx][j]
                    matched_tags.append(tag)
                    page_test_labels.append(tag)
                    page_test_flags.append(1)
                    break
            else:
                page_test_labels.append('None')
                page_test_flags.append(0)

        # More test blocks than matches: the test side dropped extra blocks.
        if len(test_page) > len(matched_tags):
            surplus = list(
                (Counter(test_tags[page_idx]) - Counter(matched_tags)).elements())
            page_test_labels.extend(surplus)
            page_standard_labels.extend(['None'] * len(surplus))
            page_test_flags.extend([1] * len(surplus))
            page_standard_flags.extend([0] * len(surplus))

        test_labels.extend(page_test_labels)
        standard_labels.extend(page_standard_labels)
        test_flags.extend(page_test_flags)
        standard_flags.extend(page_standard_flags)
    return test_flags, standard_flags, test_labels, standard_labels


def indicator_cal(json_standard,json_test):
    """Compare test pipeline output with the standard and build a metrics row.

    Args:
        json_standard: records with 'id', 'mid_json' and 'pass_label' fields.
        json_test: records with 'id' and 'mid_json' fields.

    Returns:
        A one-row pandas DataFrame whose columns hold the individual reports
        (dataset-level scores, equation text/bbox metrics, reading-order edit
        distance, paragraph-count accuracy and dropped-block metrics).
    """
    json_standard = pd.DataFrame(json_standard)
    json_test = pd.DataFrame(json_test)

    # Rename before merging so each column is unambiguous; relying on the
    # positional order of the merge output previously swapped the two sides.
    test_docs = json_test[['id', 'mid_json']].rename(
        columns={'mid_json': 'test_mid_json'})
    standard_docs = json_standard[['id', 'mid_json', 'pass_label']].rename(
        columns={'mid_json': 'standard_mid_json'})

    # Dataset-level metrics: does each PDF exist on each side?
    outer_merge = pd.merge(test_docs, standard_docs, on='id', how='outer')
    standard_exist = outer_merge['standard_mid_json'].notna()
    test_exist = outer_merge['test_mid_json'].notna()
    overall_report = {}
    overall_report['accuracy'] = metrics.accuracy_score(standard_exist, test_exist)
    overall_report['precision'] = metrics.precision_score(standard_exist, test_exist)
    overall_report['recall'] = metrics.recall_score(standard_exist, test_exist)
    overall_report['f1_score'] = metrics.f1_score(standard_exist, test_exist)

    # Everything below is computed only on PDFs present on both sides.
    inner_merge = pd.merge(test_docs, standard_docs, on='id', how='inner')
    test = _collect_page_features(inner_merge['test_mid_json'])
    standard = _collect_page_features(inner_merge['standard_mid_json'])

    # It is advisable to confirm that the basic statistics (page and bbox
    # counts) agree between the two sides before trusting the metrics.

    # Whole-PDF edit distance and BLEU, restricted to PDFs labelled 'yes'.
    passed = (inner_merge['pass_label'] == 'yes').tolist()
    pdf_dis = []
    pdf_bleu = []
    for test_doc, standard_doc, keep in zip(
            test['para_text'], standard['para_text'], passed):
        if not keep:
            continue
        test_paras = [''.join(text) for text in test_doc]
        standard_paras = [''.join(text) for text in standard_doc]
        pdf_dis.append(Levenshtein_Distance(test_paras, standard_paras))
        pdf_bleu.append(sentence_bleu([standard_paras], test_paras))
    overall_report['pdf间的平均编辑距离'] = np.mean(pdf_dis)
    overall_report['pdf间的平均bleu'] = np.mean(pdf_bleu)

    # Inline equations: text metrics and bbox matching.
    inline_equations_edit, inline_equations_bleu = _equation_text_metrics(
        test['inline_equations'], standard['inline_equations'])
    inline_equations_bbox_report = bbox_match_indicator(
        test['inline_equations_bboxes'], standard['inline_equations_bboxes'])

    # Interline equations: text metrics and bbox matching.
    interline_equations_edit, interline_equations_bleu = _equation_text_metrics(
        test['interline_equations'], standard['interline_equations'])
    interline_equations_bbox_report = bbox_match_indicator(
        test['interline_equations_bboxes'], standard['interline_equations_bboxes'])

    # Dropped text blocks: bbox matching plus per-tag classification.
    test_text_bbox, standard_text_bbox, test_tag, standard_tag = (
        _match_dropped_text_blocks(
            test['dropped_text_bboxes'], test['dropped_text_tags'],
            standard['dropped_text_bboxes'], standard['dropped_text_tags']))
    text_block_report = {}
    text_block_report['accuracy'] = metrics.accuracy_score(standard_text_bbox, test_text_bbox)
    text_block_report['precision'] = metrics.precision_score(standard_text_bbox, test_text_bbox)
    text_block_report['recall'] = metrics.recall_score(standard_text_bbox, test_text_bbox)
    text_block_report['f1_score'] = metrics.f1_score(standard_text_bbox, test_text_bbox)

    text_block_tag_report = classification_report(
        y_true=standard_tag, y_pred=test_tag, output_dict=True)
    # Drop the placeholder class and the aggregate rows; pop() tolerates a
    # report in which one of them is absent.
    for unwanted in ('None', "macro avg", "weighted avg"):
        text_block_tag_report.pop(unwanted, None)

    # Dropped image / table blocks (the original notes that the data format
    # of the image blocks is not always consistent).
    image_block_report = bbox_match_indicator(
        test['dropped_image_bboxes'], standard['dropped_image_bboxes'])
    table_block_report = bbox_match_indicator(
        test['dropped_table_bboxes'], standard['dropped_table_bboxes'])

    # Mean edit distance of the per-page reading order.
    preproc_num_dis = [
        Levenshtein_Distance(test_order, standard_order)
        for test_order, standard_order in zip(
            test['preproc_num'], standard['preproc_num'])
    ]
    preproc_num_edit = np.mean(preproc_num_dis)

    # Paragraph segmentation accuracy: share of pages with equal counts.
    acc_para = np.mean(
        np.array(test['para_num']) == np.array(standard['para_num']))

    output = pd.DataFrame()
    output['总体指标'] = [overall_report]
    output['行内公式平均编辑距离'] = [inline_equations_edit]
    output['行间公式平均编辑距离'] = [interline_equations_edit]
    output['行内公式平均bleu'] = [inline_equations_bleu]
    output['行间公式平均bleu'] = [interline_equations_bleu]
    output['行内公式识别相关指标'] = [inline_equations_bbox_report]
    output['行间公式识别相关指标'] = [interline_equations_bbox_report]
    output['阅读顺序平均编辑距离'] = [preproc_num_edit]
    output['分段准确率'] = [acc_para]
    output['删除的text block的相关指标'] = [text_block_report]
    output['删除的image block的相关指标'] = [image_block_report]
    output['删除的table block的相关指标'] = [table_block_report]
    output['删除的text block的tag相关指标'] = [text_block_tag_report]
    return output
"""
计算编辑距离
"""
def Levenshtein_Distance(str1, str2):
    """Return the edit distance between two sequences.

    Works on any pair of sized, iterable sequences whose elements support
    ``==`` (strings, lists of strings, lists of numbers). Insertions,
    deletions and substitutions each cost 1.
    """
    # Only the previous row of the dynamic-programming table is needed to
    # build the next one.
    previous_row = list(range(len(str2) + 1))
    for row_number, left_item in enumerate(str1, start=1):
        current_row = [row_number]
        for col_number, right_item in enumerate(str2, start=1):
            substitution_cost = 0 if left_item == right_item else 1
            current_row.append(min(
                previous_row[col_number] + 1,
                current_row[col_number - 1] + 1,
                previous_row[col_number - 1] + substitution_cost,
            ))
        previous_row = current_row
    return previous_row[len(str2)]
'''
计算bbox偏移量是否符合标准的函数
'''
def bbox_offset(b_t,b_s):
    """Decide whether two bounding boxes are close enough to count as a match.

    Args:
        b_t: bbox as (x1, y1, x2, y2); nominally the test document's box.
        b_s: bbox as (x1, y1, x2, y2); nominally the standard document's box.
            The computation is symmetric, so the argument order does not
            affect the result.

    Returns:
        True when the boxes coincide exactly (no non-shared area), or when the
        overlap area divided by the non-shared area (union minus overlap)
        exceeds 0.95; False otherwise, including when the boxes do not
        intersect.
    """
    x1_t, y1_t, x2_t, y2_t = b_t
    x1_s, y1_s, x2_s, y2_s = b_s
    overlap_width = min(x2_t, x2_s) - max(x1_t, x1_s)
    overlap_height = min(y2_t, y2_s) - max(y1_t, y1_s)
    # A negative extent on either axis means the boxes are disjoint. Without
    # this guard two negative extents multiply into a positive "overlap",
    # which could make far-apart boxes register as a match.
    if overlap_width < 0 or overlap_height < 0:
        return False
    area_overlap = overlap_width * overlap_height
    area_union = (
        (x2_t - x1_t) * (y2_t - y1_t)
        + (x2_s - x1_s) * (y2_s - y1_s)
        - area_overlap
    )
    non_shared_area = area_union - area_overlap
    if non_shared_area == 0:
        # Identical boxes: nothing lies outside the overlap.
        return True
    return bool(area_overlap / non_shared_area > 0.95)
'''bbox匹配和对齐函数,输出相关指标'''
'''输入的是以page为单位的bbox列表'''
def bbox_match_indicator(test_bbox_list,standard_bbox_list):
    """Match bboxes page by page and score the test side against the standard.

    Args:
        test_bbox_list: list of pages, each a list of test bboxes.
        standard_bbox_list: list of pages, each a list of standard bboxes.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1_score', computed
        from 0/1 presence flags (standard as truth, test as prediction).
    """
    predicted_flags = []
    expected_flags = []
    for test_page, standard_page in zip(test_bbox_list, standard_bbox_list):
        # Pages with nothing on either side contribute no samples.
        if len(test_page) == 0 and len(standard_page) == 0:
            continue

        page_predicted = []
        page_expected = []
        for standard_box in standard_page:
            # Entries that are not 4-element boxes are ignored.
            if len(standard_box) != 4:
                continue
            page_expected.append(1)
            # any() stops at the first test box accepted by bbox_offset.
            found = any(
                bbox_offset(standard_box, test_box) for test_box in test_page)
            page_predicted.append(1 if found else 0)

        # Test boxes beyond the matched ones count as extra detections.
        surplus = len(test_page) + page_predicted.count(0) - len(standard_page)
        if surplus > 0:
            page_predicted.extend([1] * surplus)
            page_expected.extend([0] * surplus)

        predicted_flags.extend(page_predicted)
        expected_flags.extend(page_expected)

    return {
        'accuracy': metrics.accuracy_score(expected_flags, predicted_flags),
        'precision': metrics.precision_score(expected_flags, predicted_flags),
        'recall': metrics.recall_score(expected_flags, predicted_flags),
        'f1_score': metrics.f1_score(expected_flags, predicted_flags),
    }
# Command-line interface: --test and --standard each name a JSON-lines file
# (one JSON object per line).
parser = argparse.ArgumentParser()
parser.add_argument('--test', type=str)
parser.add_argument('--standard', type=str)
args = parser.parse_args()
pdf_json_test = args.test
pdf_json_standard = args.standard
if __name__ == '__main__':
    # Read both inputs inside context managers so the file handles are closed,
    # and skip blank lines so a trailing newline does not break json.loads.
    with open(pdf_json_test, 'r', encoding='utf-8') as test_file:
        pdf_json_test = [json.loads(line) for line in test_file if line.strip()]
    with open(pdf_json_standard, 'r', encoding='utf-8') as standard_file:
        pdf_json_standard = [json.loads(line) for line in standard_file if line.strip()]
    overall_indicator=indicator_cal(pdf_json_standard,pdf_json_test)
    # The computed metrics are written to overall_indicator_output.json.
    overall_indicator.to_json('overall_indicator_output.json',orient='records',lines=True,force_ascii=False)
\ No newline at end of file
{
"page_0":{
"para_blocks": [
{
"block_id": 0,
"bbox": [39.0, 34.719993591308594, 347.1359558105469, 51.2079963684082],
"text": "IOP Conference Series: Earth and Environmental Science",
"dir": [1.0, 0.0],
"X0": 39.0,
"X1": 347.1359558105469,
"avg_char_width": 6.4194990793863935,
"avg_char_height": 16.48800277709961,
"block_font_type": "Helvetica",
"block_font_size": 12.0,
"is_segmented": 1,
"paras": [
{
"para_id": 0,
"bbox": [39.0, 34.719993591308594, 347.1359558105469, 51.2079963684082],
"text": "IOP Conference Series: Earth and Environmental Science",
"is_matched": 1,
"is_title": 0,
"font_type": "Helvetica",
"font_size": 12.0,
"font_color": 0,
"neighbor_paras": [null, null]
}
],
"bboxes_para": [[39.0, 34.719993591308594, 347.1359558105469, 51.2079963684082]]
},
{
"block_id": 1,
"bbox": [39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625],
"text": "PAPER • OPEN ACCESS",
"dir": [1.0, 0.0],
"X0": 39.0,
"X1": 143.67001342773438,
"avg_char_width": 6.541875839233398,
"avg_char_height": 12.392997741699219,
"block_font_type": "Helvetica-Bold",
"block_font_size": 9.0,
"is_segmented": 1,
"paras": [
{
"para_id": 0,
"bbox": [39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625],
"text": "PAPER • OPEN ACCESS",
"is_matched": 1,
"is_title": 0,
"font_type": "Helvetica-Bold",
"font_size": 9.0,
"font_color": 0,
"neighbor_paras": [null, null]
},
{
"para_id": 1,
"bbox": [39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625],
"text": "PAPER • OPEN ACCESS",
"is_matched": 1,
"is_title": 0,
"font_type": "Helvetica-Bold",
"font_size": 9.0,
"font_color": 0,
"neighbor_paras": [null, null]
}
],
"bboxes_para": [[39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625]]
}
],
"preproc_blocks":[ //这里已经把重叠,页眉,页脚,垂直,旋转,水印,图片,表格删掉了
{
"number": 0,
"type": 0,
"bbox": [
428.93170166015625,
744.921142578125,
541.5675048828125,
757.8131713867188
],
"lines": [
{
"spans": [
{
"size": 11.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "3",
"origin": [
536.37548828125,
755.3601684570312
],
"bbox": [
536.37548828125,
744.921142578125,
541.5675048828125,
757.8131713867188
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
536.37548828125,
744.921142578125,
541.5675048828125,
757.8131713867188
]
},
{
"spans": [
{
"size": 8.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "Spektrum ",
"origin": [
428.93170166015625,
755.3601684570312
],
"bbox": [
428.93170166015625,
747.7681884765625,
458.7516174316406,
757.1441650390625
]
},
{
"size": 8.0,
"flags": 4,
"font": "UniversNextPro-Cond",
"color": 0,
"ascender": 0.9359999895095825,
"descender": -0.21400000154972076,
"text": "der Wissenschaft ",
"origin": [
458.431884765625,
755.3601684570312
],
"bbox": [
458.431884765625,
747.8721923828125,
508.0399169921875,
757.0721435546875
]
},
{
"size": 8.0,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "7.21",
"origin": [
510.2349853515625,
755.3601684570312
],
"bbox": [
510.2349853515625,
747.9281616210938,
524.5621948242188,
757.1361694335938
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
428.93170166015625,
747.7681884765625,
524.5621948242188,
757.1441650390625
]
}
]
}
],
"images":[
{
"bbox":[0,0,1,1],
"image_path":"path/to/image.jpg"
},
{
"bbox":[1,2,3,4],
"image_path":"path/to/image.jpg"
}
],
"tables":[
{
"bbox":[0,0,1,1],
"image_path":"path/to/image.jpg"
},
{
"bbox":[1,2,3,4],
"image_path":"path/to/image.jpg"
}
],
"interline_equations":[
{
"bbox":[0,0,1,1],
"image_path":"path/to/equation.jpg"
},
{
"bbox":[1,2,3,4],
"image_path":"path/to/equation.jpg"
}
],
"inline_equations":[
{
"bbox":[0,0,1,1],
"image_path":"path/to/equation.jpg"
},
{
"bbox":[1,2,3,4],
"image_path":"path/to/equation.jpg"
}
],
"layout_bboxes":[
{
"layout_bbox": [0,0, 1,1],
"layout_label":"V|H|B" //未处理|垂直|水平|BAD_LAYOUT
},
{
"layout_bbox": [1,2,3,4],
"layout_label":"V|H|B"
}
],
"pymu_raw_blocks":[], //未删减的pymupdf的block,含文字图片等
"global_statistic":{//全局性统计信息
},
"droped_text_block":[//被丢弃的文字
],
"droped_image_block":[
],
"droped_table_block":[
],
"image_backup":[//暂时不参与处理的图片,例如互相层叠的图片,先放这里,最后组合的时候放到页面开头段落之后。
],
"table_backup":[//同上
]
},
"page_1":{
}
}
\ No newline at end of file
[
{
"number": 0,
"type": 0,
"bbox": [
428.93170166015625,
744.921142578125,
541.5675048828125,
757.8131713867188
],
"lines": [
{
"spans": [
{
"size": 11.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "3",
"origin": [
536.37548828125,
755.3601684570312
],
"bbox": [
536.37548828125,
744.921142578125,
541.5675048828125,
757.8131713867188
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
536.37548828125,
744.921142578125,
541.5675048828125,
757.8131713867188
]
},
{
"spans": [
{
"size": 8.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "Spektrum ",
"origin": [
428.93170166015625,
755.3601684570312
],
"bbox": [
428.93170166015625,
747.7681884765625,
458.7516174316406,
757.1441650390625
]
},
{
"size": 8.0,
"flags": 4,
"font": "UniversNextPro-Cond",
"color": 0,
"ascender": 0.9359999895095825,
"descender": -0.21400000154972076,
"text": "der Wissenschaft ",
"origin": [
458.431884765625,
755.3601684570312
],
"bbox": [
458.431884765625,
747.8721923828125,
508.0399169921875,
757.0721435546875
]
},
{
"size": 8.0,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "7.21",
"origin": [
510.2349853515625,
755.3601684570312
],
"bbox": [
510.2349853515625,
747.9281616210938,
524.5621948242188,
757.1361694335938
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
428.93170166015625,
747.7681884765625,
524.5621948242188,
757.1441650390625
]
}
]
},
{
"number": 1,
"type": 0,
"bbox": [
41.19110107421875,
182.531494140625,
67.2332992553711,
208.57369995117188
],
"lines": [
{
"spans": [
{
"size": 26.042200088500977,
"flags": 0,
"font": "Webdings",
"color": 0,
"ascender": 0.7998046875,
"descender": -0.2001953125,
"text": "\uf034",
"origin": [
41.19110107421875,
203.36016845703125
],
"bbox": [
41.19110107421875,
182.531494140625,
67.2332992553711,
208.57369995117188
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
41.19110107421875,
182.531494140625,
67.2332992553711,
208.57369995117188
]
}
]
},
{
"number": 2,
"type": 0,
"bbox": [
62.02479934692383,
183.73141479492188,
355.3616027832031,
205.3001708984375
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "W\u00e4hrend ich diese Zeilen schreibe, ist vom vermeintlichen Wonnemonat ",
"origin": [
62.02479934692383,
191.86016845703125
],
"bbox": [
62.02479934692383,
183.73141479492188,
354.2161865234375,
193.80267333984375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
62.02479934692383,
183.73141479492188,
354.2161865234375,
193.80267333984375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Mai wenig zu sp\u00fcren. Das Wetter erinnert eher an den Herbst. Allerdings ",
"origin": [
62.02479934692383,
203.357666015625
],
"bbox": [
62.02479934692383,
195.22891235351562,
355.3616027832031,
205.3001708984375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
62.02479934692383,
195.22891235351562,
355.3616027832031,
205.3001708984375
]
}
]
},
{
"number": 3,
"type": 0,
"bbox": [
50.956050872802734,
206.72640991210938,
354.5810852050781,
251.2901611328125
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "fehlt ein f\u00fcr mich wichtiger Bestandteil dieser Jahreszeit: die im Wald sprie-",
"origin": [
50.956050872802734,
214.85516357421875
],
"bbox": [
50.956050872802734,
206.72640991210938,
351.9472961425781,
216.79766845703125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.956050872802734,
206.72640991210938,
351.9472961425781,
216.79766845703125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "\u00dfenden Speisepilze wie Steinpilz oder Pfifferling. Sie bereichern unseren ",
"origin": [
50.9560546875,
226.3526611328125
],
"bbox": [
50.9560546875,
218.22390747070312,
342.05816650390625,
228.295166015625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
218.22390747070312,
342.05816650390625,
228.295166015625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Essensplan ab August oder September, etwa als Teil von So\u00dfen oder Nudel-",
"origin": [
50.9560546875,
237.85015869140625
],
"bbox": [
50.9560546875,
229.72140502929688,
354.5810852050781,
239.79266357421875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
229.72140502929688,
354.5810852050781,
239.79266357421875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "gerichten. ",
"origin": [
50.9560546875,
249.34765625
],
"bbox": [
50.9560546875,
241.21890258789062,
93.7059326171875,
251.2901611328125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
241.21890258789062,
93.7059326171875,
251.2901611328125
]
}
]
},
{
"number": 4,
"type": 0,
"bbox": [
62.29605484008789,
252.71640014648438,
359.1162414550781,
262.78765869140625
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Da ich kein Pilzexperte bin, vermeide ich jedoch Experimente. In den Korb ",
"origin": [
62.29605484008789,
260.84515380859375
],
"bbox": [
62.29605484008789,
252.71640014648438,
359.1162414550781,
262.78765869140625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
62.29605484008789,
252.71640014648438,
359.1162414550781,
262.78765869140625
]
}
]
},
{
"number": 5,
"type": 0,
"bbox": [
50.9560546875,
264.2138977050781,
363.3572082519531,
320.275146484375
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "kommen nur sehr wenige Arten, die ich wirklich kenne. Alle anderen lasse ich ",
"origin": [
50.9560546875,
272.3426513671875
],
"bbox": [
50.9560546875,
264.2138977050781,
363.3572082519531,
274.28515625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
264.2138977050781,
363.3572082519531,
274.28515625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "gerne stehen und erfreue mich allenfalls an Form (wie bei der Stinkmorchel) ",
"origin": [
50.9560546875,
283.84014892578125
],
"bbox": [
50.9560546875,
275.7113952636719,
356.4150390625,
285.78265380859375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
275.7113952636719,
356.4150390625,
285.78265380859375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "oder Farbe (etwa bei den Fliegenpilzen). \u00dcberhaupt bilden Speisepilze nur ",
"origin": [
50.9560546875,
295.337646484375
],
"bbox": [
50.9560546875,
287.2088928222656,
345.9955749511719,
297.2801513671875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
287.2088928222656,
345.9955749511719,
297.2801513671875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "einen sehr kleinen Teil der Vielfalt in diesem dritten Reich der eukaryotischen ",
"origin": [
50.9560546875,
306.83514404296875
],
"bbox": [
50.9560546875,
298.7063903808594,
360.0507507324219,
308.77764892578125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
298.7063903808594,
360.0507507324219,
308.77764892578125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Lebewesen. ",
"origin": [
50.9560546875,
318.3326416015625
],
"bbox": [
50.9560546875,
310.2038879394531,
101.7909164428711,
320.275146484375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
310.2038879394531,
101.7909164428711,
320.275146484375
]
}
]
},
{
"number": 6,
"type": 0,
"bbox": [
62.29605484008789,
321.7013854980469,
355.1341247558594,
331.77264404296875
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Un\u00fcberschaubar gro\u00df ist beispielsweise die Abteilung der Schlauchpilze, ",
"origin": [
62.29605484008789,
329.83013916015625
],
"bbox": [
62.29605484008789,
321.7013854980469,
355.1341247558594,
331.77264404296875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
62.29605484008789,
321.7013854980469,
355.1341247558594,
331.77264404296875
]
}
]
},
{
"number": 7,
"type": 0,
"bbox": [
50.9560546875,
333.1988830566406,
364.4510498046875,
469.74261474609375
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "zu der unter anderem viele Hefe- und Schimmelpilze geh\u00f6ren, welche ",
"origin": [
50.9560546875,
341.32763671875
],
"bbox": [
50.9560546875,
333.1988830566406,
331.2386779785156,
343.2701416015625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
333.1988830566406,
331.2386779785156,
343.2701416015625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Mensch, Tier oder Pflanze schaden k\u00f6nnen. Der dunklen Seite der Pilze ",
"origin": [
50.9560546875,
352.82513427734375
],
"bbox": [
50.9560546875,
344.6963806152344,
337.2332763671875,
354.76763916015625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
344.6963806152344,
337.2332763671875,
354.76763916015625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "widmen wir eine dreiteilige Serie, die mit dieser Ausgabe beginnt. Pilze k\u00f6n-",
"origin": [
50.9560546875,
364.3226318359375
],
"bbox": [
50.9560546875,
356.1938781738281,
353.47857666015625,
366.26513671875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
356.1938781738281,
353.47857666015625,
366.26513671875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "nen gef\u00e4hrliche Infektionskrankheiten bei Menschen ausl\u00f6sen und sich dabei ",
"origin": [
50.9560546875,
375.82012939453125
],
"bbox": [
50.9560546875,
367.6913757324219,
361.5155334472656,
377.76263427734375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
367.6913757324219,
361.5155334472656,
377.76263427734375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "in Krankenh\u00e4usern ausbreiten. Andere werden durch uns und den Welthandel ",
"origin": [
50.9560546875,
387.317626953125
],
"bbox": [
50.9560546875,
379.1888732910156,
364.4510498046875,
389.2601318359375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
379.1888732910156,
364.4510498046875,
389.2601318359375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "weltweit verschleppt und gef\u00e4hrden Tier- und Pflanzenarten in den betroffe-",
"origin": [
50.9560546875,
398.81512451171875
],
"bbox": [
50.9560546875,
390.6863708496094,
354.6072998046875,
400.75762939453125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
390.6863708496094,
354.6072998046875,
400.75762939453125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "nen Gebieten, etwa Amphibien in S\u00fcdamerika, Flederm\u00e4use in Nordamerika ",
"origin": [
50.9560546875,
410.3126220703125
],
"bbox": [
50.9560546875,
402.1838684082031,
357.4117126464844,
412.255126953125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
402.1838684082031,
357.4117126464844,
412.255126953125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "oder Eschen in Europa. Und eine dritte Gruppe zerst\u00f6rt ganze Ernten und ",
"origin": [
50.9560546875,
421.81011962890625
],
"bbox": [
50.9560546875,
413.6813659667969,
344.6148376464844,
423.75262451171875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
413.6813659667969,
344.6148376464844,
423.75262451171875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "bedroht dadurch Millionen Menschen mit Hunger. Mit diesen Sch\u00e4dlingen ",
"origin": [
50.9560546875,
433.3076171875
],
"bbox": [
50.9560546875,
425.1788635253906,
349.48773193359375,
435.2501220703125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
425.1788635253906,
349.48773193359375,
435.2501220703125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "beginnt auch unsere Serie. Thomas Miedaner von der Universit\u00e4t Hohenheim ",
"origin": [
50.9560546875,
444.80511474609375
],
"bbox": [
50.9560546875,
436.6763610839844,
361.98358154296875,
446.74761962890625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
436.6763610839844,
361.98358154296875,
446.74761962890625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "erkl\u00e4rt ab S. 38, wie die Pilze Pflanzen befallen und wie wir sie bek\u00e4mpfen ",
"origin": [
50.9560546875,
456.3026123046875
],
"bbox": [
50.9560546875,
448.1738586425781,
348.68878173828125,
458.2451171875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
448.1738586425781,
348.68878173828125,
458.2451171875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "k\u00f6nnen, um Ernten zu sichern. ",
"origin": [
50.9560546875,
467.80010986328125
],
"bbox": [
50.9560546875,
459.6713562011719,
174.7528076171875,
469.74261474609375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
459.6713562011719,
174.7528076171875,
469.74261474609375
]
}
]
},
{
"number": 8,
"type": 0,
"bbox": [
62.29605484008789,
471.1688537597656,
361.9923095703125,
481.2401123046875
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Ebenfalls in diesem Heft starten wir noch eine zweite Serie, die sich einem ",
"origin": [
62.29605484008789,
479.297607421875
],
"bbox": [
62.29605484008789,
471.1688537597656,
361.9923095703125,
481.2401123046875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
62.29605484008789,
471.1688537597656,
361.9923095703125,
481.2401123046875
]
}
]
},
{
"number": 9,
"type": 0,
"bbox": [
50.9560546875,
482.6663513183594,
363.50958251953125,
561.7225952148438
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "besonderen Kosmos widmet: unserem Bewusstsein. Seit Jahrhunderten ver- ",
"origin": [
50.9560546875,
490.79510498046875
],
"bbox": [
50.9560546875,
482.6663513183594,
359.8135681152344,
492.73760986328125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
482.6663513183594,
359.8135681152344,
492.73760986328125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "suchen wir es zu ergr\u00fcnden, und dennoch sind viele Fragen offengeblieben. ",
"origin": [
50.9560546875,
502.2926025390625
],
"bbox": [
50.9560546875,
494.1638488769531,
356.0458984375,
504.235107421875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
494.1638488769531,
356.0458984375,
504.235107421875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Nun sollen k\u00fcnstliche neuronale Netze \u2013 der \u00bbGeist in der Maschine\u00ab bildet ab ",
"origin": [
50.9560546875,
513.7901000976562
],
"bbox": [
50.9560546875,
505.6613464355469,
363.50958251953125,
515.7326049804688
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
505.6613464355469,
363.50958251953125,
515.7326049804688
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "S. 12 den Auftakt der dreiteiligen Serie \u2013 und die Mathematik uns bei der",
"origin": [
50.9560546875,
525.28759765625
],
"bbox": [
50.9560546875,
517.1588745117188,
339.90899658203125,
527.2301025390625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
517.1588745117188,
339.90899658203125,
527.2301025390625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Aufkl\u00e4rung unseres Geistes helfen. Und auch Experimente spielen weiterhin",
"origin": [
50.9560546875,
536.7850952148438
],
"bbox": [
50.9560546875,
528.6563720703125,
354.7044677734375,
538.7276000976562
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
528.6563720703125,
354.7044677734375,
538.7276000976562
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "eine Rolle. Eines davon will zwei der prominentesten Modelle auf den Pr\u00fcf-",
"origin": [
50.9560546875,
548.2825927734375
],
"bbox": [
50.9560546875,
540.1538696289062,
349.18231201171875,
550.22509765625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
540.1538696289062,
349.18231201171875,
550.22509765625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "stand stellen \u2013 und diese entweder best\u00e4tigen oder widerlegen.",
"origin": [
50.9560546875,
559.7800903320312
],
"bbox": [
50.9560546875,
551.6513671875,
304.5346374511719,
561.7225952148438
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
551.6513671875,
304.5346374511719,
561.7225952148438
]
}
]
},
{
"number": 10,
"type": 0,
"bbox": [
50.9560546875,
574.6550903320312,
96.47882080078125,
584.726318359375
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Herzlich Ihr",
"origin": [
50.9560546875,
582.7838134765625
],
"bbox": [
50.9560546875,
574.6550903320312,
96.47882080078125,
584.726318359375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
50.9560546875,
574.6550903320312,
96.47882080078125,
584.726318359375
]
}
]
},
{
"number": 11,
"type": 0,
"bbox": [
411.0242004394531,
51.734458923339844,
501.3325500488281,
65.42646026611328
],
"lines": [
{
"spans": [
{
"size": 12.0,
"flags": 4,
"font": "UniversNextPro-LightCond",
"color": 0,
"ascender": 0.9269999861717224,
"descender": -0.21400000154972076,
"text": "IN DIESER AUSGABE",
"origin": [
411.0242004394531,
62.85845947265625
],
"bbox": [
411.0242004394531,
51.734458923339844,
501.3325500488281,
65.42646026611328
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0242004394531,
51.734458923339844,
501.3325500488281,
65.42646026611328
]
}
]
},
{
"number": 12,
"type": 0,
"bbox": [
140.31619262695312,
672.9744873046875,
215.28738403320312,
687.0385131835938
],
"lines": [
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "NEU AM KIOSK!",
"origin": [
140.31619262695312,
684.3624877929688
],
"bbox": [
140.31619262695312,
672.9744873046875,
215.28738403320312,
687.0385131835938
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
140.31619262695312,
672.9744873046875,
215.28738403320312,
687.0385131835938
]
}
]
},
{
"number": 13,
"type": 0,
"bbox": [
140.3148956298828,
689.556396484375,
358.1094970703125,
734.295166015625
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "Spektrum",
"origin": [
140.3148956298828,
697.8601684570312
],
"bbox": [
140.3148956298828,
689.556396484375,
171.02561950683594,
699.8114013671875
]
},
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": " ",
"origin": [
170.66864013671875,
697.8601684570312
],
"bbox": [
170.66864013671875,
689.7314453125,
173.20614624023438,
699.8026733398438
]
},
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Cond",
"color": 0,
"ascender": 0.9359999895095825,
"descender": -0.21400000154972076,
"text": "SPEZIAL ",
"origin": [
173.07489013671875,
697.8601684570312
],
"bbox": [
173.07489013671875,
689.670166015625,
202.37429809570312,
699.732666015625
]
},
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Physik \u2013 Mathematik \u2013 Technik 2.21 ",
"origin": [
202.2473907470703,
697.8601684570312
],
"bbox": [
202.2473907470703,
689.7314453125,
346.1961975097656,
699.8026733398438
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
140.3148956298828,
689.556396484375,
346.1961975097656,
699.8114013671875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "zeigt, wie sich die komplexen Systeme unserer Um-",
"origin": [
141.31239318847656,
709.357666015625
],
"bbox": [
141.31239318847656,
701.2289428710938,
347.72491455078125,
711.3001708984375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
141.31239318847656,
701.2289428710938,
347.72491455078125,
711.3001708984375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "welt in Computermodellen nachstellen lassen und ",
"origin": [
141.31239318847656,
720.8551635742188
],
"bbox": [
141.31239318847656,
712.7264404296875,
343.4996337890625,
722.7976684570312
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
141.31239318847656,
712.7264404296875,
343.4996337890625,
722.7976684570312
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "damit etwa zuverl\u00e4ssigere Wetterprognosen erlauben.",
"origin": [
141.31239318847656,
732.3526611328125
],
"bbox": [
141.31239318847656,
724.2239379882812,
358.1094970703125,
734.295166015625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
141.31239318847656,
724.2239379882812,
358.1094970703125,
734.295166015625
]
}
]
},
{
"number": 14,
"type": 0,
"bbox": [
411.0046081542969,
224.47216796875,
499.610107421875,
250.03616333007812
],
"lines": [
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "PATRICK KRAUSS, ",
"origin": [
411.0046081542969,
235.86016845703125
],
"bbox": [
411.0046081542969,
224.47216796875,
499.610107421875,
238.53616333007812
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
224.47216796875,
499.610107421875,
238.53616333007812
]
},
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "ANDREAS MAIER ",
"origin": [
411.0046081542969,
247.36016845703125
],
"bbox": [
411.0046081542969,
235.97216796875,
495.9274597167969,
250.03616333007812
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
235.97216796875,
495.9274597167969,
250.03616333007812
]
}
]
},
{
"number": 15,
"type": 0,
"bbox": [
411.00457763671875,
252.73141479492188,
545.2821044921875,
308.79266357421875
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Ob auch Maschinen ein Bewusst-",
"origin": [
411.0046081542969,
260.86016845703125
],
"bbox": [
411.0046081542969,
252.73141479492188,
545.2821044921875,
262.80267333984375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
252.73141479492188,
545.2821044921875,
262.80267333984375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "sein entwickeln k\u00f6nnen, ergr\u00fcn-",
"origin": [
411.00457763671875,
272.357666015625
],
"bbox": [
411.00457763671875,
264.2289123535156,
539.4808349609375,
274.3001708984375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.00457763671875,
264.2289123535156,
539.4808349609375,
274.3001708984375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "den der Neurowissenschaftler ",
"origin": [
411.00457763671875,
283.85516357421875
],
"bbox": [
411.00457763671875,
275.7264099121094,
533.7821655273438,
285.79766845703125
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.00457763671875,
275.7264099121094,
533.7821655273438,
285.79766845703125
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Patrick Krau\u00df und der Informati-",
"origin": [
411.00457763671875,
295.3526611328125
],
"bbox": [
411.00457763671875,
287.2239074707031,
539.7171020507812,
297.295166015625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.00457763671875,
287.2239074707031,
539.7171020507812,
297.295166015625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "ker Andreas Maier ab S. 12.",
"origin": [
411.00457763671875,
306.85015869140625
],
"bbox": [
411.00457763671875,
298.7214050292969,
520.12744140625,
308.79266357421875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.00457763671875,
298.7214050292969,
520.12744140625,
308.79266357421875
]
}
]
},
{
"number": 16,
"type": 0,
"bbox": [
411.0046081542969,
592.47216796875,
524.23046875,
618.0322265625
],
"lines": [
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "ANNE-C\u00c9CILE ORGERIE, ",
"origin": [
411.0046081542969,
603.8601684570312
],
"bbox": [
411.0046081542969,
592.47216796875,
524.23046875,
606.5361938476562
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
592.47216796875,
524.23046875,
606.5361938476562
]
},
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "LAURENT LEF\u00c8VRE",
"origin": [
411.0046081542969,
615.356201171875
],
"bbox": [
411.0046081542969,
603.9682006835938,
500.13458251953125,
618.0322265625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
603.9682006835938,
500.13458251953125,
618.0322265625
]
}
]
},
{
"number": 17,
"type": 0,
"bbox": [
411.0046081542969,
620.7314453125,
545.0808715820312,
676.7926635742188
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Die beiden franz\u00f6sischen For-",
"origin": [
411.0046081542969,
628.8601684570312
],
"bbox": [
411.0046081542969,
620.7314453125,
529.3833618164062,
630.8026733398438
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
620.7314453125,
529.3833618164062,
630.8026733398438
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "scher widmen sich ab S. 76 der ",
"origin": [
411.0046081542969,
640.357666015625
],
"bbox": [
411.0046081542969,
632.2289428710938,
538.6320190429688,
642.3001708984375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
632.2289428710938,
538.6320190429688,
642.3001708984375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Frage, welche \u00f6kologischen ",
"origin": [
411.0046081542969,
651.8551635742188
],
"bbox": [
411.0046081542969,
643.7264404296875,
525.0968017578125,
653.7976684570312
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
643.7264404296875,
525.0968017578125,
653.7976684570312
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Folgen die fortschreitende Digita-",
"origin": [
411.0046081542969,
663.3526611328125
],
"bbox": [
411.0046081542969,
655.2239379882812,
545.0808715820312,
665.295166015625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
655.2239379882812,
545.0808715820312,
665.295166015625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "lisierung mit sich bringt.",
"origin": [
411.0046081542969,
674.8501586914062
],
"bbox": [
411.0046081542969,
666.721435546875,
507.412109375,
676.7926635742188
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
666.721435546875,
507.412109375,
676.7926635742188
]
}
]
},
{
"number": 18,
"type": 0,
"bbox": [
411.0046081542969,
408.4721984863281,
503.6938171386719,
434.03619384765625
],
"lines": [
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "BERNHARD WOOD, ",
"origin": [
411.0046081542969,
419.8601989746094
],
"bbox": [
411.0046081542969,
408.4721984863281,
503.6938171386719,
422.53619384765625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
408.4721984863281,
503.6938171386719,
422.53619384765625
]
},
{
"spans": [
{
"size": 12.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "ALEXIS WILLIAMS",
"origin": [
411.0046081542969,
431.3601989746094
],
"bbox": [
411.0046081542969,
419.9721984863281,
498.00225830078125,
434.03619384765625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0046081542969,
419.9721984863281,
498.00225830078125,
434.03619384765625
]
}
]
},
{
"number": 19,
"type": 0,
"bbox": [
411.0049743652344,
436.7312316894531,
546.5139770507812,
492.5574951171875
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Der Pal\u00e4oanthropologe und seine ",
"origin": [
411.0050048828125,
444.8599853515625
],
"bbox": [
411.0050048828125,
436.7312316894531,
546.5139770507812,
446.802490234375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0050048828125,
436.7312316894531,
546.5139770507812,
446.802490234375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Doktorandin rekonstruieren das ",
"origin": [
411.0050048828125,
456.3599853515625
],
"bbox": [
411.0050048828125,
448.2312316894531,
539.9134521484375,
458.302490234375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0050048828125,
448.2312316894531,
539.9134521484375,
458.302490234375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "einstige Leben des r\u00e4tselhaften ",
"origin": [
411.0049743652344,
467.8599853515625
],
"bbox": [
411.0049743652344,
459.7312316894531,
538.4785766601562,
469.802490234375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0049743652344,
459.7312316894531,
538.4785766601562,
469.802490234375
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "\u00bbNussknackermenschen\u00ab ",
"origin": [
411.0570068359375,
479.114990234375
],
"bbox": [
411.0570068359375,
470.9862365722656,
515.9761352539062,
481.0574951171875
]
},
{
"size": 8.75,
"flags": 6,
"font": "UniversNextPro-Italic",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.21400000154972076,
"text": "Paran-",
"origin": [
515.7671508789062,
479.114990234375
],
"bbox": [
515.7671508789062,
470.8112487792969,
542.5921630859375,
480.98748779296875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0570068359375,
470.8112487792969,
542.5921630859375,
481.0574951171875
]
},
{
"spans": [
{
"size": 8.75,
"flags": 6,
"font": "UniversNextPro-Italic",
"color": 0,
"ascender": 0.9490000009536743,
"descender": -0.21400000154972076,
"text": "thropus boisei",
"origin": [
411.0050048828125,
490.614990234375
],
"bbox": [
411.0050048828125,
482.3112487792969,
468.2322998046875,
492.48748779296875
]
},
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": " (S. 30).",
"origin": [
468.3193054199219,
490.614990234375
],
"bbox": [
468.3193054199219,
482.4862365722656,
497.2737731933594,
492.5574951171875
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
411.0050048828125,
482.3112487792969,
497.2737731933594,
492.5574951171875
]
}
]
},
{
"number": 20,
"type": 0,
"bbox": [
137.12600708007812,
48.90245819091797,
338.1016540527344,
135.09446716308594
],
"lines": [
{
"spans": [
{
"size": 28.0,
"flags": 4,
"font": "UniversNextPro-LightCond",
"color": 0,
"ascender": 0.9269999861717224,
"descender": -0.21400000154972076,
"text": "EDITORIAL",
"origin": [
137.12600708007812,
74.85845947265625
],
"bbox": [
137.12600708007812,
48.90245819091797,
240.1044158935547,
80.85045623779297
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
137.12600708007812,
48.90245819091797,
240.1044158935547,
80.85045623779297
]
},
{
"spans": [
{
"size": 28.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "DIE DUNKLE SEITE ",
"origin": [
137.12600708007812,
101.85845947265625
],
"bbox": [
137.12600708007812,
75.28646087646484,
338.1016540527344,
108.10246276855469
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
137.12600708007812,
75.28646087646484,
338.1016540527344,
108.10246276855469
]
},
{
"spans": [
{
"size": 28.0,
"flags": 20,
"font": "UniversNextPro-BoldCond",
"color": 18293,
"ascender": 0.9490000009536743,
"descender": -0.22300000488758087,
"text": "DER PILZE",
"origin": [
137.12600708007812,
128.8504638671875
],
"bbox": [
137.12600708007812,
102.2784652709961,
246.24761962890625,
135.09446716308594
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
137.12600708007812,
102.2784652709961,
246.24761962890625,
135.09446716308594
]
}
]
},
{
"number": 21,
"type": 0,
"bbox": [
137.12600708007812,
137.89901733398438,
273.66534423828125,
159.4677734375
],
"lines": [
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 0,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "Daniel Lingenh\u00f6hl, Chefredakteur ",
"origin": [
137.12600708007812,
146.02777099609375
],
"bbox": [
137.12600708007812,
137.89901733398438,
273.66534423828125,
147.97027587890625
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
137.12600708007812,
137.89901733398438,
273.66534423828125,
147.97027587890625
]
},
{
"spans": [
{
"size": 8.75,
"flags": 4,
"font": "UniversNextPro-Regular",
"color": 9152454,
"ascender": 0.9290000200271606,
"descender": -0.22200000286102295,
"text": "lingenhoehl@spektrum.de",
"origin": [
137.12600708007812,
157.5252685546875
],
"bbox": [
137.12600708007812,
149.39651489257812,
242.86016845703125,
159.4677734375
]
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": [
137.12600708007812,
149.39651489257812,
242.86016845703125,
159.4677734375
]
}
]
}
]
\ No newline at end of file
# Manual demo invocations, one call per sample document. Most calls are commented out;
# only the uncommented ones run. The short comments between groups presumably label the
# problem observed on the documents listed below them -- TODO confirm with the author.
# "scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622" new special scanned edition; the detection rules need updating
# demo_meta_scan(book_name="zlib/zlib_21814955")
# demo_meta_scan(book_name="zlib/zlib_22115997") # meta_scan
# demo_classify_by_type("scihub/scihub_04600000/libgen.scimag04690000-04690999.zip_10.1016/s0378-4347(98)00269-2") # classify
# demo_parse_pdf("scihub/scihub_28400000/libgen.scimag28413000-28413999.zip_10.2307/1316224")
# demo_parse_pdf(book_name="scihub/scihub_65300000/libgen.scimag65336000-65336999.zip_10.1021/acs.jcim.7b00151") # parse_pdf
# demo_parse_pdf(book_name="scihub/scihub_76500000/libgen.scimag76506000-76506999.zip_10.1016/j.nanoen.2019.103943") # parse_pdf
# demo_parse_pdf(book_name="zlib/zlib_22115997") # parse_pdf
# demo_parse_pdf(book_name="zlib/zlib_21814957") # parse_pdf
# demo_parse_pdf(book_name="zlib/zlib_21929367",start_page_id=48) # parse_pdf
# demo_save_tables(book_name="scihub/scihub_17200000/libgen.scimag17236000-17236999.zip_10.1016/s0002-9440(10)65013-4") # parse_pdf
# demo_parse_pdf(book_name="scihub/scihub_50200000/libgen.scimag50226000-50226999.zip_10.0000/cyberleninka.ru/article/n/chislennoe-modelirovanie-udarnogo-vozdeystviya-vysokoskorostnoy-strui-na-tverdoe-telo")
# infinite loop that exhausts memory
# demo_parse_pdf(book_name="scihub/scihub_10400000/libgen.scimag10461000-10461999.zip_10.1038/4671055a")
# demo_parse_pdf(book_name="scihub/scihub_36400000/libgen.scimag36418000-36418999.zip_10.1038/scientificamerican0115-82b")
# 10000-page PDF; the intermediate-state dict becomes too large
# demo_parse_pdf(book_name="zlib/zlib_17498382")
# demo_parse_pdf(book_name="zlib/zlib_22006221")
# footnote
# demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09782000-09782999.zip_10.1111/j.1540-627x.2006.00176.x")
# demo_parse_pdf(book_name="scihub/scihub_17400000/libgen.scimag17488000-17488999.zip_10.1016/s0043-1354(02)00581-x")
# demo_parse_pdf(book_name="scihub/scihub_17000000/libgen.scimag17000000-17000999.zip_10.1016/j.pain.2004.06.005")
# demo_parse_pdf(book_name="scihub/scihub_46700000/libgen.scimag46727000-46727999.zip_10.2174/157341210791202627")
# demo_parse_pdf(book_name="scihub/scihub_68900000/libgen.scimag68948000-68948999.zip_10.1002/uog.18760")
# demo_parse_pdf(book_name="scihub/scihub_64600000/libgen.scimag64628000-64628999.zip_10.3892/mmr.2017.6343")
# demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")
# demo_parse_pdf(book_name="scihub/scihub_30400000/libgen.scimag30438000-30438999.zip_10.1021/ja048851k")
# demo_parse_pdf(book_name="scihub/scihub_43600000/libgen.scimag43628000-43628999.zip_10.1093/toxsci/kfi111")
# demo_parse_pdf(book_name="scihub/scihub_14000000/libgen.scimag14081000-14081999.zip_10.1016/s0923-4748(00)00034-5")
# demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")
demo_parse_pdf(book_name="scihub/scihub_43200000/libgen.scimag43208000-43208999.zip_10.1645/0022-3395(2000)086[0275:podsri]2.0.co;2")
# demo_parse_pdf(book_name="scihub/scihub_55700000/libgen.scimag55717000-55717999.zip_10.1007/s10067-016-3303-0")
# missing content
# demo_parse_pdf(book_name="scihub/scihub_76900000/libgen.scimag76990000-76990999.zip_10.1145/3314111.3319829")
# demo_parse_pdf(book_name="scihub/scihub_84600000/libgen.scimag84652000-84652999.zip_10.1163/1876312x-00001010")
# block lines out of order
# demo_parse_pdf(book_name="scihub/scihub_04800000/libgen.scimag04807000-04807999.zip_10.1016/s0927-7765(97)00029-5")
# demo_parse_pdf(book_name="scihub/scihub_37000000/libgen.scimag37007000-37007999.zip_10.0000/docserver.ingentaconnect.com/generic-24bf2a7237e7")
# demo_parse_pdf(book_name="scihub/scihub_44700000/libgen.scimag44733000-44733999.zip_10.1063/1.3631048")
# demo_parse_pdf(book_name="scihub/scihub_21100000/libgen.scimag21124000-21124999.zip_10.1080/10447318.2002.9669130")
# demo_parse_pdf(book_name="scihub/scihub_80100000/libgen.scimag80185000-80185999.zip_10.1353/sib.0.0003")
# footnote
demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09782000-09782999.zip_10.1111/j.1540-627x.2006.00176.x")
demo_parse_pdf(book_name="scihub/scihub_17400000/libgen.scimag17488000-17488999.zip_10.1016/s0043-1354(02)00581-x")
demo_parse_pdf(book_name="scihub/scihub_17000000/libgen.scimag17000000-17000999.zip_10.1016/j.pain.2004.06.005")
demo_parse_pdf(book_name="scihub/scihub_46700000/libgen.scimag46727000-46727999.zip_10.2174/157341210791202627")
demo_parse_pdf(book_name="scihub/scihub_64600000/libgen.scimag64628000-64628999.zip_10.3892/mmr.2017.6343")
demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")
demo_parse_pdf(book_name="scihub/scihub_43600000/libgen.scimag43628000-43628999.zip_10.1093/toxsci/kfi111")
demo_parse_pdf(book_name="scihub/scihub_14000000/libgen.scimag14081000-14081999.zip_10.1016/s0923-4748(00)00034-5")
demo_parse_pdf(book_name="scihub/scihub_55700000/libgen.scimag55717000-55717999.zip_10.1007/s10067-016-3303-0")
demo_parse_pdf(book_name="scihub/scihub_86500000/libgen.scimag86560000-86560999.zip_10.1007/s10995-021-03207-2")
demo_parse_pdf(book_name="scihub/scihub_42100000/libgen.scimag42162000-42162999.zip_10.1093/notesj/gjm116")
demo_parse_pdf(book_name="scihub/scihub_07500000/libgen.scimag07500000-07500999.zip_10.1007/s00412-005-0007-7")
demo_parse_pdf(book_name="scihub/scihub_50800000/libgen.scimag50827000-50827999.zip_10.1210/jc.2015-4251")
demo_parse_pdf(book_name="scihub/scihub_07500000/libgen.scimag07537000-07537999.zip_10.1007/s004320050323")
demo_parse_pdf(book_name="scihub/scihub_24600000/libgen.scimag24665000-24665999.zip_10.1016/S0387-7604(89)80007-5")
demo_parse_pdf(book_name="scihub/scihub_76200000/libgen.scimag76297000-76297999.zip_10.4018/jehmc.2011040101")
demo_parse_pdf(book_name="scihub/scihub_29400000/libgen.scimag29456000-29456999.zip_10.1177/0883911505049656")
demo_parse_pdf(book_name="scihub/scihub_30200000/libgen.scimag30263000-30263999.zip_10.1081/scc-200036639")
demo_parse_pdf(book_name="scihub/scihub_71200000/libgen.scimag71224000-71224999.zip_10.1038/s41396-018-0231-9")
demo_parse_pdf(book_name="scihub/scihub_30100000/libgen.scimag30175000-30175999.zip_10.1300/j035v17n04_03")
demo_parse_pdf(book_name="scihub/scihub_18900000/libgen.scimag18981000-18981999.zip_10.1016/j.neuroimage.2006.06.030")
demo_parse_pdf(book_name="scihub/scihub_53100000/libgen.scimag53119000-53119999.zip_10.1097/01.npt.0000282350.63993.7a")
demo_parse_pdf(book_name="scihub/scihub_10000000/libgen.scimag10071000-10071999.zip_10.1111/j.1750-8606.2011.00190.x")
demo_parse_pdf(book_name="scihub/scihub_64000000/libgen.scimag64030000-64030999.zip_10.1080/1612197X.2017.1292302")
demo_parse_pdf(book_name="scihub/scihub_86000000/libgen.scimag86085000-86085999.zip_10.1016/j.enzmictec.2020.109742")
demo_parse_pdf(book_name="scihub/scihub_51000000/libgen.scimag51049000-51049999.zip_10.1117/12.2227997")
demo_parse_pdf(book_name="scihub/scihub_43700000/libgen.scimag43768000-43768999.zip_10.1063/1.4895640")
demo_parse_pdf(book_name="scihub/scihub_05000000/libgen.scimag05036000-05036999.zip_10.1046/j.1365-2036.2000.00699.x")
# text inside colored blocks
# demo_parse_pdf(book_name="scihub/scihub_87200000/libgen.scimag87202000-87202999.zip_10.1080/10220461.2021.1894971")
# 0226
demo_parse_pdf(book_name="scihub/scihub_76100000/libgen.scimag76174000-76174999.zip_10.1016/j.annemergmed.2019.01.040")
demo_parse_pdf(book_name="scihub/scihub_37400000/libgen.scimag37409000-37409999.zip_10.1177/0961203307085251")
demo_parse_pdf(book_name="scihub/scihub_26100000/libgen.scimag26107000-26107999.zip_10.1002/ajim.22195")
demo_parse_pdf(book_name="scihub/scihub_13100000/libgen.scimag13186000-13186999.zip_10.1016/j.brainres.2006.02.013")
demo_parse_pdf(book_name="scihub/scihub_75200000/libgen.scimag75297000-75297999.zip_10.1177/00393207160461-203")
demo_parse_pdf(book_name="scihub/scihub_34200000/libgen.scimag34255000-34255999.zip_10.1093/ojls/gqi025")
demo_parse_pdf(book_name="scihub/scihub_11300000/libgen.scimag11331000-11331999.zip_10.1208/s12249-011-9638-6")
\ No newline at end of file
import io
import json
import os
import boto3
from botocore.config import Config
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
from loguru import logger
# S3 directory prefix under which the test PDFs live; "<book_name>.pdf" is appended
# to it to form the full object path.
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
def get_test_pdf_json(book_name):
    """Load and parse the JSON document stored for ``book_name``.

    The path is built by joining ``json_dump_path`` with ``"<book_name>.json"``.
    The bytes returned by ``read_file`` are decoded as UTF-8 and parsed with
    ``json.loads``.

    Args:
        book_name: Identifier of the book; used as the file stem.

    Returns:
        The parsed JSON object.
    """
    dump_path = join_path(json_dump_path, book_name + ".json")
    raw_bytes = read_file(dump_path, get_s3_config_dict(dump_path))
    return json.loads(raw_bytes.decode('utf-8'))
def read_test_file(book_name):
    """Return the raw bytes of the test PDF for ``book_name``.

    The PDF is first read from ``test_pdf_dir_path``. If that read fails with
    an error whose message contains ``"NoSuchKey"``, the file is fetched from
    the ``file_location`` recorded in the book's JSON, uploaded to the test
    path, and then returned.

    Args:
        book_name: Identifier of the book; used as the file stem.

    Returns:
        The file content, or ``None`` when a failure was logged instead
        (errors are logged, never re-raised).
    """
    pdf_path = join_path(test_pdf_dir_path, book_name + ".pdf")
    pdf_s3_config = get_s3_config_dict(pdf_path)
    try:
        return read_file(pdf_path, pdf_s3_config)
    except Exception as read_err:
        if "NoSuchKey" not in str(read_err):
            # Any other failure is only logged; the caller receives None.
            logger.exception(read_err)
            return None

        logger.warning("File not found in test_pdf_path. Downloading from orig_s3_pdf_path.")
        try:
            # Fall back to the original location recorded in the book's JSON.
            source_path = get_test_pdf_json(book_name).get('file_location')
            source_s3_config = get_s3_config_dict(source_path)
            pdf_bytes = read_file(source_path, source_s3_config)

            # Upload a copy to the test path before returning the bytes.
            client = get_s3_client(pdf_path)
            bucket_name, bucket_key = parse_bucket_key(pdf_path)
            client.upload_fileobj(io.BytesIO(pdf_bytes), bucket_name, bucket_key)
            return pdf_bytes
        except Exception as fallback_err:
            logger.exception(fallback_err)
            return None
def get_docs_from_test_pdf(book_name):
    """Open the test PDF for ``book_name`` as a ``fitz`` document.

    Args:
        book_name: Name passed through to ``read_test_file``.

    Returns:
        The result of ``fitz.open("pdf", <bytes>)`` for the bytes returned by
        ``read_test_file``.
    """
    pdf_bytes = read_test_file(book_name)
    document = fitz.open("pdf", pdf_bytes)
    return document
def get_test_json_data(directory_path, json_file_name):
    """Load a UTF-8 encoded JSON file from a directory.

    Args:
        directory_path: Directory containing the file.
        json_file_name: File name inside ``directory_path``.

    Returns:
        The parsed JSON content.
    """
    json_path = os.path.join(directory_path, json_file_name)
    with open(json_path, "r", encoding='utf-8') as json_file:
        return json.load(json_file)
def get_s3_client(path):
    """Create a boto3 S3 client for the storage that hosts ``path``.

    Credentials and endpoint are taken from the ``ak``, ``sk`` and
    ``endpoint`` entries of the dict returned by ``get_s3_config_dict(path)``.
    The client uses path-style addressing and at most 8 retry attempts.

    Args:
        path: Path handed to ``get_s3_config_dict`` to select the config.

    Returns:
        The object returned by ``boto3.client("s3", ...)``.

    Raises:
        KeyError: If the config dict lacks ``ak``, ``sk`` or ``endpoint``.
    """
    s3_config = get_s3_config_dict(path)
    client_kwargs = {
        "aws_access_key_id": s3_config["ak"],
        "aws_secret_access_key": s3_config["sk"],
        "endpoint_url": s3_config["endpoint"],
    }
    addressing = {"addressing_style": "path"}
    try:
        return boto3.client(
            "s3",
            config=Config(s3=addressing, retries={"max_attempts": 8, "mode": "standard"}),
            **client_kwargs,
        )
    except Exception:
        # older boto3 do not support retries.mode param.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        return boto3.client(
            "s3",
            config=Config(s3=addressing, retries={"max_attempts": 8}),
            **client_kwargs,
        )
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
def test_filebased_reader_writer():
    """Round-trip bytes through FileBasedDataWriter and FileBasedDataReader.

    Both objects are created with the same ``sub`` directory. The test writes
    and reads back one file addressed by a bare file name and one addressed
    by an absolute path that lies outside ``sub``.
    """
    unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
    sub_dir = os.path.join(unitest_dir, 'sub')
    abs_fn = os.path.join(unitest_dir, 'abspath.txt')
    os.makedirs(sub_dir, exist_ok=True)
    try:
        writer = FileBasedDataWriter(sub_dir)
        reader = FileBasedDataReader(sub_dir)

        # File name without a directory part.
        writer.write('test.txt', b'hello world')
        assert reader.read('test.txt') == b'hello world'

        # Absolute path.
        writer.write(abs_fn, b'hello world')
        assert reader.read(abs_fn) == b'hello world'
    finally:
        # Clean up even when an assertion fails, so a failed run does not
        # leave stale files behind for the next one.
        shutil.rmtree(unitest_dir)
import json
import os
import fitz
import pytest
from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config
@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!')
def test_multi_bucket_s3_reader_writer():
    """Exercise MultiBucketS3DataReader/Writer with two bucket configs.

    The S3 settings are read from the environment:
    S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, S3_ENDPOINT for the first bucket
    and S3_BUCKET_2, S3_ACCESS_KEY_2, S3_SECRET_KEY_2, S3_ENDPOINT_2 for the
    second one. The test is skipped when S3_ACCESS_KEY_2 is not set.
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')
    first_config = S3Config(
        bucket_name=bucket,
        access_key=os.getenv('S3_ACCESS_KEY', ''),
        secret_key=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    second_config = S3Config(
        bucket_name=bucket_2,
        access_key=os.getenv('S3_ACCESS_KEY_2', ''),
        secret_key=os.getenv('S3_SECRET_KEY_2', ''),
        endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
    )
    s3configs = [first_config, second_config]

    reader = MultiBucketS3DataReader(bucket, s3configs)
    writer = MultiBucketS3DataWriter(bucket, s3configs)

    index_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A bare key and the full s3:// URL must return the same bytes.
    by_key = reader.read(index_key)
    by_url = reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_url

    # Read a PDF through an s3:// URL that names the second bucket.
    pdf_bits = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    docs = fitz.open('pdf', pdf_bits)
    assert len(docs) == 10

    # A "?bytes=" suffix must match an explicit read_at() call.
    ranged = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(index_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # Write a string and raw bytes, then read both back.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)

    bytes_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    writer.write(bytes_key, '123'.encode())
    assert '123'.encode() == reader.read(bytes_key)
@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!')
def test_multi_bucket_s3_reader_writer_with_prefix():
    """Exercise MultiBucketS3DataReader/Writer created with ``bucket/prefix``.

    The S3 settings are read from the environment:
    S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, S3_ENDPOINT for the first bucket
    and S3_BUCKET_2, S3_ACCESS_KEY_2, S3_SECRET_KEY_2, S3_ENDPOINT_2 for the
    second one. The test is skipped when S3_ACCESS_KEY_2 is not set.
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')
    first_config = S3Config(
        bucket_name=bucket,
        access_key=os.getenv('S3_ACCESS_KEY', ''),
        secret_key=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    second_config = S3Config(
        bucket_name=bucket_2,
        access_key=os.getenv('S3_ACCESS_KEY_2', ''),
        secret_key=os.getenv('S3_SECRET_KEY_2', ''),
        endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
    )
    s3configs = [first_config, second_config]

    prefix = 'meta-index'
    default_location = f'{bucket}/{prefix}'
    reader = MultiBucketS3DataReader(default_location, s3configs)
    writer = MultiBucketS3DataWriter(default_location, s3configs)

    index_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A bare key and the full s3:// URL (including the prefix) must agree.
    by_key = reader.read(index_key)
    by_url = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_url

    # Read a PDF through an s3:// URL that names the second bucket.
    pdf_bits = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    docs = fitz.open('pdf', pdf_bits)
    assert len(docs) == 10

    # A "?bytes=" suffix must match an explicit read_at() call.
    ranged = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(index_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # Write a string, then read it back by bare key and by full URL.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)
    assert 'abc'.encode() == reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # Write raw bytes and read them back.
    bytes_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    writer.write(bytes_key, '123'.encode())
    assert '123'.encode() == reader.read(bytes_key)
import json
import os
import pytest
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!')
def test_s3_reader_writer():
    """Exercise S3DataReader/S3DataWriter created with an empty prefix.

    The S3 settings are read from the environment variables S3_BUCKET,
    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT. The test is skipped when
    S3_ACCESS_KEY is not set.
    """
    bucket = os.getenv('S3_BUCKET', '')
    credentials = (
        os.getenv('S3_ACCESS_KEY', ''),
        os.getenv('S3_SECRET_KEY', ''),
        os.getenv('S3_ENDPOINT', ''),
    )
    reader = S3DataReader('', bucket, *credentials)
    writer = S3DataWriter('', bucket, *credentials)

    index_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A bare key and the full s3:// URL must return the same bytes.
    by_key = reader.read(index_key)
    by_url = reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_url

    # A "?bytes=" suffix must match an explicit read_at() call.
    ranged = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(index_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # Write a string and read it back.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)

    # NOTE(review): the write below uses a "<bucket>/..." path without an
    # s3:// scheme while the read-back uses the bare key - confirm that this
    # is the intended addressing.
    writer.write(
        f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!')
def test_s3_reader_writer_with_prefix():
    """Exercise S3DataReader/S3DataWriter created with a non-empty prefix.

    The S3 settings are read from the environment variables S3_BUCKET,
    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT. The test is skipped when
    S3_ACCESS_KEY is not set.
    """
    bucket = os.getenv('S3_BUCKET', '')
    credentials = (
        os.getenv('S3_ACCESS_KEY', ''),
        os.getenv('S3_SECRET_KEY', ''),
        os.getenv('S3_ENDPOINT', ''),
    )
    prefix = 'meta-index'
    reader = S3DataReader(prefix, bucket, *credentials)
    writer = S3DataWriter(prefix, bucket, *credentials)

    index_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A bare key and the full s3:// URL (including the prefix) must agree.
    by_key = reader.read(index_key)
    by_url = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_url

    # A "?bytes=" suffix must match an explicit read_at() call.
    ranged = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(index_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # Write a string, then read it back by bare key and by full URL.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)
    assert 'abc'.encode() == reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # NOTE(review): the write below uses a "<bucket>/<prefix>/..." path
    # without an s3:// scheme while the read-back uses the bare key -
    # confirm that this is the intended addressing.
    writer.write(
        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
import json
import os
import pytest
from magic_pdf.data.io.s3 import S3Reader, S3Writer
@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found')
def test_s3_reader():
    """Check S3Reader.read and S3Reader.read_at against a known object.

    The S3 settings are read from the environment variables S3_BUCKET,
    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT. The test is skipped when
    S3_ACCESS_KEY is not set.
    """
    reader = S3Reader(
        bucket=os.getenv('S3_BUCKET', ''),
        ak=os.getenv('S3_ACCESS_KEY', ''),
        sk=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    index_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # Whole-object read returns something non-empty.
    whole = reader.read(index_key)
    assert len(whole) > 0

    # The slice requested with (566, 713) must parse as non-empty JSON.
    part = reader.read_at(index_key, 566, 713)
    assert len(json.loads(part)) > 0
@pytest.mark.skipif(os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found')
def test_s3_writer():
    """Write bytes with S3Writer and read them back with S3Reader.

    The S3 settings are read from the environment variables S3_BUCKET,
    S3_ACCESS_KEY, S3_SECRET_KEY and S3_ENDPOINT. The test is skipped when
    S3_ACCESS_KEY is not set.
    """
    s3_kwargs = {
        'bucket': os.getenv('S3_BUCKET', ''),
        'ak': os.getenv('S3_ACCESS_KEY', ''),
        'sk': os.getenv('S3_SECRET_KEY', ''),
        'endpoint_url': os.getenv('S3_ENDPOINT', ''),
    }
    test_fn = 'unittest/io/test.jsonl'

    writer = S3Writer(**s3_kwargs)
    writer.write(test_fn, '123'.encode())

    # The reader is created after the write, with the same settings.
    reader = S3Reader(**s3_kwargs)
    stored = reader.read(test_fn)
    assert stored.decode() == '123'
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
    """Build a PymuDocDataset from the sample PDF and check basic page info.

    The sample file is opened with a path relative to the current working
    directory.
    """
    with open('tests/test_data/assets/pdfs/test_01.pdf', 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    dataset = PymuDocDataset(pdf_bytes)
    assert len(dataset) > 0

    # The first page must report a height above 100.
    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.h > 100
def test_imagedataset():
    """Build an ImageDataset from the sample PNG and check basic page info.

    The sample file is opened with a path relative to the current working
    directory.
    """
    with open('tests/test_data/assets/pngs/test_01.png', 'rb') as image_file:
        image_bytes = image_file.read()

    dataset = ImageDataset(image_bytes)
    assert len(dataset) == 1

    # The only page must report a width above 100.
    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.w > 100
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment