para_test_pdf_ids.ini 14.1 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# """
# Para Test Cases of Sci-Hub
# """

# Layout-related cases.
demo_parse_pdf(book_name="scihub/scihub_04600000/libgen.scimag04690000-04690999.zip_10.1016/s0378-4347(98)00269-2") # layout
demo_parse_pdf(book_name="scihub/scihub_18500000/libgen.scimag18539000-18539999.zip_10.1039/c2ob27232f")            # layout
demo_parse_pdf(book_name="scihub/scihub_28400000/libgen.scimag28413000-28413999.zip_10.2307/1316224")               # layout
demo_parse_pdf(book_name="scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178")             # Titles: done. Layout: TODO. Formula replacement: TODO.

# """
# FOR TEST: 0115~0119 data test
# https://aicarrier.feishu.cn/wiki/PIBKwMja9iBySpkV5Ywcy5zdnFE
demo_parse_pdf(book_name="scihub/scihub_18000000/libgen.scimag18064000-18064999.zip_10.1016/s1385-299x(98)00049-x") # Title sequence. Second-level titles still need to be recognized, TODO.
demo_parse_pdf(book_name="scihub/scihub_24400000/libgen.scimag24401000-24401999.zip_10.1016/j.toxicon.2014.02.018") # Title sequence
demo_parse_pdf(book_name="scihub/scihub_29200000/libgen.scimag29285000-29285999.zip_10.1016/j.tiv.2014.09.004")     # Title sequence

demo_parse_pdf(book_name="scihub/scihub_87000000/libgen.scimag87032000-87032999.zip_10.3390/rs13010076")                 # Titles; many issues, still to be handled
demo_parse_pdf(book_name="scihub/scihub_75300000/libgen.scimag75322000-75322999.zip_10.7748/ns2007.05.21.34.35.c4550")   # Titles and paragraphs. DONE
demo_parse_pdf(book_name="scihub/scihub_11400000/libgen.scimag11451000-11451999.zip_10.1017/s0009838811000231")          # Titles
demo_parse_pdf(book_name="scihub/scihub_23000000/libgen.scimag23039000-23039999.zip_10.1007/s10897-013-9674-3")          # Titles
demo_parse_pdf(book_name="scihub/scihub_23400000/libgen.scimag23485000-23485999.zip_10.1016/S0891-5849(13)00133-0")      # Titles
demo_parse_pdf(book_name="scihub/scihub_36500000/libgen.scimag36588000-36588999.zip_10.1109/adprl.2014.7010623")         # Titles
demo_parse_pdf(book_name="scihub/scihub_47300000/libgen.scimag47374000-47374999.zip_10.1016/B978-0-12-410502-7.00005-3") # Titles
demo_parse_pdf(book_name="scihub/scihub_80300000/libgen.scimag80371000-80371999.zip_10.1097/MJT.0000000000001112")       # Titles
demo_parse_pdf(book_name="scihub/scihub_82500000/libgen.scimag82595000-82595999.zip_10.1016/B978-0-12-817752-5.00003-2") # Titles
demo_parse_pdf(book_name="scihub/scihub_01900000/libgen.scimag01914000-01914999.zip_10.1006/ndsh.1998.0025")             # Titles. 240 pages. DONE
demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09714000-09714999.zip_10.1111/j.1524-4733.2009.00592_2.x") # Titles. 240 pages

demo_parse_pdf(book_name="scihub/scihub_87300000/libgen.scimag87385000-87385999.zip_10.1515/pthp-2020-0016")        # Paragraph splitting
demo_parse_pdf(book_name="scihub/scihub_23000000/libgen.scimag23003000-23003999.zip_10.1007/s12264-013-1404-1")     # Paragraphs
demo_parse_pdf(book_name="scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622") # Cross-page paragraph merging. The text of the second page does not appear.
demo_parse_pdf(book_name="scihub/scihub_84500000/libgen.scimag84570000-84570999.zip_10.1017/S0033583520000086")     # Paragraph segmentation
# END TEST CASES: 0115~0119 data test
# """

# Title-sequence cases.
demo_parse_pdf(book_name="scihub/scihub_18000000/libgen.scimag18064000-18064999.zip_10.1016/s1385-299x(98)00049-x") # Title sequence. Second-level titles still need to be recognized, TODO.
demo_parse_pdf(book_name="scihub/scihub_24400000/libgen.scimag24401000-24401999.zip_10.1016/j.toxicon.2014.02.018") # Title sequence
demo_parse_pdf(book_name="scihub/scihub_29200000/libgen.scimag29285000-29285999.zip_10.1016/j.tiv.2014.09.004")     # Title sequence
demo_parse_pdf(book_name="scihub/scihub_50900000/libgen.scimag50902000-50902999.zip_10.1007/s12274-016-1035-8")     # Title sequence

# Title case with many open issues. Listed once: the entry was previously
# repeated verbatim on two adjacent lines, which parsed the same PDF twice.
demo_parse_pdf(book_name="scihub/scihub_87000000/libgen.scimag87032000-87032999.zip_10.3390/rs13010076") # Titles; many issues, still to be handled

# Title (and title + paragraph) cases. The two 240-page documents at the end are
# commented out in this list.
demo_parse_pdf(book_name="scihub/scihub_62600000/libgen.scimag62633000-62633999.zip_10.2169/internalmedicine.54.2755")   # Titles and paragraphs. DONE
demo_parse_pdf(book_name="scihub/scihub_75300000/libgen.scimag75322000-75322999.zip_10.7748/ns2007.05.21.34.35.c4550")   # Titles and paragraphs. DONE
demo_parse_pdf(book_name="scihub/scihub_11400000/libgen.scimag11451000-11451999.zip_10.1017/s0009838811000231")          # Titles
demo_parse_pdf(book_name="scihub/scihub_23000000/libgen.scimag23039000-23039999.zip_10.1007/s10897-013-9674-3")          # Titles
demo_parse_pdf(book_name="scihub/scihub_23400000/libgen.scimag23485000-23485999.zip_10.1016/S0891-5849(13)00133-0")      # Titles
demo_parse_pdf(book_name="scihub/scihub_36500000/libgen.scimag36588000-36588999.zip_10.1109/adprl.2014.7010623")         # Titles
demo_parse_pdf(book_name="scihub/scihub_47300000/libgen.scimag47374000-47374999.zip_10.1016/B978-0-12-410502-7.00005-3") # Titles
demo_parse_pdf(book_name="scihub/scihub_80300000/libgen.scimag80371000-80371999.zip_10.1097/MJT.0000000000001112")       # Titles
demo_parse_pdf(book_name="scihub/scihub_82500000/libgen.scimag82595000-82595999.zip_10.1016/B978-0-12-817752-5.00003-2") # Titles
# demo_parse_pdf(book_name="scihub/scihub_01900000/libgen.scimag01914000-01914999.zip_10.1006/ndsh.1998.0025") # Titles. 240 pages. DONE
# demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09714000-09714999.zip_10.1111/j.1524-4733.2009.00592_2.x") # Titles. 240 pages

# Paragraph splitting / merging cases.
demo_parse_pdf(book_name="scihub/scihub_87300000/libgen.scimag87385000-87385999.zip_10.1515/pthp-2020-0016")        # Paragraph splitting
demo_parse_pdf(book_name="scihub/scihub_23000000/libgen.scimag23003000-23003999.zip_10.1007/s12264-013-1404-1")     # Paragraphs
demo_parse_pdf(book_name="scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622") # Cross-page paragraph merging. The text of the second page does not appear.
demo_parse_pdf(book_name="scihub/scihub_84500000/libgen.scimag84570000-84570999.zip_10.1017/S0033583520000086")     # Paragraph segmentation

# Formula cases.
demo_parse_pdf(book_name="scihub/scihub_31800000/libgen.scimag31824000-31824999.zip_10.1109/med.2012.6265668") # Complex formulas
demo_parse_pdf(book_name="scihub/scihub_36800000/libgen.scimag36890000-36890999.zip_10.2514/1.4659")           # Titles, formulas
demo_parse_pdf(book_name="scihub/scihub_58900000/libgen.scimag58981000-58981999.zip_10.1504/ijep.2014.065921") # Formulas

demo_parse_pdf(book_name="scihub/scihub_23900000/libgen.scimag23969000-23969999.zip_10.1016/S0008-4182(06)80004-9") # Determine whether the text is native text. DONE
demo_parse_pdf(book_name="scihub/scihub_69400000/libgen.scimag69479000-69479999.zip_10.1002/0470871660.part1")      # Paragraph joining, separators. TODO
demo_parse_pdf(book_name="scihub/scihub_38000000/libgen.scimag38025000-38025999.zip_10.1557/JMR.2001.0368")         # Titles; the title font is slightly smaller than the body text. TODO

# Cases annotated SKIP.
demo_parse_pdf(book_name="scihub/scihub_12600000/libgen.scimag12683000-12683999.zip_10.1007/s10126-001-0057-7")        # One block, one line. SKIP
demo_parse_pdf(book_name="scihub/scihub_68900000/libgen.scimag68948000-68948999.zip_10.1002/uog.18760")                # Title and body text are mixed inside a single block. SKIP
demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09782000-09782999.zip_10.1111/j.1540-627x.2006.00176.x") # Title and body text are mixed inside a single block. SKIP

# Paragraph-merging cases marked DONE.
# The 10.7589/0090-3558-40.3.579 entries now use the "libgen.scimag" archive
# prefix like every other entry in this list; they previously read
# "libgen.scismag", which does not match the naming of any other archive.
# NOTE(review): re-check whether that S3 object is still reported as missing
# once the corrected path is used.
demo_parse_pdf(book_name="scihub/scihub_45700000/libgen.scimag45725000-45725999.zip_10.1210/en.2008-1281")        # Paragraph merging. DONE. The S3 file is no longer available.
demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")  # Paragraph merging. DONE. The S3 file is no longer available.
demo_parse_pdf(book_name="scihub/scihub_83100000/libgen.scimag83192000-83192999.zip_10.1680/jcoma.20.00045")      # Paragraph merging, titles, etc. DONE. The S3 file is no longer available.

demo_parse_pdf(book_name="scihub/scihub_47200000/libgen.scimag47212000-47212999.zip_10.7589/0090-3558-40.3.579")    # Paragraph merging. DONE
demo_parse_pdf(book_name="scihub/scihub_80800000/libgen.scimag80896000-80896999.zip_10.1080/13552600.2019.1695968") # Cross-page paragraph merging. DONE

demo_parse_pdf(book_name="scihub/scihub_59800000/libgen.scimag59898000-59898999.zip_10.1002/0471469572.ch5") # The second paragraph was wrongly merged when the previous paragraph ended with terminal punctuation. DONE. This PDF has many figure captions that interfere with body-text merging. TODO

demo_parse_pdf(book_name="scihub/scihub_61700000/libgen.scimag61796000-61796999.zip_10.1080/10511970.2016.1162889")    # The second paragraph was wrongly merged when the previous paragraph ended with terminal punctuation. DONE
demo_parse_pdf(book_name="scihub/scihub_54500000/libgen.scimag54597000-54597999.zip_10.1186/s40814-015-0033-z")        # DONE
demo_parse_pdf(book_name="scihub/scihub_82000000/libgen.scimag82096000-82096999.zip_10.1016/j.mehy.2020.109851")       # This part of the text was removed as a header/footer. SKIP
demo_parse_pdf(book_name="scihub/scihub_84600000/libgen.scimag84652000-84652999.zip_10.1163/1876312x-00001010")        # Content lost. DONE
demo_parse_pdf(book_name="scihub/scihub_17000000/libgen.scimag17000000-17000999.zip_10.1016/j.pain.2004.06.005")       # No. 19: investigate the extra-space cases. DONE
demo_parse_pdf(book_name="scihub/scihub_01500000/libgen.scimag01561000-01561999.zip_10.1002/jmor.10887")               # Titles. TODO
demo_parse_pdf(book_name="scihub/scihub_09700000/libgen.scimag09782000-09782999.zip_10.1111/j.1540-627x.2006.00176.x") # Paragraph merging. TODO
demo_parse_pdf(book_name="scihub/scihub_50900000/libgen.scimag50902000-50902999.zip_10.1007/s12274-016-1035-8")        # Paragraph merging. TODO
demo_parse_pdf(book_name="scihub/scihub_62600000/libgen.scimag62633000-62633999.zip_10.2169/internalmedicine.54.2755") # Paragraph merging. TODO
demo_parse_pdf(book_name="scihub/scihub_73400000/libgen.scimag73490000-73490999.zip_10.1051/epjconf/201714609022")     # Paragraph merging. TODO
demo_parse_pdf(book_name="scihub/scihub_80100000/libgen.scimag80185000-80185999.zip_10.1353/sib.0.0003")               # Paragraph merging. Paper from 2006. Non-standard PDF formatting. Many issues. TODO
demo_parse_pdf(book_name="scihub/scihub_62200000/libgen.scimag62272000-62272999.zip_10.1038/srep31158")                # The next page's para blocks are empty, which made cross-page paragraph merging fail. DONE

# NOTE(review): the trailing numbers presumably refer to items in an external
# issue list that is not part of this file — TODO confirm.
demo_parse_pdf(book_name="scihub/scihub_21100000/libgen.scimag21136000-21136999.zip_10.1080/10407780490277879")              # 29. Paragraph starts with a single capital letter. DONE
demo_parse_pdf(book_name="scihub/scihub_09800000/libgen.scimag09855000-09855999.zip_10.1111/j.1574-0862.2006.00110.x")       # 2. DONE
demo_parse_pdf(book_name="scihub/scihub_29400000/libgen.scimag29456000-29456999.zip_10.1177/0883911505049656")               # 8,9
demo_parse_pdf(book_name="scihub/scihub_31800000/libgen.scimag31839000-31839999.zip_10.4028/www.scientific.net/jbbte.19.99") # 14
demo_parse_pdf(book_name="scihub/scihub_30200000/libgen.scimag30263000-30263999.zip_10.1081/scc-200036639")                  # 21
demo_parse_pdf(book_name="scihub/scihub_58800000/libgen.scimag58887000-58887999.zip_10.1097/gme.0b013e3181c17c06")           # 30
demo_parse_pdf(book_name="scihub/scihub_09000000/libgen.scimag09082000-09082999.zip_10.1111/j.1365-2222.2010.03605.x")       # 31,32
demo_parse_pdf(book_name="scihub/scihub_71200000/libgen.scimag71224000-71224999.zip_10.1038/s41396-018-0231-9")              # 34
demo_parse_pdf(book_name="scihub/scihub_24600000/libgen.scimag24665000-24665999.zip_10.1016/S0387-7604(89)80007-5")          # 39

# """
# Found by wang zhi
# 2024-1-24
# NOTE(review): the trailing numbers presumably index the reporter's own issue
# list, which is not part of this file — TODO confirm.
# """
demo_parse_pdf(book_name="scihub/scihub_42100000/libgen.scimag42132000-42132999.zip_10.1080/15205430903457430")          # 3
demo_parse_pdf(book_name="scihub/scihub_20000000/libgen.scimag20099000-20099999.zip_10.1016/s0015-0282(02)03774-3")      # 4
demo_parse_pdf(book_name="scihub/scihub_64900000/libgen.scimag64993000-64993999.zip_10.1097/SIH.0000000000000179")       # 5
demo_parse_pdf(book_name="scihub/scihub_01000000/libgen.scimag01040000-01040999.zip_10.1002/chin.200846224")             # 12
demo_parse_pdf(book_name="scihub/scihub_42100000/libgen.scimag42162000-42162999.zip_10.1093/notesj/gjm116")              # 13
demo_parse_pdf(book_name="scihub/scihub_73100000/libgen.scimag73198000-73198999.zip_10.1007/978-1-4842-4197-4_4")        # 20
demo_parse_pdf(book_name="scihub/scihub_16500000/libgen.scimag16596000-16596999.zip_10.1016/j.febslet.2005.05.011")      # 24
demo_parse_pdf(book_name="scihub/scihub_53900000/libgen.scimag53941000-53941999.zip_10.1016/s0264-3707(15)00099-x")      # 26
demo_parse_pdf(book_name="scihub/scihub_60800000/libgen.scimag60801000-60801999.zip_10.1680/mpal.2014.167.1.61")         # 27
demo_parse_pdf(book_name="scihub/scihub_14100000/libgen.scimag14109000-14109999.zip_10.1016/s0960-9822(00)00772-7")      # 32
demo_parse_pdf(book_name="scihub/scihub_76200000/libgen.scimag76297000-76297999.zip_10.4018/jehmc.2011040101")           # 33
demo_parse_pdf(book_name="scihub/scihub_77600000/libgen.scimag77607000-77607999.zip_10.1016/j.nut.2019.08.013")          # 40
demo_parse_pdf(book_name="scihub/scihub_82100000/libgen.scimag82168000-82168999.zip_10.1061/(ASCE)SU.1943-5428.0000323") # 42

# """
# Extra text appearing in paragraphs merged across pages
# """

demo_parse_pdf(book_name="scihub/scihub_48600000/libgen.scimag48645000-48645999.zip_10.1163/22941932-90000220")        # wang zhi, 31. DONE
demo_parse_pdf(book_name="scihub/scihub_09000000/libgen.scimag09082000-09082999.zip_10.1111/j.1365-2222.2010.03605.x") # xiang, 32
demo_parse_pdf(book_name="scihub/scihub_09800000/libgen.scimag09855000-09855999.zip_10.1111/j.1574-0862.2006.00110.x") # wang zhi, 18. DONE
demo_parse_pdf(book_name="scihub/scihub_76200000/libgen.scimag76297000-76297999.zip_10.4018/jehmc.2011040101")         # xiang, 4,5,6. DONE

# """
# Content-loss issues
# """

# """
# Missing titles
# """
demo_parse_pdf(book_name="scihub/scihub_31800000/libgen.scimag31824000-31824999.zip_10.1109/med.2012.6265668") # xiang. Dropped by the text_block_horizontal_overlap rule.

demo_parse_pdf(book_name="scihub/scihub_01800000/libgen.scimag01870000-01870999.zip_10.1006/jcis.2000.7105") # wangzhi, 1-24, 67. Content missing; the lost content is "thus, there is a". Checking...

demo_parse_pdf(book_name="scihub/") # NOTE(review): book_name holds only the "scihub/" prefix, with no archive or DOI; presumably a template for new entries — confirm it is not meant to run as-is.

# """
# Found by Liu Silu
# 2024-02-04
# """
demo_parse_pdf(book_name="scihub/scihub_38900000/libgen.scimag38917000-38917999.zip_10.1142/S0219649208002159") #







demo_parse_pdf(book_name="scihub/") # NOTE(review): book_name holds only the "scihub/" prefix, with no archive or DOI; presumably a template for new entries — confirm it is not meant to run as-is.