Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f14e50e2
Commit
f14e50e2
authored
Jul 05, 2024
by
赵小蒙
Browse files
remove some code
parent
fb27361e
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
0 additions
and
333 deletions
+0
-333
tools/benchmark.py
tools/benchmark.py
+0
-74
tools/clean_photo.py
tools/clean_photo.py
+0
-112
tools/markdown_calculate.py
tools/markdown_calculate.py
+0
-99
tools/scoring.py
tools/scoring.py
+0
-48
No files found.
tools/benchmark.py
deleted
100644 → 0
View file @
fb27361e
import
zipfile
import
os
import
shutil
import
json
import
markdown_calculate
# Directory the benchmark shell commands `cd` into before running project
# scripts; taken from the GITHUB_WORKSPACE environment variable (None if unset).
code_path = os.environ.get('GITHUB_WORKSPACE')
# Dataset storage path
pdf_dev_path = "/share/quyuan/mineru/data/"
# Final magicpdf results
pdf_res_path = "/share/quyuan/mineru/data/mineru"
# Document categories; used both as file-name prefixes and as directory names
# under <pdf_dev_path>/ci.
file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
def test_cli():
    """Run the magicpdf CLI over the dataset PDFs and collect its Markdown output.

    Steps:
      1. Recreate ``pdf_res_path`` as an empty directory.
      2. From ``code_path``, run ``magic_pdf/cli/magicpdf.py pdf-command`` on
         every ``*.pdf`` found under ``<pdf_dev_path>/output``.
      3. Walk ``pdf_res_path`` and copy each ``.md`` file whose name starts
         with one of ``file_types`` into ``<pdf_dev_path>/ci/<type>/magicpdf``.

    The exit status of the CLI command is not checked.
    """
    # magicpdf model output
    magicpdf_path = os.path.join(pdf_dev_path, "output")

    # Start from an empty result directory without shelling out to `rm -rf`.
    shutil.rmtree(pdf_res_path, ignore_errors=True)
    os.makedirs(pdf_res_path, exist_ok=True)

    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
    os.system(cmd)

    for root, _dirs, files in os.walk(pdf_res_path):
        for magic_file in files:
            if not magic_file.endswith(".md"):
                continue
            for file_type in file_types:
                if not magic_file.startswith(file_type):
                    continue
                target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
                os.makedirs(target_dir, exist_ok=True)
                shutil.copy(os.path.join(root, magic_file),
                            os.path.join(target_dir, magic_file))
def calculate_score():
    """Clean the annotation and magicpdf Markdown, then score magicpdf.

    Runs ``tools/clean_photo.py`` twice from ``code_path`` (once per tool
    name), scores the "magicpdf" output for every entry of ``file_types``
    under ``<pdf_dev_path>/ci`` and returns the overall summary produced by
    ``Scoring.summary_scores()``. Results are written to ``result.json`` in
    that same directory.
    """
    ci_dir = os.path.join(pdf_dev_path, "ci")
    clean_commands = (
        "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s",
        "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s",
    )
    for template in clean_commands:
        os.system(template % (code_path, ci_dir))

    scorer = markdown_calculate.Scoring(os.path.join(ci_dir, "result.json"))
    scorer.calculate_similarity_total("magicpdf", file_types, ci_dir)
    return scorer.summary_scores()
def extrat_zip(zip_file_path, extract_to_path):
    """Extract *zip_file_path* into *extract_to_path*.

    If the path is not a zip archive nothing is extracted and a message is
    printed instead. Returns None in both cases.
    """
    if not zipfile.is_zipfile(zip_file_path):
        print(f'{zip_file_path} is not a zip file')
        return
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(extract_to_path)
        print(f'Files extracted to {extract_to_path}')
def ci_ben():
    """Fail when the freshly computed scores fall below the last recorded ones.

    The last line of ``<pdf_dev_path>/ci/result.json`` is parsed as the
    previous summary. ``output.zip`` is then extracted, the CLI is rerun and
    the output rescored, and each of the three averages is compared with its
    previous value.

    Raises:
        AssertionError: if a previous average is greater than the new one.
    """
    result_path = os.path.join(pdf_dev_path, "ci", "result.json")
    # Read the baseline first and close the file: calculate_score() later
    # reopens this same path for writing.
    with open(result_path, "r") as fr:
        lines = fr.readlines()
    last_score = json.loads(lines[-1].strip())
    print("last_score:", last_score)
    last_simscore = last_score["average_sim_score"]
    last_editdistance = last_score["average_edit_distance"]
    last_bleu = last_score["average_bleu_score"]

    extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
    test_cli()
    now_score = calculate_score()
    print("now_score:", now_score)
    now_simscore = now_score["average_sim_score"]
    now_editdistance = now_score["average_edit_distance"]
    now_bleu = now_score["average_bleu_score"]

    # Explicit raises instead of `assert` so the gate still runs under `python -O`.
    if not last_simscore <= now_simscore:
        raise AssertionError(
            f"average_sim_score regressed: {last_simscore} -> {now_simscore}")
    # NOTE(review): a lower edit distance normally means a closer match, so
    # requiring "previous <= current" here looks inverted. Kept as written;
    # confirm the intended direction.
    if not last_editdistance <= now_editdistance:
        raise AssertionError(
            f"average_edit_distance check failed: {last_editdistance} -> {now_editdistance}")
    if not last_bleu <= now_bleu:
        raise AssertionError(
            f"average_bleu_score regressed: {last_bleu} -> {now_bleu}")
# Run the benchmark regression check when this file is executed as a script.
if __name__ == "__main__":
    ci_ben()
tools/clean_photo.py
deleted
100644 → 0
View file @
fb27361e
import
pypandoc
import
re
import
htmltabletomd
import
os
import
argparse
import
zipfile
# Command-line interface: both options are required strings.
# NOTE(review): the parser is built and parse_args() is called at module
# level, so command-line parsing also happens when this module is imported,
# not only when it is run as a script.
parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
    "--tool_name",
    type=str,
    required=True,
    help="input tool name",
)
parser.add_argument(
    "--download_dir",
    type=str,
    required=True,
    help="input download dir",
)
args = parser.parse_args()
def clean_markdown_images(content):
    """Return *content* with every Markdown image tag ``![alt](target)`` removed."""
    return re.sub(r'!\[[^\]]*\]\([^)]*\)', '', content, flags=re.IGNORECASE)
def clean_ocrmath_photo(content):
    """Return *content* with every ``\\includegraphics[...]{...}`` command removed.

    Matching is case-insensitive and requires the bracketed options part.
    """
    return re.sub(r'\\includegraphics\[.*?\]\{.*?\}', '', content, flags=re.IGNORECASE)
def convert_html_table_to_md(html_table):
    """Build pipe-delimited Markdown rows from a line-oriented HTML table.

    The input is stripped and split on newlines. It is only processed when
    its first line contains ``<tr>``; otherwise an empty string is returned.
    Each line containing ``<th>`` or ``<td>`` cells becomes one
    ``| a | b |`` row.

    NOTE(review): no header separator row (``| --- |``) is emitted, so the
    result may not render as a table in Markdown viewers — confirm intent.
    """
    lines = html_table.strip().split('\n')
    md_table = ''
    if lines and '<tr>' in lines[0]:
        # Stays True until a line with <th> cells has been seen; <td> lines
        # that appear before the first header line are skipped.
        in_thead = True
        for line in lines:
            if '<th>' in line:
                cells = re.findall(r'<th>(.*?)</th>', line)
                md_table += '| ' + ' | '.join(cells) + ' |\n'
                in_thead = False
            elif '<td>' in line and not in_thead:
                cells = re.findall(r'<td>(.*?)</td>', line)
                md_table += '| ' + ' | '.join(cells) + ' |\n'
        # Normalize to exactly one trailing newline.
        md_table = md_table.rstrip() + '\n'
    return md_table
def convert_latext_to_md(content):
    """Convert every LaTeX ``tabular`` environment in *content* to Markdown.

    Each ``\\begin{tabular}...\\end{tabular}`` span is passed to pandoc (via
    ``pypandoc.convert_text``) and replaced by the converted text. If the
    conversion fails for a table, that table is left unchanged.

    The previous implementation rebuilt the search string with a stray "cl"
    before ``\\end{tabular}``, so it never matched and no table was ever
    replaced. It also wrote a scratch ``output.md`` into the working
    directory and left that file handle open.

    Args:
        content: Markdown/LaTeX text to process.

    Returns:
        The text with convertible tables replaced by Markdown.
    """
    def _to_markdown(match):
        latex_table = match.group(0)
        try:
            # Without `outputfile`, convert_text returns the converted string.
            return pypandoc.convert_text(latex_table, format="latex", to="md", encoding="utf-8")
        except Exception:
            # Best effort: keep the original table when pandoc cannot convert it.
            return latex_table

    # A callable replacement avoids backslash-escape processing of the output.
    return re.sub(r'\\begin\{tabular\}.*?\\end\{tabular\}', _to_markdown, content, flags=re.DOTALL)
def convert_htmltale_to_md(content):
    """Replace every ``<table>...</table>`` span in *content* with Markdown.

    The text between the tags (without the ``<table>`` wrapper) is passed to
    ``htmltabletomd.convert_table``. If the conversion raises, the span is
    replaced by that inner text unchanged.

    Args:
        content: text that may contain HTML tables.

    Returns:
        The text with each table span replaced.
    """
    def _to_markdown(match):
        inner = match.group(1)
        try:
            return htmltabletomd.convert_table(inner)
        except Exception:
            # Narrower than the former bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            return inner

    # A single substitution pass replaces the placeholder round-trip, which
    # could clash with placeholder-looking text already in the input.
    return re.sub(r'<table>(.*?)</table>', _to_markdown, content, flags=re.DOTALL)
def clean_data(prod_type, download_dir):
    """Write cleaned copies of every Markdown file produced by *prod_type*.

    For each document category, every ``.md`` file in
    ``<download_dir>/<category>/<prod_type>`` is read, passed through the
    four cleaning steps in order (HTML tables, Markdown images,
    ``\\includegraphics`` commands, LaTeX tables) and written to the
    ``cleaned`` sub-directory as ``cleaned_<filename>``.
    """
    categories = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
    for category in categories:
        tgt_dir = os.path.join(download_dir, category, prod_type, "cleaned")
        if not os.path.exists(tgt_dir):
            os.makedirs(tgt_dir)
        source_dir = os.path.join(download_dir, category, prod_type)
        for filename in os.listdir(source_dir):
            if not filename.endswith('.md'):
                continue
            input_file = os.path.join(source_dir, filename)
            output_file = os.path.join(tgt_dir, "cleaned_" + filename)
            with open(input_file, 'r', encoding='utf-8') as fr:
                text = fr.read()
                text = convert_htmltale_to_md(text)
                text = clean_markdown_images(text)
                text = clean_ocrmath_photo(text)
                text = convert_latext_to_md(text)
                with open(output_file, 'w', encoding='utf-8') as fw:
                    fw.write(text)
# Script entry point: clean the Markdown of the tool named on the command line.
if __name__ == '__main__':
    tool_type = args.tool_name
    download_dir = args.download_dir
    clean_data(tool_type, download_dir)
tools/markdown_calculate.py
deleted
100644 → 0
View file @
fb27361e
import
os
from
Levenshtein
import
distance
from
nltk.translate.bleu_score
import
sentence_bleu
,
SmoothingFunction
,
corpus_bleu
from
nltk.tokenize
import
word_tokenize
import
json
import
re
import
scoring
import
argparse
import
nltk
# Download the NLTK 'punkt' resource at import time
# (presumably required by word_tokenize — confirm).
nltk.download('punkt')
# Initialize lists to store edit distances and BLEU scores
class Scoring:
    """Compare annotated Markdown files with tool output and record the scores.

    Per-file edit distance, BLEU and chunk-similarity scores are accumulated
    on the instance and written, together with per-class averages, to the
    result file opened in ``__init__``.
    """

    def __init__(self, result_path):
        """Start with empty accumulators and open *result_path* for writing.

        The file is opened with mode ``"w+"``, so existing content is
        truncated. Call ``close()`` when finished.
        """
        self.edit_distances = []
        self.bleu_scores = []
        self.sim_scores = []
        self.filenames = []
        self.score_dict = {}
        self.anntion_cnt = 0
        self.fw = open(result_path, "w+")

    def close(self):
        """Close the result file; safe to call more than once."""
        if not self.fw.closed:
            self.fw.close()

    @staticmethod
    def _average(values):
        """Return the arithmetic mean of *values*, or 0 for an empty list."""
        return sum(values) / len(values) if values else 0

    def simple_bleu_score(self, candidate, reference):
        """Return the sentence-level BLEU score of *candidate* against *reference*.

        Both strings are tokenized with ``word_tokenize`` and scored with
        ``SmoothingFunction().method1`` smoothing.
        """
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)

    def preprocess_string(self, s):
        """Collapse each run of newlines in *s* into a single newline."""
        sub_enter = re.sub(r'\n+', '\n', s)
        # NOTE(review): as written this replaces a single space with a single
        # space, i.e. it is a no-op. It may originally have collapsed repeated
        # spaces — confirm before changing, since it affects edit distances.
        return re.sub(r' ', ' ', sub_enter)

    def calculate_similarity(self, annotion, actual, tool_type):
        """Score every annotated ``.md`` file against its counterpart in *actual*.

        Args:
            annotion: directory holding the reference Markdown files.
            actual: directory holding the files to evaluate (same file names).
            tool_type: label used as the prefix of the report lines.

        Returns:
            ``self.score_dict``: per-file scores accumulated over all calls.

        Hidden files are ignored. Files missing from *actual* are reported on
        stdout and lower the extract ratio. An annotation directory without
        any ``.md`` file yields a ratio of 0 instead of raising
        ZeroDivisionError.
        """
        class_dict = {}
        edit_distances = []
        bleu_scores = []
        sim_scores = []
        total_file = 0
        for filename in os.listdir(annotion):
            # Ignore hidden files
            if not filename.endswith('.md') or filename.startswith('.'):
                continue
            total_file += 1
            # Read the reference file
            with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
                content_a = file_a.read()
            self.anntion_cnt += 1
            filepath_b = os.path.join(actual, filename)
            if not os.path.exists(filepath_b):
                print(f"File {filename} not found in actual directory.")
                continue
            with open(filepath_b, 'r', encoding='utf-8') as file_b:
                content_b = file_b.read()
            self.filenames.append(filename)

            # Edit distance, normalized by the longer raw length.
            # Two empty files are identical, so their distance is 0.
            longest = max(len(content_a), len(content_b))
            if longest:
                edit_dist = distance(self.preprocess_string(content_b), self.preprocess_string(content_a)) / longest
            else:
                edit_dist = 0.0
            self.edit_distances.append(edit_dist)
            edit_distances.append(edit_dist)

            # BLEU score
            bleu_score = self.simple_bleu_score(content_b, content_a)
            bleu_scores.append(bleu_score)
            self.bleu_scores.append(bleu_score)

            # Chunk-similarity score
            score = scoring.score_text(content_b, content_a)
            sim_scores.append(score)
            self.sim_scores.append(score)

            class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
            self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}

        # Per-class averages
        class_average_edit_distance = self._average(edit_distances)
        class_average_bleu_score = self._average(bleu_scores)
        class_average_sim_score = self._average(sim_scores)
        # Share of annotated files for which an output file was found.
        ratio = len(class_dict) / total_file if total_file else 0

        report_lines = [
            f"{tool_type} extract ratio: {ratio}",
            f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}",
            f"{tool_type} Average BLEU Score: {class_average_bleu_score}",
            f"{tool_type} Average Sim Score: {class_average_sim_score}",
        ]
        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        for line in report_lines:
            self.fw.write(line + "\n")
        self.fw.flush()
        for line in report_lines:
            print(line)
        return self.score_dict

    def summary_scores(self):
        """Write the overall averages as one JSON line and return them as a dict."""
        # Overall averages
        over_all_dict = {
            "average_edit_distance": self._average(self.edit_distances),
            "average_bleu_score": self._average(self.bleu_scores),
            "average_sim_score": self._average(self.sim_scores),
        }
        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
        # Flush so the summary line is on disk even if close() is never called.
        self.fw.flush()
        return over_all_dict

    def calculate_similarity_total(self, tool_type, file_types, download_dir):
        """Run ``calculate_similarity`` for each entry of *file_types*.

        For every type, the ``cleaned`` annotation directory is compared with
        the ``cleaned`` directory of *tool_type* under *download_dir*.
        """
        for file_type in file_types:
            annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
            actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
            self.calculate_similarity(annotion, actual, file_type)
tools/scoring.py
deleted
100644 → 0
View file @
fb27361e
import
math
from
rapidfuzz
import
fuzz
import
re
import
regex
from
statistics
import
mean
# Chunks must be strictly longer than this many characters to be kept.
CHUNK_MIN_CHARS = 25


def chunk_text(text, chunk_len=500):
    """Split *text* into consecutive pieces of at most *chunk_len* characters.

    Pieces that are whitespace-only or not longer than ``CHUNK_MIN_CHARS``
    are dropped.
    """
    kept = []
    for start in range(0, len(text), chunk_len):
        piece = text[start:start + chunk_len]
        if piece.strip() and len(piece) > CHUNK_MIN_CHARS:
            kept.append(piece)
    return kept
def overlap_score(hypothesis_chunks, reference_chunks):
    """Return, for each hypothesis chunk, its best fuzzy-match score in [0, 1].

    Each hypothesis chunk is compared (``fuzz.ratio`` with a cutoff of 30,
    divided by 100) against the reference chunks inside a window around a
    scaled index. A chunk with no candidate above the cutoff scores 0.
    """
    ref_count = len(reference_chunks)
    length_modifier = len(hypothesis_chunks) / ref_count if ref_count > 0 else 0
    search_distance = max(ref_count // 5, 10)

    chunk_scores = []
    for i, hyp_chunk in enumerate(hypothesis_chunks):
        # NOTE(review): the index is scaled by len(hypothesis)/len(reference);
        # mapping a hypothesis index onto the reference list would normally
        # use the inverse ratio — confirm against the upstream metric.
        i_offset = int(i * length_modifier)
        window_start = max(0, i_offset - search_distance)
        window_end = min(ref_count, i_offset + search_distance)
        best = 0
        for ref_chunk in reference_chunks[window_start:window_end]:
            candidate = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
            if candidate > best:
                best = candidate
        chunk_scores.append(best)
    return chunk_scores
def score_text(hypothesis, reference):
    """Return a 0-1 alignment score between *hypothesis* and *reference*.

    Both texts are chunked and the per-chunk overlap scores are averaged.
    Returns 0 when there are no hypothesis chunks to score.
    """
    chunk_scores = overlap_score(chunk_text(hypothesis), chunk_text(reference))
    if not chunk_scores:
        return 0
    return mean(chunk_scores)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment