Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
paddle_dbnet
Commits
3302a0b1
Unverified
Commit
3302a0b1
authored
Jun 09, 2021
by
zhoujun
Committed by
GitHub
Jun 09, 2021
Browse files
Revert "add table eval and predict script" (#3062)
parent
85aeae71
Changes
32
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
7 additions
and
1160 deletions
+7
-1160
ppstructure/table/predict_structure.py
ppstructure/table/predict_structure.py
+0
-141
ppstructure/table/predict_table.py
ppstructure/table/predict_table.py
+0
-221
ppstructure/table/table_metric/__init__.py
ppstructure/table/table_metric/__init__.py
+0
-16
ppstructure/table/table_metric/parallel.py
ppstructure/table/table_metric/parallel.py
+0
-51
ppstructure/table/table_metric/table_metric.py
ppstructure/table/table_metric/table_metric.py
+0
-247
ppstructure/table/tablepyxl/__init__.py
ppstructure/table/tablepyxl/__init__.py
+0
-13
ppstructure/table/tablepyxl/style.py
ppstructure/table/tablepyxl/style.py
+0
-283
ppstructure/table/tablepyxl/tablepyxl.py
ppstructure/table/tablepyxl/tablepyxl.py
+0
-118
ppstructure/utility.py
ppstructure/utility.py
+0
-59
tools/infer/predict_det.py
tools/infer/predict_det.py
+1
-1
tools/infer/predict_system.py
tools/infer/predict_system.py
+5
-4
tools/infer/utility.py
tools/infer/utility.py
+1
-6
No files found.
ppstructure/table/predict_structure.py
deleted
100755 → 0
View file @
85aeae71
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
__dir__
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
sys
.
path
.
append
(
__dir__
)
sys
.
path
.
append
(
os
.
path
.
abspath
(
os
.
path
.
join
(
__dir__
,
'../..'
)))
os
.
environ
[
"FLAGS_allocator_strategy"
]
=
'auto_growth'
import
cv2
import
numpy
as
np
import
math
import
time
import
traceback
import
paddle
import
tools.infer.utility
as
utility
from
ppocr.data
import
create_operators
,
transform
from
ppocr.postprocess
import
build_post_process
from
ppocr.utils.logging
import
get_logger
from
ppocr.utils.utility
import
get_image_file_list
,
check_and_read_gif
logger
=
get_logger
()
class TableStructurer(object):
    """Predict the HTML structure tokens and cell boxes of a table image."""

    def __init__(self, args):
        """Build the pre-processing ops, the label decoder and the predictor.

        Args:
            args: parsed options. This constructor reads
                ``structure_max_len``, ``structure_char_type``,
                ``structure_char_dict_path``, ``structure_max_text_length``,
                ``structure_max_elem_length`` and ``structure_max_cell_num``;
                ``args`` is also forwarded to ``utility.create_predictor``.
        """
        pre_process_list = [{
            'ResizeTableImage': {
                'max_len': args.structure_max_len
            }
        }, {
            'NormalizeImage': {
                'std': [0.229, 0.224, 0.225],
                'mean': [0.485, 0.456, 0.406],
                'scale': '1./255.',
                'order': 'hwc'
            }
        }, {
            'PaddingTableImage': None
        }, {
            'ToCHWImage': None
        }, {
            'KeepKeys': {
                'keep_keys': ['image']
            }
        }]
        postprocess_params = {
            'name': 'TableLabelDecode',
            "character_type": args.structure_char_type,
            "character_dict_path": args.structure_char_dict_path,
            "max_text_length": args.structure_max_text_length,
            "max_elem_length": args.structure_max_elem_length,
            "max_cell_num": args.structure_max_cell_num
        }

        self.preprocess_op = create_operators(pre_process_list)
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors = \
            utility.create_predictor(args, 'structure', logger)

    def __call__(self, img):
        """Run structure prediction on one image.

        Args:
            img: input image; its first two ``shape`` entries are used as
                height and width.

        Returns:
            ``((structure_str_list, res_loc_final), elapse)`` where
            ``structure_str_list`` is the decoded token list wrapped in
            ``<html><body><table>`` ... ``</table></body></html>``,
            ``res_loc_final`` is a list of ``[left, top, right, bottom]``
            pixel boxes and ``elapse`` is the time in seconds spent on
            inference and decoding. ``(None, 0)`` is returned when
            pre-processing yields no image.
        """
        # Only the original size is needed later, so read it up front
        # instead of keeping a full copy of the input image.
        imgh, imgw = img.shape[0:2]

        data = transform({'image': img}, self.preprocess_op)
        # NOTE(review): transform() presumably returns None when an operator
        # rejects the sample - confirm against ppocr.data. Guarding here
        # avoids indexing None below.
        if data is None:
            return None, 0
        img = data[0]
        if img is None:
            return None, 0
        img = np.expand_dims(img, axis=0)
        img = img.copy()
        starttime = time.time()

        self.input_tensor.copy_from_cpu(img)
        self.predictor.run()
        outputs = [
            output_tensor.copy_to_cpu()
            for output_tensor in self.output_tensors
        ]

        # Output 1 is fed to the decoder as structure probabilities and
        # output 0 as box locations.
        preds = {
            'structure_probs': outputs[1],
            'loc_preds': outputs[0],
        }
        post_result = self.postprocess_op(preds)

        structure_str_list = post_result['structure_str_list']
        res_loc = post_result['res_loc']

        # Scale the predicted boxes by the image size and clamp them to the
        # image bounds.
        res_loc_final = []
        for x0, y0, x1, y1 in res_loc[0]:
            left = max(int(imgw * x0), 0)
            top = max(int(imgh * y0), 0)
            right = min(int(imgw * x1), imgw - 1)
            bottom = min(int(imgh * y1), imgh - 1)
            res_loc_final.append([left, top, right, bottom])

        # The last decoded token of the first sample is dropped before the
        # document wrapper is added.
        structure_str_list = structure_str_list[0][:-1]
        structure_str_list = ['<html>', '<body>', '<table>'
                              ] + structure_str_list + [
                                  '</table>', '</body>', '</html>'
                              ]

        elapse = time.time() - starttime
        return (structure_str_list, res_loc_final), elapse
def main(args):
    """Run table-structure prediction on every image under ``args.image_dir``.

    Each result and its prediction time are written to the module logger.
    Images that cannot be loaded are reported and skipped.

    Args:
        args: parsed options; ``image_dir`` is read here and ``args`` is
            passed on to ``TableStructurer``.
    """
    image_file_list = get_image_file_list(args.image_dir)
    table_structurer = TableStructurer(args)
    for image_file in image_file_list:
        img, flag = check_and_read_gif(image_file)
        if not flag:
            # Not handled by the GIF reader: fall back to OpenCV.
            img = cv2.imread(image_file)
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        structure_res, elapse = table_structurer(img)

        logger.info("result: {}".format(structure_res))
        logger.info("Predict time of {}: {}".format(image_file, elapse))
# Script entry point: parse the command-line options and run prediction.
if __name__ == "__main__":
    main(utility.parse_args())
ppstructure/table/predict_table.py
deleted
100644 → 0
View file @
85aeae71
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
subprocess
__dir__
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
sys
.
path
.
append
(
__dir__
)
sys
.
path
.
append
(
os
.
path
.
abspath
(
os
.
path
.
join
(
__dir__
,
'..'
)))
sys
.
path
.
append
(
os
.
path
.
abspath
(
os
.
path
.
join
(
__dir__
,
'../..'
)))
os
.
environ
[
"FLAGS_allocator_strategy"
]
=
'auto_growth'
import
cv2
import
copy
import
numpy
as
np
import
time
import
tools.infer.predict_rec
as
predict_rec
import
tools.infer.predict_det
as
predict_det
from
ppocr.utils.utility
import
get_image_file_list
,
check_and_read_gif
from
ppocr.utils.logging
import
get_logger
from
ppstructure.table.matcher
import
distance
,
compute_iou
from
ppstructure.utility
import
parse_args
import
ppstructure.table.predict_structure
as
predict_strture
logger
=
get_logger
()
def expand(pix, det_box, shape):
    """Grow a box by ``pix`` on every side, clamped to the image bounds.

    Args:
        pix: margin added on each side.
        det_box: ``(x0, y0, x1, y1)`` box.
        shape: image shape; only the first two entries (height, width) are
            used, so both colour ``(h, w, c)`` and grayscale ``(h, w)``
            shapes are accepted.

    Returns:
        Tuple ``(x0, y0, x1, y1)`` with the lower bounds clamped to 0 and
        the upper bounds clamped to width / height.
    """
    x0, y0, x1, y1 = det_box
    # Slicing instead of a three-way unpack keeps 2-D shapes working.
    h, w = shape[:2]

    tmp_x0 = x0 - pix
    tmp_x1 = x1 + pix
    tmp_y0 = y0 - pix
    tmp_y1 = y1 + pix

    x0_ = tmp_x0 if tmp_x0 >= 0 else 0
    x1_ = tmp_x1 if tmp_x1 <= w else w
    y0_ = tmp_y0 if tmp_y0 >= 0 else 0
    y1_ = tmp_y1 if tmp_y1 <= h else h
    return x0_, y0_, x1_, y1_
class TableSystem(object):
    """Table recognition: structure prediction + text detection/recognition.

    The structure model yields HTML tokens and cell boxes; detected text
    boxes are assigned to the closest cell and their recognised text is
    inserted into the HTML.
    """

    def __init__(self, args, text_detector=None, text_recognizer=None):
        """Create the sub-models.

        Args:
            args: parsed options forwarded to every sub-model constructor.
            text_detector: optional ready-made detector; a
                ``predict_det.TextDetector`` is built when omitted.
            text_recognizer: optional ready-made recogniser; a
                ``predict_rec.TextRecognizer`` is built when omitted.
        """
        self.text_detector = predict_det.TextDetector(
            args) if text_detector is None else text_detector
        self.text_recognizer = predict_rec.TextRecognizer(
            args) if text_recognizer is None else text_recognizer
        self.table_structurer = predict_strture.TableStructurer(args)

    def __call__(self, img):
        """Return the predicted HTML string for one table image.

        Returns ``None`` when the detector returns no boxes object.
        """
        ori_im = img.copy()
        structure_res, elapse = self.table_structurer(copy.deepcopy(img))
        dt_boxes, elapse = self.text_detector(copy.deepcopy(img))
        # This guard used to sit after dt_boxes had already been sorted and
        # converted, where it could never trigger (a None result crashed
        # earlier instead).
        if dt_boxes is None:
            return None
        dt_boxes = sorted_boxes(dt_boxes)

        # Collapse each detected box to [x_min, y_min, x_max, y_max],
        # padded by one pixel.
        r_boxes = []
        for box in dt_boxes:
            x_min = box[:, 0].min() - 1
            x_max = box[:, 0].max() + 1
            y_min = box[:, 1].min() - 1
            y_max = box[:, 1].max() + 1
            r_boxes.append([x_min, y_min, x_max, y_max])
        dt_boxes = np.array(r_boxes)

        logger.debug("dt_boxes num : {}, elapse : {}".format(
            len(dt_boxes), elapse))

        img_crop_list = []
        for det_box in dt_boxes:
            # Crop with a 2-pixel margin, clamped to the image.
            x0, y0, x1, y1 = expand(2, det_box, ori_im.shape)
            text_rect = ori_im[int(y0):int(y1), int(x0):int(x1), :]
            img_crop_list.append(text_rect)
        rec_res, elapse = self.text_recognizer(img_crop_list)
        logger.debug("rec_res num  : {}, elapse : {}".format(
            len(rec_res), elapse))

        pred_html, pred = self.rebuild_table(structure_res, dt_boxes, rec_res)
        return pred_html

    def rebuild_table(self, structure_res, dt_boxes, rec_res):
        """Combine structure, text boxes and recognised text into HTML.

        Args:
            structure_res: ``(pred_structures, pred_bboxes)`` pair.
            dt_boxes: text boxes as ``[x_min, y_min, x_max, y_max]``.
            rec_res: recognition results, indexed like ``dt_boxes``.

        Returns:
            ``(html_string, html_pieces)`` as produced by ``get_pred_html``.
        """
        pred_structures, pred_bboxes = structure_res
        matched_index = self.match_result(dt_boxes, pred_bboxes)
        pred_html, pred = self.get_pred_html(pred_structures, matched_index,
                                             rec_res)
        return pred_html, pred

    def match_result(self, dt_boxes, pred_bboxes):
        """Assign every text box to its closest predicted cell box.

        Closeness is ranked by ``1 - IoU`` first and by ``distance`` second.

        Returns:
            Dict mapping a cell index to the list of text-box indices
            assigned to it (in ``dt_boxes`` order). Empty when there are no
            predicted cells.
        """
        matched = {}
        # Without predicted cells there is nothing to match against;
        # previously this case raised IndexError.
        if len(pred_bboxes) == 0:
            return matched
        for i, gt_box in enumerate(dt_boxes):
            # Distance and 1 - IoU between this text box and every cell.
            distances = [(distance(gt_box, pred_box),
                          1. - compute_iou(gt_box, pred_box))
                         for pred_box in pred_bboxes]
            # Pick the "closest" cell by (1 - IoU, distance); ties resolve
            # to the lowest cell index.
            best = min(
                range(len(distances)),
                key=lambda j: (distances[j][1], distances[j][0]))
            matched.setdefault(best, []).append(i)
        return matched

    def get_pred_html(self, pred_structures, matched_index, ocr_contents):
        """Insert recognised text into the predicted structure tokens.

        Args:
            pred_structures: sequence of HTML structure tokens.
            matched_index: dict from cell index to text-box indices.
            ocr_contents: recognition results; item ``[0]`` of each entry
                is the text.

        Returns:
            ``(html_string, pieces)`` where ``pieces`` is the list that was
            joined into ``html_string``.
        """
        end_html = []
        td_index = 0
        for tag in pred_structures:
            if '</td>' not in tag:
                end_html.append(tag)
                continue
            if td_index in matched_index:
                cell_matches = matched_index[td_index]
                multi = len(cell_matches) > 1
                # Look for the bold marker in the recognised *text*; the
                # previous check tested membership in the whole recognition
                # entry and therefore never matched a substring.
                b_with = multi and '<b>' in ocr_contents[cell_matches[0]][0]
                if b_with:
                    end_html.extend('<b>')
                for i, rec_index in enumerate(cell_matches):
                    content = ocr_contents[rec_index][0]
                    if multi:
                        if len(content) == 0:
                            continue
                        if content[0] == ' ':
                            content = content[1:]
                        # Drop bold markers wherever they occur instead of
                        # slicing fixed offsets off the ends.
                        content = content.replace('<b>', '').replace(
                            '</b>', '')
                        if len(content) == 0:
                            continue
                        # Separate the pieces of one cell by a single space.
                        if i != len(cell_matches) - 1 and ' ' != content[-1]:
                            content += ' '
                    end_html.extend(content)
                if b_with:
                    end_html.extend('</b>')
            end_html.append(tag)
            td_index += 1
        return ''.join(end_html), end_html
def sorted_boxes(dt_boxes):
    """Sort text boxes from top to bottom, then left to right.

    Boxes are first ordered by the (y, x) of their first point. Boxes whose
    first-point y differs by less than 10 are then treated as one row and
    ordered by x.

    Args:
        dt_boxes (array): detected text boxes; ``shape[0]`` is the number of
            boxes and ``box[0]`` is the (x, y) of a box's first point.

    Returns:
        list of the boxes in reading order.
    """
    num_boxes = dt_boxes.shape[0]
    _boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))

    # Insertion-style pass: keep moving a box left while it shares a row
    # with its left neighbour and starts further left. A single adjacent
    # swap (the previous behaviour) left rows of three or more boxes
    # partially unsorted.
    for i in range(num_boxes - 1):
        for j in range(i, -1, -1):
            same_row = abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10
            if same_row and _boxes[j + 1][0][0] < _boxes[j][0][0]:
                _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
            else:
                break
    return _boxes
def to_excel(html_table, excel_path):
    """Write the HTML table(s) in ``html_table`` to the workbook ``excel_path``."""
    # Imported lazily, only when an export is actually requested.
    from tablepyxl import tablepyxl as html_to_xl
    html_to_xl.document_to_xl(html_table, excel_path)
def main(args):
    """Recognise every table image of this worker's shard and export it.

    The image list is sharded as ``files[process_id::total_process_num]``.
    For each image the predicted HTML is logged and written to an ``.xlsx``
    file named after the image.

    Args:
        args: parsed options; reads ``image_dir``, ``process_id``,
            ``total_process_num``, ``output`` and ``table_output``.
    """
    image_file_list = get_image_file_list(args.image_dir)
    image_file_list = image_file_list[args.process_id::args.total_process_num]
    # NOTE(review): args.output is created here but the workbooks are
    # written under args.table_output - confirm both options are intended.
    os.makedirs(args.output, exist_ok=True)

    text_sys = TableSystem(args)
    img_num = len(image_file_list)
    for i, image_file in enumerate(image_file_list):
        logger.info("[{}/{}] {}".format(i, img_num, image_file))
        img, flag = check_and_read_gif(image_file)
        # splitext only removes the final extension, so names such as
        # "report.v2.png" keep their stem and do not collide on "report".
        stem = os.path.splitext(os.path.basename(image_file))[0]
        excel_path = os.path.join(args.table_output, stem + '.xlsx')
        if not flag:
            # Not handled by the GIF reader: fall back to OpenCV.
            img = cv2.imread(image_file)
        if img is None:
            logger.error("error in loading image:{}".format(image_file))
            continue
        starttime = time.time()
        pred_html = text_sys(img)

        to_excel(pred_html, excel_path)
        logger.info('excel saved to {}'.format(excel_path))
        logger.info(pred_html)
        elapse = time.time() - starttime
        logger.info("Predict time : {:.3f}s".format(elapse))
# Script entry point. With --use_mp the script re-launches itself once per
# worker, passing a distinct --process_id and switching --use_mp off in the
# child, then waits for every worker to finish.
if __name__ == "__main__":
    args = parse_args()
    if not args.use_mp:
        main(args)
    else:
        workers = []
        for process_id in range(args.total_process_num):
            extra_flags = [
                "--process_id={}".format(process_id),
                "--use_mp={}".format(False),
            ]
            cmd = [sys.executable, "-u"] + sys.argv + extra_flags
            # Child stderr is routed to this process's stdout as well.
            workers.append(
                subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout))
        for worker in workers:
            worker.wait()
ppstructure/table/table_metric/__init__.py
deleted
100755 → 0
View file @
85aeae71
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__
=
[
'TEDS'
]
from
.table_metric
import
TEDS
\ No newline at end of file
ppstructure/table/table_metric/parallel.py
deleted
100755 → 0
View file @
85aeae71
from
tqdm
import
tqdm
from
concurrent.futures
import
ProcessPoolExecutor
,
as_completed
def parallel_process(array, function, n_jobs=16, use_kwargs=False,
                     front_num=0):
    """Map ``function`` over ``array`` in worker processes with a progress bar.

    Args:
        array (array-like): items to process; must support slicing.
        function (function): callable applied to every item.
        n_jobs (int, default=16): number of worker processes. With
            ``n_jobs == 1`` everything runs serially in this process.
        use_kwargs (boolean, default=False): when true each item is a dict
            that is expanded into keyword arguments of ``function``.
        front_num (int, default=0): number of leading items that are run
            serially before the parallel job starts. Useful for catching
            bugs early.

    Returns:
        [function(array[0]), function(array[1]), ...]. In the parallel
        path an exception raised by a task is stored in the list in place
        of its result.
    """

    def _apply(item):
        # One call, honouring the kwargs convention.
        return function(**item) if use_kwargs else function(item)

    # Run the first few items serially so that bugs surface immediately.
    front = [_apply(item) for item in array[:front_num]] if front_num > 0 else []

    # A single job degenerates into a plain loop; handy for benchmarking
    # and debugging.
    if n_jobs == 1:
        return front + [_apply(item) for item in tqdm(array[front_num:])]

    # Fan the remaining items out to the worker pool.
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        rest = array[front_num:]
        if use_kwargs:
            futures = [pool.submit(function, **item) for item in rest]
        else:
            futures = [pool.submit(function, item) for item in rest]
        progress_opts = {
            'total': len(futures),
            'unit': 'it',
            'unit_scale': True,
            'leave': True
        }
        # Drain the completion iterator only to drive the progress bar.
        for _ in tqdm(as_completed(futures), **progress_opts):
            pass

    # Collect results in submission order, keeping exceptions as values.
    out = []
    for _, future in tqdm(enumerate(futures)):
        try:
            outcome = future.result()
        except Exception as e:
            outcome = e
        out.append(outcome)
    return front + out
ppstructure/table/table_metric/table_metric.py
deleted
100755 → 0
View file @
85aeae71
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the Apache 2.0 License.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache 2.0 License for more details.
import
distance
from
apted
import
APTED
,
Config
from
apted.helpers
import
Tree
from
lxml
import
etree
,
html
from
collections
import
deque
from
.parallel
import
parallel_process
from
tqdm
import
tqdm
class TableTree(Tree):
    """Tree node carrying an HTML tag plus, for cells, span and content."""

    def __init__(self, tag, colspan=None, rowspan=None, content=None,
                 *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        self.children = list(children)

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag != 'td':
            head = '"tag": %s' % self.tag
        else:
            head = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
                (self.tag, self.colspan, self.rowspan, self.content)
        # Children are rendered recursively and appended in order.
        body = ''.join(child.bracket() for child in self.children)
        return "{{{}}}".format(head + body)
class CustomConfig(Config):
    """Edit-cost configuration comparing cell content by Levenshtein distance."""

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest sequence."""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Return the Levenshtein distance divided by the longest length."""
        raw = float(distance.levenshtein(*sequences))
        return raw / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Return the cost of turning ``node1`` into ``node2``.

        Nodes differing in tag, colspan or rowspan cost 1. Matching ``td``
        nodes cost the normalised distance of their contents when at least
        one has content; everything else costs 0.
        """
        same_shape = (node1.tag == node2.tag and
                      node1.colspan == node2.colspan and
                      node1.rowspan == node2.rowspan)
        if not same_shape:
            return 1.
        if node1.tag == 'td' and (node1.content or node2.content):
            return self.normalized_distance(node1.content, node2.content)
        return 0.
class CustomConfig_del_short(Config):
    """Edit-cost configuration that masks cell content shorter than 3 tokens."""

    @staticmethod
    def maximum(*sequences):
        """Return the length of the longest sequence."""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Return the Levenshtein distance divided by the longest length."""
        raw = float(distance.levenshtein(*sequences))
        return raw / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Return the cost of turning ``node1`` into ``node2``.

        Same rules as a plain content comparison, except that a content
        sequence with fewer than 3 items is replaced by the placeholder
        ``['####']`` before the distance is computed.
        """
        same_shape = (node1.tag == node2.tag and
                      node1.colspan == node2.colspan and
                      node1.rowspan == node2.rowspan)
        if not same_shape:
            return 1.
        if node1.tag == 'td' and (node1.content or node2.content):
            first = node1.content
            second = node2.content
            if len(first) < 3:
                first = ['####']
            if len(second) < 3:
                second = ['####']
            return self.normalized_distance(first, second)
        return 0.
class CustomConfig_del_block(Config):
    """Edit-cost configuration that ignores single-space tokens in cells."""

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value
        """
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1
        """
        return float(distance.levenshtein(*sequences)) / self.maximum(
            *sequences)

    def rename(self, node1, node2):
        """Compares attributes of trees

        Nodes differing in tag, colspan or rowspan cost 1. For matching
        ``td`` nodes with content, the cost is the normalised Levenshtein
        distance of the contents with every ``' '`` token removed.
        """
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (
                node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == 'td':
            if node1.content or node2.content:
                # Filter into new lists: the nodes' own content is left
                # untouched (it used to be stripped in place, together
                # with leftover debug prints).
                node1_content = [tok for tok in node1.content if tok != ' ']
                node2_content = [tok for tok in node2.content if tok != ' ']
                # Both cells held only spaces: identical once blanks are
                # ignored; also avoids dividing by a zero maximum length.
                if not node1_content and not node2_content:
                    return 0.
                return self.normalized_distance(node1_content, node2_content)
        return 0.
class TEDS(object):
    ''' Tree Edit Distance based Similarity
    '''

    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
        ''' Configure the metric.

        structure_only: when true, cell content is ignored.
        n_jobs: number of processes used by the batch methods (>= 1).
        ignore_nodes: optional tag names stripped from both tables before
            comparison.
        '''
        assert isinstance(n_jobs, int) and (
            n_jobs >= 1
        ), 'n_jobs must be an integer greater than or equal to 1'
        self.structure_only = structure_only
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes
        self.__tokens__ = []

    def tokenize(self, node):
        ''' Tokenizes table cells

        Appends to ``self.__tokens__`` the opening tag, every character of
        the node text, the tokens of all children, the closing tag (except
        for ``unk``) and, for non-``td`` nodes, every character of the tail.
        '''
        self.__tokens__.append('<%s>' % node.tag)
        if node.text is not None:
            self.__tokens__ += list(node.text)
        # Iterating an element yields its children; getchildren() is
        # deprecated in lxml.
        for n in node:
            self.tokenize(n)
        if node.tag != 'unk':
            self.__tokens__.append('</%s>' % node.tag)
        if node.tag != 'td' and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def load_html_tree(self, node, parent=None):
        ''' Converts HTML tree to the format required by apted

        Returns the new root when called without ``parent``; otherwise the
        new node is appended to ``parent.children`` and None is returned.
        '''
        if node.tag == 'td':
            if self.structure_only:
                cell = []
            else:
                self.__tokens__ = []
                self.tokenize(node)
                # Drop the enclosing <td> ... </td> tokens.
                cell = self.__tokens__[1:-1].copy()
            new_node = TableTree(node.tag,
                                 int(node.attrib.get('colspan', '1')),
                                 int(node.attrib.get('rowspan', '1')), cell)
        else:
            new_node = TableTree(node.tag, None, None, None)
        if parent is not None:
            parent.children.append(new_node)
        if node.tag != 'td':
            # Cells are leaves of the comparison tree; only descend into
            # other elements.
            for n in node:
                self.load_html_tree(n, new_node)
        if parent is None:
            return new_node

    def evaluate(self, pred, true):
        ''' Computes TEDS score between the prediction and the ground truth of a
            given sample

        Returns 0.0 when either input is empty or lacks a ``body/table``.
        '''
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
        pred_tables = html.fromstring(pred, parser=parser).xpath('body/table')
        true_tables = html.fromstring(true, parser=parser).xpath('body/table')
        if not (pred_tables and true_tables):
            return 0.0

        pred = pred_tables[0]
        true = true_tables[0]
        if self.ignore_nodes:
            etree.strip_tags(pred, *self.ignore_nodes)
            etree.strip_tags(true, *self.ignore_nodes)
        n_nodes_pred = len(pred.xpath(".//*"))
        n_nodes_true = len(true.xpath(".//*"))
        n_nodes = max(n_nodes_pred, n_nodes_true)
        # Two tables without any descendant are structurally identical;
        # returning here also avoids dividing by zero below.
        if n_nodes == 0:
            return 1.0
        tree_pred = self.load_html_tree(pred)
        tree_true = self.load_html_tree(true)
        # Named edit_distance so the imported ``distance`` module is not
        # shadowed.
        edit_distance = APTED(tree_pred, tree_true,
                              CustomConfig()).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)

    def batch_evaluate(self, pred_json, true_json):
        ''' Computes TEDS score between the prediction and the ground truth of
            a batch of samples
            @params pred_json: {'FILENAME': 'HTML CODE', ...}
            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
            @output: {'FILENAME': 'TEDS SCORE', ...}
        '''
        samples = true_json.keys()
        if self.n_jobs == 1:
            # A missing prediction is scored as an empty string.
            scores = [
                self.evaluate(
                    pred_json.get(filename, ''), true_json[filename]['html'])
                for filename in tqdm(samples)
            ]
        else:
            inputs = [{
                'pred': pred_json.get(filename, ''),
                'true': true_json[filename]['html']
            } for filename in samples]
            scores = parallel_process(
                inputs,
                self.evaluate,
                use_kwargs=True,
                n_jobs=self.n_jobs,
                front_num=1)
        scores = dict(zip(samples, scores))
        return scores

    def batch_evaluate_html(self, pred_htmls, true_htmls):
        ''' Computes TEDS score between the prediction and the ground truth of
            a batch of samples

        Inputs are parallel sequences of HTML strings; a list of scores in
        the same order is returned.
        '''
        if self.n_jobs == 1:
            scores = [
                self.evaluate(pred_html, true_html)
                for (pred_html, true_html) in zip(pred_htmls, true_htmls)
            ]
        else:
            inputs = [{
                "pred": pred_html,
                "true": true_html
            } for (pred_html, true_html) in zip(pred_htmls, true_htmls)]
            scores = parallel_process(
                inputs,
                self.evaluate,
                use_kwargs=True,
                n_jobs=self.n_jobs,
                front_num=1)
        return scores
# Demo entry point: score the bundled sample prediction against its ground
# truth with four worker processes and pretty-print the per-file scores.
if __name__ == '__main__':
    import json
    import pprint

    with open('sample_pred.json') as fp:
        pred_json = json.load(fp)
    with open('sample_gt.json') as fp:
        true_json = json.load(fp)

    scores = TEDS(n_jobs=4).batch_evaluate(pred_json, true_json)
    pprint.PrettyPrinter().pprint(scores)
ppstructure/table/tablepyxl/__init__.py
deleted
100644 → 0
View file @
85aeae71
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
ppstructure/table/tablepyxl/style.py
deleted
100644 → 0
View file @
85aeae71
This diff is collapsed.
Click to expand it.
ppstructure/table/tablepyxl/tablepyxl.py
deleted
100644 → 0
View file @
85aeae71
# Do imports like python3 so our package works for 2 and 3
from
__future__
import
absolute_import
from
lxml
import
html
from
openpyxl
import
Workbook
from
openpyxl.utils
import
get_column_letter
from
premailer
import
Premailer
from
tablepyxl.style
import
Table
def string_to_int(s):
    """Return ``int(s)`` when ``s`` consists of digits only, otherwise 0."""
    return int(s) if s.isdigit() else 0
def get_Tables(doc):
    """Parse ``doc`` and return one ``Table`` per ``<table>`` element.

    Comment nodes are dropped from the parsed tree before the tables are
    collected.
    """
    root = html.fromstring(doc)
    for comment_node in root.xpath('//comment()'):
        comment_node.drop_tag()
    table_elements = root.xpath('//table')
    return [Table(element) for element in table_elements]
def write_rows(worksheet, elem, row, column=1):
    """
    Writes every tr child element of elem to a row in the worksheet
    returns the next row after all rows are written

    Args:
        worksheet: target sheet; cells are obtained via ``worksheet.cell``.
        elem: table section whose ``rows`` (and their ``cells``) are written.
        row: index of the first worksheet row to write.
        column: index of the first worksheet column of every row.
    """
    from openpyxl.cell.cell import MergedCell

    initial_column = column
    for table_row in elem.rows:
        for table_cell in table_row.cells:
            cell = worksheet.cell(row=row, column=column)
            # Advance past cells that are already part of a merged range.
            while isinstance(cell, MergedCell):
                column += 1
                cell = worksheet.cell(row=row, column=column)

            # Non-numeric span attributes are read as 0 by string_to_int.
            colspan = string_to_int(table_cell.element.get("colspan", "1"))
            rowspan = string_to_int(table_cell.element.get("rowspan", "1"))
            if rowspan > 1 or colspan > 1:
                worksheet.merge_cells(
                    start_row=row,
                    start_column=column,
                    end_row=row + rowspan - 1,
                    end_column=column + colspan - 1)

            cell.value = table_cell.value
            table_cell.format(cell)
            min_width = table_cell.get_dimension('min-width')
            max_width = table_cell.get_dimension('max-width')

            # Column width is only adjusted for cells spanning one column.
            if colspan == 1:
                # Initially, when iterating for the first time through the loop, the width of all the cells is None.
                # As we start filling in contents, the initial width of the cell (which can be retrieved by:
                # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
                # cell in the same column (i.e. width of A2 = width of A1)
                width = max(
                    worksheet.column_dimensions[get_column_letter(column)]
                    .width or 0, len(table_cell.value) + 2)
                # Clamp to the cell's max-width / min-width when given.
                if max_width and width > max_width:
                    width = max_width
                elif min_width and width < min_width:
                    width = min_width
                worksheet.column_dimensions[get_column_letter(
                    column)].width = width
            column += colspan
        # Next table row: move down one row and back to the first column.
        row += 1
        column = initial_column
    return row
def table_to_sheet(table, wb):
    """
    Writes ``table`` to a newly created sheet of workbook ``wb``.
    The sheet is titled with the table element's ``name`` attribute and the
    table is inserted at column 1, row 1.
    """
    sheet_title = table.element.get('name')
    new_sheet = wb.create_sheet(title=sheet_title)
    insert_table(table, new_sheet, 1, 1)
def document_to_workbook(doc, wb=None, base_url=None):
    """
    Converts an html document given as a string into a workbook holding one
    sheet per table of the document.
    When no workbook is passed a new one is created and its initial active
    sheet is removed. The workbook is returned.
    """
    if not wb:
        wb = Workbook()
        wb.remove(wb.active)

    # Inline the document's CSS so that styles sit on the elements.
    premailer = Premailer(doc, base_url=base_url, remove_classes=False)
    inlined_doc = premailer.transform()

    for table in get_Tables(inlined_doc):
        table_to_sheet(table, wb)
    return wb
def document_to_xl(doc, filename, base_url=None):
    """
    Converts an html document given as a string into a workbook with one
    sheet per table and saves that workbook to the file ``filename``.
    """
    workbook = document_to_workbook(doc, base_url=base_url)
    workbook.save(filename)
def insert_table(table, worksheet, column, row):
    """
    Writes the head rows of ``table`` and then its body rows to ``worksheet``,
    starting at ``column``/``row``. A missing (falsy) section is skipped.
    """
    next_row = row
    if table.head:
        next_row = write_rows(worksheet, table.head, next_row, column)
    if table.body:
        # The body continues on the row following the last head row.
        next_row = write_rows(worksheet, table.body, next_row, column)
def insert_table_at_cell(table, cell):
    """
    Inserts ``table`` into the worksheet owning ``cell`` (``cell.parent``),
    starting at that cell's column and row.
    """
    insert_table(table, cell.parent, cell.column, cell.row)
\ No newline at end of file
ppstructure/utility.py
deleted
100644 → 0
View file @
85aeae71
This diff is collapsed.
Click to expand it.
tools/infer/predict_det.py
View file @
3302a0b1
...
...
@@ -43,7 +43,7 @@ class TextDetector(object):
pre_process_list
=
[{
'DetResizeForTest'
:
{
'limit_side_len'
:
args
.
det_limit_side_len
,
'limit_type'
:
args
.
det_limit_type
,
'limit_type'
:
args
.
det_limit_type
}
},
{
'NormalizeImage'
:
{
...
...
tools/infer/predict_system.py
View file @
3302a0b1
This diff is collapsed.
Click to expand it.
tools/infer/utility.py
View file @
3302a0b1
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment