Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
crnn_pytorch
Commits
cb0dff28
Commit
cb0dff28
authored
Oct 09, 2024
by
dengjb
Browse files
Add create_datasets.py
parent
dc8c4458
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
97 additions
and
0 deletions
+97
-0
create_dataset.py
create_dataset.py
+97
-0
No files found.
create_dataset.py
0 → 100644
View file @
cb0dff28
import
os
import
lmdb
# install lmdb by "pip install lmdb"
import
cv2
import
numpy
as
np
def checkImageIsValid(imageBin):
    """Return True if ``imageBin`` decodes to an image with non-zero area.

    ARGS:
        imageBin : raw encoded image bytes (e.g. the content of a .jpg file),
                   or None

    RETURNS:
        False when the input is None or empty, when OpenCV cannot decode it,
        or when the decoded image has zero height or width; True otherwise.
    """
    # Nothing to decode: avoid handing an empty buffer to OpenCV.
    if imageBin is None or len(imageBin) == 0:
        return False

    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    try:
        img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    except cv2.error:
        # Only decoder failures mean "invalid image"; anything else
        # (KeyboardInterrupt, programming errors) should propagate.
        return False

    # imdecode signals undecodable data by returning None instead of raising.
    if img is None:
        return False

    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
def writeCache(env, cache):
    """Store every entry of ``cache`` in ``env`` inside one write transaction.

    ARGS:
        env   : object whose ``begin(write=True)`` returns a context manager
                yielding a transaction with a ``put(key, value)`` method
        cache : dict mapping str keys to the values to store; each key is
                encoded with ``str.encode()`` and the value is passed to
                ``put`` unchanged
    """
    transaction = env.begin(write=True)
    with transaction as txn:
        for key in cache:
            txn.put(key.encode(), cache[key])
def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create LMDB dataset for CRNN training.

    Samples are stored under the keys 'image-%09d', 'label-%09d' and
    (optionally) 'lexicon-%09d', numbered from 1 in the order they are
    accepted. Images that are missing on disk or fail validation are
    reported and skipped, so the stored numbering has no gaps. The final
    count is stored under 'num-samples'.

    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image path
        labelList     : list of corresponding groundtruth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if true, check the validity of every image
    """
    assert (len(imagePathList) == len(labelList))
    nSamples = len(imagePathList)
    env = lmdb.open(outputPath, map_size=1099511627776)
    try:
        cache = {}
        cnt = 1  # 1-based index of the next sample to be stored
        for i, (imagePath, label) in enumerate(zip(imagePathList, labelList)):
            if not os.path.exists(imagePath):
                print('%s does not exist' % imagePath)
                continue
            with open(imagePath, 'rb') as f:
                imageBin = f.read()
            if checkValid and not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue

            imageKey = 'image-%09d' % cnt
            labelKey = 'label-%09d' % cnt
            cache[imageKey] = imageBin
            cache[labelKey] = label.encode()
            if lexiconList:
                # The lexicon is indexed by input position, not by stored index.
                lexiconKey = 'lexicon-%09d' % cnt
                cache[lexiconKey] = ' '.join(lexiconList[i]).encode()
            # Flush every 1000 stored samples to bound memory use.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
                print('Written %d / %d' % (cnt, nSamples))
            cnt += 1

        # Record how many samples were actually stored and flush the remainder.
        nSamples = cnt - 1
        cache['num-samples'] = str(nSamples).encode()
        writeCache(env, cache)
    finally:
        # Release the environment even if reading or writing fails part-way.
        env.close()
    print('Created dataset with %d samples' % nSamples)
def parse_labels(path, image_root="./90kDICT32px/"):
    """Parse an annotation file into parallel lists of labels and image paths.

    Each non-blank line must hold exactly two space-separated fields:
    ``<image> <label>``. The first character of ``<image>`` is dropped and
    the remainder is appended to ``image_root``.

    ARGS:
        path       : path of the annotation text file
        image_root : prefix prepended to every image entry

    RETURNS:
        (labels, image_path) : two lists of equal length, in file order

    RAISES:
        ValueError : if a non-blank line does not have exactly two fields
    """
    labels, image_path = [], []
    with open(path, 'r') as f:
        for lineno, raw in enumerate(f, start=1):
            # Drop the line terminator (LF or CRLF) so it never leaks into the label.
            line = raw.rstrip("\r\n")
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            fields = line.split(' ')
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected '<image> <label>', got %r" % (path, lineno, line)
                )
            image, label = fields
            labels.append(label)
            # image[1:] removes the first character of the entry before joining.
            image_path.append(image_root + image[1:])
    return labels, image_path
if __name__ == '__main__':
    # Labels and image paths come from the training annotation file.
    train_labels, train_images = parse_labels("90kDICT32px/annotation_train.txt")

    # NOTE(review): the lexicon is read here but never passed to
    # createDataset below -- confirm whether it should be used.
    with open("90kDICT32px/lexicon.txt", 'r') as lexicon_file:
        lexicon_list = [entry.strip("\n") for entry in lexicon_file.readlines()]

    print("=" * 50)
    createDataset("./output_dataset", train_images, train_labels)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment