Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
donut_pytorch
Commits
8f2d5153
Commit
8f2d5153
authored
Aug 24, 2022
by
Geewook Kim
Browse files
refac: SynthDoG and README.md
parent
ad037f89
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
77 additions
and
83 deletions
+77
-83
synthdog/README.md
synthdog/README.md
+18
-11
synthdog/elements/content.py
synthdog/elements/content.py
+59
-1
synthdog/utils/__init__.py
synthdog/utils/__init__.py
+0
-8
synthdog/utils/text_reader.py
synthdog/utils/text_reader.py
+0
-63
No files found.
synthdog/README.md
View file @
8f2d5153
...
@@ -15,12 +15,12 @@ SynthDoG is synthetic document generator for visual document understanding (VDU)
...
@@ -15,12 +15,12 @@ SynthDoG is synthetic document generator for visual document understanding (VDU)
# Set environment variable (for macOS)
# Set environment variable (for macOS)
$ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
$ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
synthtiger -o {dataset_path}/SynthDoG_en -c 100 -w 4 -v template.py SynthDog config_en.yaml
synthtiger -o ./outputs/SynthDoG_en -c 50 -w 4 -v template.py SynthDoG config_en.yaml
{'config': 'config_en.yaml',
{'config': 'config_en.yaml',
'count': 100,
'count': 50,
'name': 'SynthDog',
'name': 'SynthDoG',
'output': 'outputs/SynthDoG_en',
'output': './outputs/SynthDoG_en',
'script': 'template.py',
'script': 'template.py',
'verbose': True,
'verbose': True,
'worker': 4}
'worker': 4}
...
@@ -34,22 +34,29 @@ Generated 2 data
...
@@ -34,22 +34,29 @@ Generated 2 data
Generated 3 data
Generated 3 data
.
.
.
.
Generated 99 data
Generated 49 data
Generated 100 data
Generated 50 data
108.74 seconds elapsed
46.32 seconds elapsed
```
```
Some important arguments:
- `-o` : directory path to save data.
- `-c` : number of data to generate.
- `-w` : number of workers.
- `-v` : print error messages.
To generate ECJK samples:
To generate ECJK samples:
```bash
```bash
# english
# english
synthtiger -o {dataset_path}/synthdog-en -w 4 -v template.py SynthDoG config_en.yaml
synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_en.yaml
# chinese
# chinese
synthtiger -o {dataset_path}/synthdog-zh -w 4 -v template.py SynthDoG config_zh.yaml
synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_zh.yaml
# japanese
# japanese
synthtiger -o {dataset_path}/synthdog-ja -w 4 -v template.py SynthDoG config_ja.yaml
synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ja.yaml
# korean
# korean
synthtiger -o {dataset_path}/synthdog-ko -w 4 -v template.py SynthDoG config_ko.yaml
synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ko.yaml
```
```
synthdog/elements/content.py
View file @
8f2d5153
...
@@ -3,12 +3,70 @@ Donut
...
@@ -3,12 +3,70 @@ Donut
Copyright (c) 2022-present NAVER Corp.
Copyright (c) 2022-present NAVER Corp.
MIT License
MIT License
"""
"""
from
collections
import
OrderedDict
import
numpy
as
np
import
numpy
as
np
from
synthtiger
import
components
from
synthtiger
import
components
from
elements.textbox
import
TextBox
from
elements.textbox
import
TextBox
from
layouts
import
GridStack
from
layouts
import
GridStack
from
utils
import
TextReader
class TextReader:
    """Cyclic, random-access character reader over a UTF-8 text file.

    The file is scanned once at construction time in blocks of
    ``block_size`` characters.  For every block the stream position
    reported by ``tell()`` is remembered, so any block can later be
    re-read with a single ``seek()``.  Recently used blocks are kept in
    an LRU cache holding at most ``bucket_size`` blocks.

    The reader keeps a cursor (``idx``).  ``next()`` and ``prev()`` move
    it by one character and wrap around at both ends of the text, so
    iterating over the reader never raises ``StopIteration``.

    The underlying file stays open for the lifetime of the reader; call
    ``close()`` or use the reader as a context manager to release it.

    Args:
        path: Path of the text file, opened with UTF-8 encoding.
        cache_size: Cache budget in characters.
        block_size: Number of characters per block.
    """

    def __init__(self, path, cache_size=2**28, block_size=2**20):
        self.fp = open(path, "r", encoding="utf-8")
        self.length = 0
        self.offsets = [0]
        self.cache = OrderedDict()
        self.cache_size = cache_size
        self.block_size = block_size
        # Always leave room for at least one block: with a bucket size of 0
        # the eviction in ``get`` would pop from an empty cache (KeyError).
        self.bucket_size = max(1, cache_size // block_size)
        self.idx = 0

        try:
            # Index pass: count the characters and record where each block
            # starts so that ``get`` can seek straight to it later.
            while True:
                text = self.fp.read(self.block_size)
                if not text:
                    break
                self.length += len(text)
                self.offsets.append(self.fp.tell())
        except Exception:
            # Do not leak the handle if the scan fails (e.g. invalid UTF-8).
            self.fp.close()
            raise

    def __len__(self):
        """Return the number of characters in the file."""
        return self.length

    def __iter__(self):
        """Return the reader itself; iteration is cyclic and never ends."""
        return self

    def __next__(self):
        """Return the character under the cursor, then advance the cursor."""
        char = self.get()
        self.next()
        return char

    def __enter__(self):
        """Return the reader so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying file.

        Blocks that are already cached stay readable; fetching any other
        block afterwards fails because the file is closed.
        """
        self.fp.close()

    def move(self, idx):
        """Place the cursor at character position ``idx`` (not validated here)."""
        self.idx = idx

    def next(self):
        """Advance the cursor by one character, wrapping to the start."""
        self.idx = (self.idx + 1) % self.length

    def prev(self):
        """Move the cursor back by one character, wrapping to the end."""
        self.idx = (self.idx - 1) % self.length

    def get(self):
        """Return the character under the cursor without moving it.

        Raises:
            IndexError: If the cursor is outside ``[0, len(self))``; this
                includes every position of an empty file.
        """
        # Reject bad positions up front.  Without this check a negative
        # cursor would index ``offsets`` from the end and could silently
        # return an unrelated character.
        if not 0 <= self.idx < self.length:
            raise IndexError(
                "position {} is out of range for a text of length {}".format(
                    self.idx, self.length
                )
            )

        key, pos = divmod(self.idx, self.block_size)
        text = self.cache.get(key)
        if text is None:
            # Cache miss: evict the least recently used block if the cache
            # is full, then load the requested block from disk.
            if len(self.cache) >= self.bucket_size:
                self.cache.popitem(last=False)
            self.fp.seek(self.offsets[key], 0)
            text = self.fp.read(self.block_size)
            self.cache[key] = text

        # Mark the block as most recently used.
        self.cache.move_to_end(key)
        return text[pos]
class
Content
:
class
Content
:
...
...
synthdog/utils/__init__.py
deleted
100755 → 0
View file @
ad037f89
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
from
utils.text_reader
import
TextReader
__all__
=
[
"TextReader"
]
synthdog/utils/text_reader.py
deleted
100755 → 0
View file @
ad037f89
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
from
collections
import
OrderedDict
class TextReader:
    """Cyclic, random-access character reader over a UTF-8 text file.

    The file is scanned once at construction time in blocks of
    ``block_size`` characters.  For every block the stream position
    reported by ``tell()`` is remembered, so any block can later be
    re-read with a single ``seek()``.  Recently used blocks are kept in
    an LRU cache holding at most ``bucket_size`` blocks.

    The reader keeps a cursor (``idx``).  ``next()`` and ``prev()`` move
    it by one character and wrap around at both ends of the text, so
    iterating over the reader never raises ``StopIteration``.

    The underlying file stays open for the lifetime of the reader; call
    ``close()`` or use the reader as a context manager to release it.

    Args:
        path: Path of the text file, opened with UTF-8 encoding.
        cache_size: Cache budget in characters.
        block_size: Number of characters per block.
    """

    def __init__(self, path, cache_size=2**28, block_size=2**20):
        self.fp = open(path, "r", encoding="utf-8")
        self.length = 0
        self.offsets = [0]
        self.cache = OrderedDict()
        self.cache_size = cache_size
        self.block_size = block_size
        # Always leave room for at least one block: with a bucket size of 0
        # the eviction in ``get`` would pop from an empty cache (KeyError).
        self.bucket_size = max(1, cache_size // block_size)
        self.idx = 0

        try:
            # Index pass: count the characters and record where each block
            # starts so that ``get`` can seek straight to it later.
            while True:
                text = self.fp.read(self.block_size)
                if not text:
                    break
                self.length += len(text)
                self.offsets.append(self.fp.tell())
        except Exception:
            # Do not leak the handle if the scan fails (e.g. invalid UTF-8).
            self.fp.close()
            raise

    def __len__(self):
        """Return the number of characters in the file."""
        return self.length

    def __iter__(self):
        """Return the reader itself; iteration is cyclic and never ends."""
        return self

    def __next__(self):
        """Return the character under the cursor, then advance the cursor."""
        char = self.get()
        self.next()
        return char

    def __enter__(self):
        """Return the reader so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying file.

        Blocks that are already cached stay readable; fetching any other
        block afterwards fails because the file is closed.
        """
        self.fp.close()

    def move(self, idx):
        """Place the cursor at character position ``idx`` (not validated here)."""
        self.idx = idx

    def next(self):
        """Advance the cursor by one character, wrapping to the start."""
        self.idx = (self.idx + 1) % self.length

    def prev(self):
        """Move the cursor back by one character, wrapping to the end."""
        self.idx = (self.idx - 1) % self.length

    def get(self):
        """Return the character under the cursor without moving it.

        Raises:
            IndexError: If the cursor is outside ``[0, len(self))``; this
                includes every position of an empty file.
        """
        # Reject bad positions up front.  Without this check a negative
        # cursor would index ``offsets`` from the end and could silently
        # return an unrelated character.
        if not 0 <= self.idx < self.length:
            raise IndexError(
                "position {} is out of range for a text of length {}".format(
                    self.idx, self.length
                )
            )

        key, pos = divmod(self.idx, self.block_size)
        text = self.cache.get(key)
        if text is None:
            # Cache miss: evict the least recently used block if the cache
            # is full, then load the requested block from disk.
            if len(self.cache) >= self.bucket_size:
                self.cache.popitem(last=False)
            self.fp.seek(self.offsets[key], 0)
            text = self.fp.read(self.block_size)
            self.cache[key] = text

        # Mark the block as most recently used.
        self.cache.move_to_end(key)
        return text[pos]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment