"...resnet50_tensorflow.git" did not exist on "d4bc61609bb9263d6bd891166437853da8917320"
Commit 8f2d5153 authored by Geewook Kim's avatar Geewook Kim
Browse files

refac: SynthDoG and README.md

parent ad037f89
...@@ -15,12 +15,12 @@ SynthDoG is synthetic document generator for visual document understanding (VDU) ...@@ -15,12 +15,12 @@ SynthDoG is synthetic document generator for visual document understanding (VDU)
# Set environment variable (for macOS) # Set environment variable (for macOS)
$ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES $ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
synthtiger -o {dataset_path}/SynthDoG_en -c 100 -w 4 -v template.py SynthDog config_en.yaml synthtiger -o ./outputs/SynthDoG_en -c 50 -w 4 -v template.py SynthDoG config_en.yaml
{'config': 'config_en.yaml', {'config': 'config_en.yaml',
'count': 100, 'count': 50,
'name': 'SynthDog', 'name': 'SynthDoG',
'output': 'outputs/SynthDoG_en', 'output': './outputs/SynthDoG_en',
'script': 'template.py', 'script': 'template.py',
'verbose': True, 'verbose': True,
'worker': 4} 'worker': 4}
...@@ -34,22 +34,29 @@ Generated 2 data ...@@ -34,22 +34,29 @@ Generated 2 data
Generated 3 data Generated 3 data
. .
. .
Generated 99 data Generated 49 data
Generated 100 data Generated 50 data
108.74 seconds elapsed 46.32 seconds elapsed
``` ```
Some important arguments:
- `-o`: path of the directory in which to save the generated data.
- `-c`: number of samples to generate.
- `-w`: number of workers.
- `-v`: print error messages.
To generate ECJK samples: To generate ECJK samples:
```bash ```bash
# english # english
synthtiger -o {dataset_path}/synthdog-en -w 4 -v template.py SynthDoG config_en.yaml synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_en.yaml
# chinese # chinese
synthtiger -o {dataset_path}/synthdog-zh -w 4 -v template.py SynthDoG config_zh.yaml synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_zh.yaml
# japanese # japanese
synthtiger -o {dataset_path}/synthdog-ja -w 4 -v template.py SynthDoG config_ja.yaml synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ja.yaml
# korean # korean
synthtiger -o {dataset_path}/synthdog-ko -w 4 -v template.py SynthDoG config_ko.yaml synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ko.yaml
``` ```
...@@ -3,12 +3,70 @@ Donut ...@@ -3,12 +3,70 @@ Donut
Copyright (c) 2022-present NAVER Corp. Copyright (c) 2022-present NAVER Corp.
MIT License MIT License
""" """
from collections import OrderedDict
import numpy as np import numpy as np
from synthtiger import components from synthtiger import components
from elements.textbox import TextBox from elements.textbox import TextBox
from layouts import GridStack from layouts import GridStack
from utils import TextReader
class TextReader:
    """Cyclic, random-access character reader over a UTF-8 text file.

    The file is scanned once at construction time in blocks of
    ``block_size`` characters; the stream position at the start of every
    block is remembered so that a block can be re-read later with a single
    seek. Blocks that have been read are kept in a least-recently-used
    cache, so the whole file never has to be held in memory.

    The reader keeps a cursor (``idx``) that wraps around at both ends of
    the file, which makes iteration endless: ``next(reader)`` never raises
    ``StopIteration``.

    For an empty file ``len(reader)`` is 0; ``get()`` then raises
    ``IndexError`` and ``next()`` / ``prev()`` raise ``ZeroDivisionError``.

    Args:
        path: Path of the text file to read (decoded as UTF-8).
        cache_size: Cache budget, in the same unit as ``block_size``
            (characters).
        block_size: Number of characters per cached block.
    """

    def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
        self.fp = open(path, "r", encoding="utf-8")
        self.length = 0
        self.offsets = [0]
        self.cache = OrderedDict()
        self.cache_size = cache_size
        self.block_size = block_size
        # Always keep room for at least one block. With a budget smaller
        # than a single block the quotient is 0, and get() would then try
        # to evict from an empty cache and fail with KeyError.
        self.bucket_size = max(1, cache_size // block_size)
        self.idx = 0

        try:
            # Index pass: count the characters and record where each block
            # starts. In text mode tell() returns an opaque cookie that is
            # only meaningful to seek(), which is all get() needs.
            while True:
                text = self.fp.read(self.block_size)
                if not text:
                    break
                self.length += len(text)
                self.offsets.append(self.fp.tell())
        except BaseException:
            # Do not leak the handle when the scan fails (e.g. bad UTF-8).
            self.fp.close()
            raise

    def __len__(self):
        """Return the total number of characters in the file."""
        return self.length

    def __iter__(self):
        """Return the reader itself; iteration is cyclic and never ends."""
        return self

    def __next__(self):
        """Return the character under the cursor and advance the cursor."""
        char = self.get()
        self.next()
        return char

    def __enter__(self):
        """Allow ``with TextReader(path) as reader:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when the ``with`` block ends."""
        self.close()

    def close(self):
        """Close the underlying file; blocks already cached stay readable."""
        self.fp.close()

    def move(self, idx):
        """Place the cursor at character index ``idx`` (not range-checked)."""
        self.idx = idx

    def next(self):
        """Advance the cursor by one character, wrapping to the start."""
        self.idx = (self.idx + 1) % self.length

    def prev(self):
        """Move the cursor back by one character, wrapping to the end."""
        self.idx = (self.idx - 1) % self.length

    def get(self):
        """Return the character under the cursor without moving it.

        The block containing the cursor is served from the cache when
        possible; otherwise it is read from disk, evicting the least
        recently used block first if the cache is full.
        """
        key = self.idx // self.block_size

        if key in self.cache:
            text = self.cache[key]
        else:
            if len(self.cache) >= self.bucket_size:
                # last=False pops the oldest entry, i.e. the LRU block.
                self.cache.popitem(last=False)

            self.fp.seek(self.offsets[key], 0)
            text = self.fp.read(self.block_size)
            self.cache[key] = text

        # Mark the block as most recently used.
        self.cache.move_to_end(key)
        return text[self.idx % self.block_size]
class Content: class Content:
......
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
from utils.text_reader import TextReader
__all__ = ["TextReader"]
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
from collections import OrderedDict
class TextReader:
    """Cyclic, random-access character reader over a UTF-8 text file.

    The file is scanned once at construction time in blocks of
    ``block_size`` characters; the stream position at the start of every
    block is remembered so that a block can be re-read later with a single
    seek. Blocks that have been read are kept in a least-recently-used
    cache, so the whole file never has to be held in memory.

    The reader keeps a cursor (``idx``) that wraps around at both ends of
    the file, which makes iteration endless: ``next(reader)`` never raises
    ``StopIteration``.

    For an empty file ``len(reader)`` is 0; ``get()`` then raises
    ``IndexError`` and ``next()`` / ``prev()`` raise ``ZeroDivisionError``.

    Args:
        path: Path of the text file to read (decoded as UTF-8).
        cache_size: Cache budget, in the same unit as ``block_size``
            (characters).
        block_size: Number of characters per cached block.
    """

    def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
        self.fp = open(path, "r", encoding="utf-8")
        self.length = 0
        self.offsets = [0]
        self.cache = OrderedDict()
        self.cache_size = cache_size
        self.block_size = block_size
        # Always keep room for at least one block. With a budget smaller
        # than a single block the quotient is 0, and get() would then try
        # to evict from an empty cache and fail with KeyError.
        self.bucket_size = max(1, cache_size // block_size)
        self.idx = 0

        try:
            # Index pass: count the characters and record where each block
            # starts. In text mode tell() returns an opaque cookie that is
            # only meaningful to seek(), which is all get() needs.
            while True:
                text = self.fp.read(self.block_size)
                if not text:
                    break
                self.length += len(text)
                self.offsets.append(self.fp.tell())
        except BaseException:
            # Do not leak the handle when the scan fails (e.g. bad UTF-8).
            self.fp.close()
            raise

    def __len__(self):
        """Return the total number of characters in the file."""
        return self.length

    def __iter__(self):
        """Return the reader itself; iteration is cyclic and never ends."""
        return self

    def __next__(self):
        """Return the character under the cursor and advance the cursor."""
        char = self.get()
        self.next()
        return char

    def __enter__(self):
        """Allow ``with TextReader(path) as reader:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when the ``with`` block ends."""
        self.close()

    def close(self):
        """Close the underlying file; blocks already cached stay readable."""
        self.fp.close()

    def move(self, idx):
        """Place the cursor at character index ``idx`` (not range-checked)."""
        self.idx = idx

    def next(self):
        """Advance the cursor by one character, wrapping to the start."""
        self.idx = (self.idx + 1) % self.length

    def prev(self):
        """Move the cursor back by one character, wrapping to the end."""
        self.idx = (self.idx - 1) % self.length

    def get(self):
        """Return the character under the cursor without moving it.

        The block containing the cursor is served from the cache when
        possible; otherwise it is read from disk, evicting the least
        recently used block first if the cache is full.
        """
        key = self.idx // self.block_size

        if key in self.cache:
            text = self.cache[key]
        else:
            if len(self.cache) >= self.bucket_size:
                # last=False pops the oldest entry, i.e. the LRU block.
                self.cache.popitem(last=False)

            self.fp.seek(self.offsets[key], 0)
            text = self.fp.read(self.block_size)
            self.cache[key] = text

        # Mark the block as most recently used.
        self.cache.move_to_end(key)
        return text[self.idx % self.block_size]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment