refac: SynthDoG and README.md

8f2d5153 · Geewook Kim · ad037f89 · 8f2d5153 · 8f2d5153 · ad037f89
Commit 8f2d5153 authored Aug 24, 2022 by Geewook Kim
4 changed files
--- a/synthdog/README.md
+++ b/synthdog/README.md
@@ -15,12 +15,12 @@ SynthDoG is synthetic document generator for visual document understanding (VDU)
 # Set environment variable (for macOS)
 $ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
-synthtiger -o {dataset_path}/SynthDoG_en -c 100 -w 4 -v template.py SynthDog config_en.yaml
+synthtiger -o ./outputs/SynthDoG_en -c 50 -w 4 -v template.py SynthDoG config_en.yaml
 {'config': 'config_en.yaml',
- 'count': 100,
+ 'count': 50,
- 'name': 'SynthDog',
+ 'name': 'SynthDoG',
- 'output': 'outputs/SynthDoG_en',
+ 'output': './outputs/SynthDoG_en',
 'script': 'template.py',
 'verbose': True,
 'worker': 4}
@@ -34,22 +34,29 @@ Generated 2 data
 Generated 3 data
     .
     .
-Generated 99 data
+Generated 49 data
-Generated 100 data
+Generated 50 data
-108.74 seconds elapsed
+46.32 seconds elapsed
 ```
+Some important arguments:
+- `-o`: path of the directory where the generated data is saved.
+- `-c`: number of data samples to generate.
+- `-w`: number of workers.
+- `-v`: print error messages.
 To generate ECJK (English, Chinese, Japanese and Korean) samples:
 ```bash
 # english
-synthtiger -o {dataset_path}/synthdog-en -w 4 -v template.py SynthDoG config_en.yaml
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_en.yaml
 # chinese
-synthtiger -o {dataset_path}/synthdog-zh -w 4 -v template.py SynthDoG config_zh.yaml
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_zh.yaml
 # japanese
-synthtiger -o {dataset_path}/synthdog-ja -w 4 -v template.py SynthDoG config_ja.yaml
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ja.yaml
 # korean
-synthtiger -o {dataset_path}/synthdog-ko -w 4 -v template.py SynthDoG config_ko.yaml
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ko.yaml
 ```
--- a/synthdog/elements/content.py
+++ b/synthdog/elements/content.py
@@ -3,12 +3,70 @@ Donut
 Copyright (c) 2022-present NAVER Corp.
 MIT License
 """
+from collections import OrderedDict
 import numpy as np
 from synthtiger import components
 from elements.textbox import TextBox
 from layouts import GridStack
-from utils import TextReader
+# Character-level reader over a UTF-8 text file.
+#
+# The file is scanned once at construction time to record the total number of
+# characters and a seek position for every block of `block_size` characters.
+# Individual characters are then served from an in-memory cache of blocks, with
+# the least recently used block evicted first. The cursor (`idx`) wraps around
+# at both ends, so iterating over a reader never stops on its own.
+#
+# NOTE(review): no method in this class closes `self.fp`; the handle stays open
+# for as long as the instance exists.
+class TextReader:
+    # path:       text file to read (opened read-only, decoded as UTF-8).
+    # cache_size: used together with block_size to derive how many blocks may
+    #             be cached at once (cache_size // block_size).
+    # block_size: number of characters requested per read, i.e. per block.
+    def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
+        self.fp = open(path, "r", encoding="utf-8")
+        # Total number of characters in the file, accumulated by the scan below.
+        self.length = 0
+        # offsets[k] is the file position at which block k starts.
+        self.offsets = [0]
+        # Maps block index -> block text; insertion order tracks recency of use.
+        self.cache = OrderedDict()
+        self.cache_size = cache_size
+        self.block_size = block_size
+        # Maximum number of blocks kept in the cache.
+        # NOTE(review): this is 0 when cache_size < block_size; get() would then
+        # call popitem() on an empty cache, which raises KeyError - confirm
+        # callers never pass such values.
+        self.bucket_size = cache_size // block_size
+        # Current cursor position, as a character index.
+        self.idx = 0
+        # Scan the whole file once, block by block, to fill in length/offsets.
+        while True:
+            text = self.fp.read(self.block_size)
+            if not text:
+                break
+            self.length += len(text)
+            # Position right after this block == start of the next block. The
+            # final entry therefore points at end-of-file.
+            self.offsets.append(self.fp.tell())
+    # Number of characters counted during the initial scan.
+    def __len__(self):
+        return self.length
+    # The reader is its own iterator.
+    def __iter__(self):
+        return self
+    # Return the character under the cursor, then advance the cursor by one.
+    # Never raises StopIteration: next() wraps back to the start of the file.
+    def __next__(self):
+        char = self.get()
+        self.next()
+        return char
+    # Place the cursor at character index `idx`. No bounds checking is done.
+    def move(self, idx):
+        self.idx = idx
+    # Advance the cursor by one character, wrapping to 0 after the last one.
+    # NOTE(review): divides by self.length, so an empty file (length == 0)
+    # raises ZeroDivisionError here and in prev().
+    def next(self):
+        self.idx = (self.idx + 1) % self.length
+    # Step the cursor back by one character, wrapping to the last one from 0.
+    def prev(self):
+        self.idx = (self.idx - 1) % self.length
+    # Return the character under the cursor without moving it.
+    def get(self):
+        # Index of the block that contains the cursor.
+        key = self.idx // self.block_size
+        if key in self.cache:
+            text = self.cache[key]
+        else:
+            # Cache is full: drop the entry at the front of the ordering, which
+            # is the least recently used block (see move_to_end below).
+            if len(self.cache) >= self.bucket_size:
+                self.cache.popitem(last=False)
+            # Load the block from disk using the position recorded in __init__.
+            offset = self.offsets[key]
+            self.fp.seek(offset, 0)
+            text = self.fp.read(self.block_size)
+            self.cache[key] = text
+        # Mark this block as the most recently used one.
+        self.cache.move_to_end(key)
+        # Position of the cursor inside the block.
+        char = text[self.idx % self.block_size]
+        return char
 class Content:

--- a/synthdog/utils/__init__.py
+++ b/synthdog/utils/__init__.py
-"""
-Donut
-Copyright (c) 2022-present NAVER Corp.
-MIT License
-"""
-from utils.text_reader import TextReader
-__all__ = ["TextReader"]
--- a/synthdog/utils/text_reader.py
+++ b/synthdog/utils/text_reader.py
-"""
-Donut
-Copyright (c) 2022-present NAVER Corp.
-MIT License
-"""
-from collections import OrderedDict
-class TextReader:
-    def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
-        self.fp = open(path, "r", encoding="utf-8")
-        self.length = 0
-        self.offsets = [0]
-        self.cache = OrderedDict()
-        self.cache_size = cache_size
-        self.block_size = block_size
-        self.bucket_size = cache_size // block_size
-        self.idx = 0
-        while True:
-            text = self.fp.read(self.block_size)
-            if not text:
-                break
-            self.length += len(text)
-            self.offsets.append(self.fp.tell())
-    def __len__(self):
-        return self.length
-    def __iter__(self):
-        return self
-    def __next__(self):
-        char = self.get()
-        self.next()
-        return char
-    def move(self, idx):
-        self.idx = idx
-    def next(self):
-        self.idx = (self.idx + 1) % self.length
-    def prev(self):
-        self.idx = (self.idx - 1) % self.length
-    def get(self):
-        key = self.idx // self.block_size
-        if key in self.cache:
-            text = self.cache[key]
-        else:
-            if len(self.cache) >= self.bucket_size:
-                self.cache.popitem(last=False)
-            offset = self.offsets[key]
-            self.fp.seek(offset, 0)
-            text = self.fp.read(self.block_size)
-            self.cache[key] = text
-        self.cache.move_to_end(key)
-        char = text[self.idx % self.block_size]
-        return char