text_process.py 2.65 KB
Newer Older
liugh5's avatar
liugh5 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import logging
import os
import sys
import argparse
import yaml
import time
import zipfile

ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # NOQA: E402
sys.path.insert(0, os.path.dirname(ROOT_PATH))  # NOQA: E402

try:
    from kantts.datasets.dataset import BERT_Text_Dataset
    from kantts.utils.log import logging_to_file, get_git_revision_hash
    from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols
except ImportError:
    raise ImportError("Please install kantts.")

# Root logger setup for the script: INFO level and above, each record prefixed
# with a timestamp (plus milliseconds), the level name and the emitting
# file:line location.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d:%H:%M:%S",
    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
)


def gen_metafile(
    output_dir,
    split_ratio=0.98,
):
    """Generate the BERT train/valid list files in ``output_dir`` if missing.

    Does nothing when both ``bert_train.lst`` and ``bert_valid.lst`` already
    exist in ``output_dir``. Otherwise it delegates to
    ``BERT_Text_Dataset.gen_metafile`` with the path of ``raw_metafile.txt``
    (inside ``output_dir``), the output directory and ``split_ratio``.

    Args:
        output_dir: Directory holding ``raw_metafile.txt`` and receiving the
            list files.
        split_ratio: Forwarded unchanged to ``BERT_Text_Dataset.gen_metafile``
            (presumably the train share of the split -- confirm there).
    """
    list_files = [
        os.path.join(output_dir, file_name)
        for file_name in ("bert_train.lst", "bert_valid.lst")
    ]
    # Guard clause: both lists are already present, nothing to regenerate.
    if all(os.path.exists(path) for path in list_files):
        return

    source_meta = os.path.join(output_dir, "raw_metafile.txt")
    BERT_Text_Dataset.gen_metafile(source_meta, output_dir, split_ratio)
    logging.info("BERT Text metafile generated.")

#  TODO: Zh-CN as default
def process_mit_style_data(
    text_file,
    resources_zip_file,
    output_dir,
    split_ratio=0.98,
):
    """Convert a raw text file into symbol metadata and BERT list files.

    Steps, in order:
      1. Create ``output_dir`` and pass ``data_process_stdout.log`` inside it
         to ``logging_to_file``.
      2. If no ``resource`` directory exists next to ``resources_zip_file``,
         extract the archive into the directory that contains it.
      3. Read every line of ``text_file`` and convert the lines with
         ``text_to_symbols``.
      4. Write the resulting symbol strings to ``raw_metafile.txt`` in
         ``output_dir``.
      5. Call ``gen_metafile`` to produce the BERT train/valid list files.

    Args:
        text_file: Path of the input text file; all of its lines are read.
        resources_zip_file: Path of the resources archive. It is only opened
            when the sibling ``resource`` directory does not exist yet.
        output_dir: Directory receiving the log file, ``raw_metafile.txt``
            and the generated list files; created if missing.
        split_ratio: Forwarded to ``gen_metafile``. Defaults to 0.98, the
            value that was previously applied implicitly.
    """
    os.makedirs(output_dir, exist_ok=True)
    logging_to_file(os.path.join(output_dir, "data_process_stdout.log"))

    # The archive is expected to unpack into a "resource" folder that sits
    # beside the zip file; its presence is used as the "already extracted"
    # marker.
    resource_root_dir = os.path.dirname(resources_zip_file)
    resource_dir = os.path.join(resource_root_dir, "resource")

    if not os.path.exists(resource_dir):
        logging.info("Extracting resources...")
        with zipfile.ZipFile(resources_zip_file, "r") as zip_ref:
            zip_ref.extractall(resource_root_dir)

    # Explicit UTF-8 so that non-ASCII text is decoded the same way regardless
    # of the platform's locale encoding.
    with open(text_file, "r", encoding="utf-8") as text_data:
        texts = text_data.readlines()

    logging.info("Converting text to symbols...")
    # NOTE(review): "F7" is passed through as-is; presumably a speaker/voice
    # tag understood by text_to_mit_symbols -- confirm against that helper.
    symbols_lst = text_to_symbols(texts, resource_dir, "F7")
    symbols_file = os.path.join(output_dir, "raw_metafile.txt")
    # Entries are written verbatim with no separator added, exactly as the
    # converter returned them.
    with open(symbols_file, "w", encoding="utf-8") as symbol_data:
        symbol_data.writelines(symbols_lst)

    logging.info("Processing done.")

    # Generate BERT Text metafile
    gen_metafile(output_dir, split_ratio)


if __name__ == "__main__":
    # Script entry point: all three path options are mandatory strings and are
    # handed to process_mit_style_data in the order they are declared.
    cli = argparse.ArgumentParser(description="Dataset preprocessor")
    for option in ("--text_file", "--resources_zip_file", "--output_dir"):
        cli.add_argument(option, type=str, required=True)

    opts = cli.parse_args()

    process_mit_style_data(opts.text_file, opts.resources_zip_file, opts.output_dir)