"sgl-kernel/csrc/vscode:/vscode.git/clone" did not exist on "2a936a841ec1e7f3a3a2d9cb90e09fbc1f683915"
doccano.py 6.45 KB
Newer Older
yuguo-Jack's avatar
yuguo-Jack committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os

import numpy as np
from utils import concate_aspect_and_opinion, decoding, save_dict, save_examples


def _check_splits(splits):
    """Raise ValueError unless splits is empty or two increasing ratios inside (0, 1)."""
    if len(splits) != 0 and len(splits) != 2:
        raise ValueError("Only []/ len(splits)==2 accepted for splits.")

    # The chained comparison also rejects NaN, which the pairwise checks would let through.
    if splits and not 0.0 < splits[0] < splits[1] < 1.0:
        raise ValueError("Please set correct splits, the element in it should be in (0,1), and splits[1]>splits[0].")


def _tag_span(ext_label, start, end, tag):
    """Write B-/I- tags over ext_label[start:end]; every tag other than "Opinion" is tagged as Aspect."""
    kind = "Opinion" if tag == "Opinion" else "Aspect"
    ext_label[start] = f"B-{kind}"
    for i in range(start + 1, end):
        ext_label[i] = f"I-{kind}"


def _save_task_examples(task, examples, indices, save_dir, splits):
    """Save the examples of one task ("ext" or "cls") as a single file or as train/dev/test files."""
    if len(splits) == 0:
        save_path = os.path.join(save_dir, "doccano.txt")
        save_examples(examples, save_path, indices)
        print(f"\n{task}: save data to {save_path}.")
        return

    # Cut points are computed on the example count; indices may already be shuffled.
    th1, th2 = int(len(examples) * splits[0]), int(len(examples) * splits[1])
    train_path = os.path.join(save_dir, "train.txt")
    dev_path = os.path.join(save_dir, "dev.txt")
    test_path = os.path.join(save_dir, "test.txt")
    save_examples(examples, train_path, indices[:th1])
    save_examples(examples, dev_path, indices[th1:th2])
    save_examples(examples, test_path, indices[th2:])
    print(f"\n{task}: save train data to {train_path}.")
    print(f"{task}: save dev data to {dev_path}.")
    print(f"{task}: save test data to {test_path}.")


def doccano2SA(doccano_file, save_ext_dir, save_cls_dir, splits=(0.8, 0.9), is_shuffle=True):
    """
    @Description: Convert doccano file to data format which is suitable to input to this Application.
        Each non-blank line of the file is parsed as a JSON object whose "data" entry is the text and
        whose "label" entry is a list of [start, end, tag] terms. Blank lines are skipped.
    @Param doccano_file: The annotated file exported from doccano labeling platform.
    @Param save_ext_dir: The directory of ext data that you wanna save, created if missing.
    @Param save_cls_dir: The directory of cls data that you wanna save, created if missing.
    @Param splits: Whether to split doccano file into train/dev/test, note: Only []/ len(splits)==2 accepted.
        With two ratios a < b, the first a share goes to train, the share between a and b to dev, the rest to test.
    @Param is_shuffle: Whether to shuffle data.
    @Raise ValueError: If doccano_file does not exist or splits is not valid.
    """
    if not os.path.exists(doccano_file):
        raise ValueError("Please input the correct path of doccano file.")

    # Validate all arguments before touching the file system, so that a rejected
    # call does not leave freshly created output directories behind.
    _check_splits(splits)

    os.makedirs(save_ext_dir, exist_ok=True)
    os.makedirs(save_cls_dir, exist_ok=True)

    ext_examples, cls_examples = [], []
    # start to label for ext and cls data
    with open(doccano_file, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the export) is not a record.
            if not line.strip():
                continue
            items = json.loads(line)
            text, label_terms = items["data"], items["label"]
            # label ext data with label_terms
            ext_label = ["O"] * len(text)
            # maps the text of a polarity-tagged aspect to its cls label: "1" for Pos-Aspect, "0" for Neg-Aspect
            aspect_mapper = {}
            for start, end, tag in label_terms:
                _tag_span(ext_label, start, end, tag)
                if tag == "Pos-Aspect":
                    aspect_mapper[text[start:end]] = "1"
                elif tag == "Neg-Aspect":
                    aspect_mapper[text[start:end]] = "0"
            ext_examples.append((text, " ".join(ext_label)))
            # label cls data
            for ap in decoding(text, ext_label):
                # first element is the aspect, the remaining ones are its opinions (de-duplicated)
                aspect, opinions = ap[0], list(set(ap[1:]))
                if aspect not in aspect_mapper:
                    continue
                aspect_text = concate_aspect_and_opinion(text, aspect, opinions)
                cls_examples.append((aspect_mapper[aspect], aspect_text, text))

    # index for saving data
    ext_idx = np.arange(len(ext_examples))
    cls_idx = np.arange(len(cls_examples))

    if is_shuffle:
        ext_idx = np.random.permutation(ext_idx)
        cls_idx = np.random.permutation(cls_idx)

    _save_task_examples("ext", ext_examples, ext_idx, save_ext_dir, splits)
    _save_task_examples("cls", cls_examples, cls_idx, save_cls_dir, splits)

    # save ext and cls dict
    ext_dict_path = os.path.join(save_ext_dir, "label.dict")
    cls_dict_path = os.path.join(save_cls_dir, "label.dict")
    save_dict(ext_dict_path, "ext")
    save_dict(cls_dict_path, "cls")
    print(f"\next: save dict to {ext_dict_path}.")
    print(f"cls: save dict to {cls_dict_path}.")


if __name__ == "__main__":
    # Command-line entry point: three optional string flags, each with a default path.
    cli_parser = argparse.ArgumentParser()
    option_table = (
        ("--doccano_file", "./data/doccano.json", "The doccano file exported from doccano platform."),
        ("--save_ext_dir", "./data/ext_data1", "The path of ext data that you wanna save."),
        ("--save_cls_dir", "./data/cls_data1", "The path of cls data that you wanna save."),
    )
    for option_flag, option_default, option_help in option_table:
        cli_parser.add_argument(option_flag, type=str, default=option_default, help=option_help)
    cli_args = cli_parser.parse_args()

    # Splits are left at the function default; shuffling is always enabled from the CLI.
    doccano2SA(
        cli_args.doccano_file,
        cli_args.save_ext_dir,
        cli_args.save_cls_dir,
        is_shuffle=True,
    )