# doccano.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os

import numpy as np
from utils import concate_aspect_and_opinion, decoding, save_dict, save_examples


def doccano2SA(doccano_file, save_ext_dir, save_cls_dir, splits=(0.8, 0.9), is_shuffle=True):
    """
    @Description: Convert a doccano export file into the ext and cls data files used by this application.
        Every non-blank line of the file is parsed as a JSON object holding the text under "data" and a list of
        (start, end, tag) spans under "label". Spans tagged "Opinion" are labelled B-Opinion/I-Opinion, every
        other tag is labelled B-Aspect/I-Aspect; "Pos-Aspect" spans map to class "1" and "Neg-Aspect" spans to "0".
        Ext examples are (text, space-joined labels) pairs with one label per character of the text. Cls examples
        are (class, aspect_text, text) triples built from the aspects returned by `decoding` whose text was
        annotated as "Pos-Aspect"/"Neg-Aspect".
    @Param doccano_file: The annotated file exported from doccano labeling platform.
    @Param save_ext_dir: The directory of ext data that you wanna save, created if missing.
    @Param save_cls_dir: The directory of cls data that you wanna save, created if missing.
    @Param splits: Empty to save everything into a single "doccano.txt", or two increasing ratios in (0, 1)
        used as the train/dev and dev/test boundaries of "train.txt", "dev.txt" and "test.txt".
    @Param is_shuffle: Whether to shuffle data before saving.
    @Raises ValueError: If doccano_file does not exist or splits is malformed.
    """
    if not os.path.exists(doccano_file):
        raise ValueError("Please input the correct path of doccano file.")

    # Validate every argument before touching the filesystem, so that an invalid call
    # does not leave empty output directories behind.
    if len(splits) not in (0, 2):
        raise ValueError("Only []/ len(splits)==2 accepted for splits.")

    if len(splits) == 2 and not (0.0 < splits[0] < splits[1] < 1.0):
        raise ValueError("Please set correct splits, the element in it should be in (0,1), and splits[1]>splits[0].")

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(save_ext_dir, exist_ok=True)
    os.makedirs(save_cls_dir, exist_ok=True)

    def label_ext_with_label_term(ext_label, start, end, tag):
        # Only "Opinion" yields opinion labels; any other tag is treated as an aspect span.
        kind = "Opinion" if tag == "Opinion" else "Aspect"
        ext_label[start] = f"B-{kind}"
        for i in range(start + 1, end):
            ext_label[i] = f"I-{kind}"

    def save_task_examples(task, examples, save_dir, idx):
        # Write the examples of one task ("ext" or "cls") as a single file or as train/dev/test files.
        if len(splits) == 0:
            save_path = os.path.join(save_dir, "doccano.txt")
            save_examples(examples, save_path, idx)
            print(f"\n{task}: save data to {save_path}.")
            return

        th1, th2 = int(len(examples) * splits[0]), int(len(examples) * splits[1])
        train_path = os.path.join(save_dir, "train.txt")
        dev_path = os.path.join(save_dir, "dev.txt")
        test_path = os.path.join(save_dir, "test.txt")
        save_examples(examples, train_path, idx[:th1])
        save_examples(examples, dev_path, idx[th1:th2])
        save_examples(examples, test_path, idx[th2:])
        print(f"\n{task}: save train data to {train_path}.")
        print(f"{task}: save dev data to {dev_path}.")
        print(f"{task}: save test data to {test_path}.")

    ext_examples, cls_examples = [], []
    with open(doccano_file, "r", encoding="utf-8") as f:
        # start to label for ext and cls data
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the export) carry no example
            # and would make json.loads fail.
            if not line.strip():
                continue
            items = json.loads(line)
            text, label_terms = items["data"], items["label"]
            # label ext data with label_terms
            ext_label = ["O"] * len(text)
            # Maps the surface text of each sentiment-annotated aspect to its class id.
            aspect_mapper = {}
            for start, end, tag in label_terms:
                label_ext_with_label_term(ext_label, start, end, tag)
                if tag == "Pos-Aspect":
                    aspect_mapper[text[start:end]] = "1"
                elif tag == "Neg-Aspect":
                    aspect_mapper[text[start:end]] = "0"
            ext_examples.append((text, " ".join(ext_label)))
            # label cls data
            for ap in decoding(text, ext_label):
                # First element is the aspect, the rest are its opinions (de-duplicated).
                aspect, opinions = ap[0], list(set(ap[1:]))
                if aspect not in aspect_mapper:
                    continue
                aspect_text = concate_aspect_and_opinion(text, aspect, opinions)
                cls_examples.append((aspect_mapper[aspect], aspect_text, text))

    # index for saving data
    ext_idx = np.arange(len(ext_examples))
    cls_idx = np.arange(len(cls_examples))

    if is_shuffle:
        ext_idx = np.random.permutation(ext_idx)
        cls_idx = np.random.permutation(cls_idx)

    save_task_examples("ext", ext_examples, save_ext_dir, ext_idx)
    save_task_examples("cls", cls_examples, save_cls_dir, cls_idx)

    # save ext and cls dict
    ext_dict_path = os.path.join(save_ext_dir, "label.dict")
    cls_dict_path = os.path.join(save_cls_dir, "label.dict")
    save_dict(ext_dict_path, "ext")
    save_dict(cls_dict_path, "cls")
    print(f"\next: save dict to {ext_dict_path}.")
    print(f"cls: save dict to {cls_dict_path}.")


if __name__ == "__main__":
    # Command-line entry point: every option is a string path with a default under ./data.
    option_specs = (
        ("--doccano_file", "./data/doccano.json", "The doccano file exported from doccano platform."),
        ("--save_ext_dir", "./data/ext_data1", "The path of ext data that you wanna save."),
        ("--save_cls_dir", "./data/cls_data1", "The path of cls data that you wanna save."),
    )

    cli = argparse.ArgumentParser()
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    options = cli.parse_args()

    # Convert with the function's default splits, shuffling the examples.
    doccano2SA(
        doccano_file=options.doccano_file,
        save_ext_dir=options.save_ext_dir,
        save_cls_dir=options.save_cls_dir,
        is_shuffle=True,
    )