# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os

from .utils import DataProcessor, EncodePattern, InputExample, InputFeatures

logger = logging.getLogger(__name__)


def clue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length,
    task=None,
    pattern=EncodePattern.bert_pattern,
    label_list=None,
    output_mode=None,
):
    """Converts a list of ``InputExample`` into a list of ``InputFeatures``.

    If ``task`` is given, ``label_list`` and ``output_mode`` default to the
    values registered in ``clue_processors`` and ``clue_output_modes``.
    """
    if task is not None:
        processor = clue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info(f"Using label list {label_list} for task {task}")
        if output_mode is None:
            output_mode = clue_output_modes[task]
            logger.info(f"Using output mode {output_mode} for task {task}")

    label_map = {label: i for i, label in enumerate(label_list)}

    start_token = [] if tokenizer.start_token is None else [tokenizer.start_token]
    end_token = [] if tokenizer.end_token is None else [tokenizer.end_token]
    pad_id = tokenizer.pad_token_id
    # Number of special tokens added for [single sentence, sentence pair] inputs.
    if pattern is EncodePattern.bert_pattern:
        added_special_tokens = [2, 3]
    elif pattern is EncodePattern.roberta_pattern:
        added_special_tokens = [2, 4]
    else:
        raise KeyError("pattern is not a valid EncodePattern")

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Truncate the pair in place so the total fits the token budget.
            _truncate_seq_pair(tokens_a, tokens_b, max_length - added_special_tokens[1])
        else:
            if len(tokens_a) > max_length - added_special_tokens[0]:
                tokens_a = tokens_a[: (max_length - added_special_tokens[0])]

        if pattern is EncodePattern.bert_pattern:
            # BERT layout: [CLS] a [SEP] b [SEP]
            tokens = start_token + tokens_a + end_token
            token_type_ids = [0] * len(tokens)
            if tokens_b:
                tokens += tokens_b + end_token
                token_type_ids += [1] * (len(tokens) - len(token_type_ids))
        elif pattern is EncodePattern.roberta_pattern:
            # RoBERTa layout: <s> a </s> </s> b </s>
            tokens = start_token + tokens_a + end_token
            token_type_ids = [0] * len(tokens)
            if tokens_b:
                tokens += end_token + tokens_b + end_token
                token_type_ids += [1] * (len(tokens) - len(token_type_ids))
        else:
            raise KeyError("pattern is not a valid EncodePattern")

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        # Right-pad everything up to max_length.
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + [pad_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        token_type_ids = token_type_ids + [0] * padding_length

        label = None
        if example.label is not None:
            if output_mode == "classification":
                label = label_map[example.label]
            elif output_mode == "regression":
                label = float(example.label)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            # %s rather than %d: the label id is None on test sets and a float for regression.
            logger.info("label: %s (id = %s)" % (example.label, label))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=label,
            )
        )
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to a maximum total length.

    Tokens are popped one at a time from the longer sequence, which keeps
    more information than trimming each sequence by a fixed proportion.
    """
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
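
# A worked example of the truncation budget (an illustrative sketch, not part
# of the module API; the token lists are made up). With max_length=10 and the
# BERT pattern, a sentence pair keeps 10 - 3 = 7 tokens in total, since [CLS],
# [SEP], [SEP] occupy 3 slots, and _truncate_seq_pair pops from the longer
# sequence until the pair fits:
#
#     tokens_a = list("abcdef")   # 6 tokens
#     tokens_b = list("wxyz")     # 4 tokens
#     _truncate_seq_pair(tokens_a, tokens_b, 10 - 3)
#     assert tokens_a == ["a", "b", "c", "d"]
#     assert tokens_b == ["w", "x", "y"]
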
class TnewsProcessor(DataProcessor):
    """Processor for the TNEWS data set (CLUE version).

    Single sentence classification task. The task is to predict which
    category the title belongs to.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test")

    def get_labels(self):
        """See base class."""
        # Label ids run from "100" to "116"; 105 and 111 are unused in the data set.
        labels = []
        for i in range(17):
            if i == 5 or i == 11:
                continue
            labels.append(str(100 + i))
        return labels

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev, and test sets."""
        examples = []
        for i, line in enumerate(lines):
            guid = f"{set_type}-{i}"
            text_a = line["sentence"]
            label = None if set_type == "test" else str(line["label"])
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


class IflytekProcessor(DataProcessor):
    """Processor for the IFLYTEK data set (CLUE version).

    Single sentence classification task. The task is to predict the
    category of an application according to its description.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test")

    def get_labels(self):
        """See base class."""
        return [str(i) for i in range(119)]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev, and test sets."""
        examples = []
        for i, line in enumerate(lines):
            guid = f"{set_type}-{i}"
            text_a = line["sentence"]
            label = None if set_type == "test" else str(line["label"])
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


class AfqmcProcessor(DataProcessor):
    """Processor for the AFQMC data set (CLUE version).

    Sentence pair classification task. The task is to predict whether two
    sentences are semantically similar.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev, and test sets."""
        examples = []
        for i, line in enumerate(lines):
            guid = f"{set_type}-{i}"
            text_a = line["sentence1"]
            text_b = line["sentence2"]
            label = None if set_type == "test" else str(line["label"])
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
""" def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = f"{set_type}-{i}" text_a = line["sentence1"] text_b = line["sentence2"] label = None if set_type == "test" else str(line["label"]) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class OcnliProcessor(DataProcessor): """Processor for the OCNLI data set (CLUE version). Sentence pair classification task. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). """ def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test") def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = f"{set_type}-{i}" text_a = line["sentence1"] text_b = line["sentence2"] label = None if set_type == "test" else str(line["label"]) if label.strip() == "-": continue examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class CmnliProcessor(DataProcessor): """Processor for the CMNLI data set (CLUE version). Sentence pair classification task. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). 
""" def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test") def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = f"{set_type}-{i}" text_a = line["sentence1"] text_b = line["sentence2"] label = None if set_type == "test" else str(line["label"]) if label.strip() == "-": continue examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class CslProcessor(DataProcessor): """Processor for the CSL data set (CLUE version).""" def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = f"{set_type}-{i}" text_a = " ".join(line["keyword"]) text_b = line["abst"] label = None if set_type == "test" else str(line["label"]) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class WscProcessor(DataProcessor): """Processor for the WSC data set (CLUE version).""" def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test") def get_labels(self): """See base class.""" return ["true", "false"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = f"{set_type}-{i}" text_a = line["text"] text_a_list = list(text_a) target = line["target"] query = target["span1_text"] query_idx = target["span1_index"] pronoun = target["span2_text"] pronoun_idx = target["span2_index"] assert ( text_a[pronoun_idx : (pronoun_idx + len(pronoun))] == pronoun ), "pronoun: {}".format(pronoun) assert text_a[query_idx : (query_idx + len(query))] == query, "query: {}".format(query) if pronoun_idx > query_idx: text_a_list.insert(query_idx, "_") text_a_list.insert(query_idx + len(query) + 1, "_") text_a_list.insert(pronoun_idx + 2, "[") text_a_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") else: text_a_list.insert(pronoun_idx, "[") text_a_list.insert(pronoun_idx + len(pronoun) + 1, "]") text_a_list.insert(query_idx + 2, "_") text_a_list.insert(query_idx + len(query) + 2 + 1, 
"_") text_a = "".join(text_a_list) label = None if set_type == "test" else str(line["label"]) examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples class CopaProcessor(DataProcessor): """Processor for the COPA data set (CLUE version).""" def get_train_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "train.json")), "train") def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "dev.json")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples(self._read_json(os.path.join(data_dir, "test.json")), "test") def get_labels(self): """See base class.""" return ["0", "1"] def _create_examples(self, lines, set_type): examples = [] for (i, line) in enumerate(lines): i = 2 * i guid1 = f"{set_type}-{i}" guid2 = "%s-%s" % (set_type, i + 1) premise = line["premise"] choice0 = line["choice0"] label = None if set_type == "test" else str(1 if line["label"] == 0 else 0) choice1 = line["choice1"] label2 = None if set_type == "test" else str(1 if line["label"] == 0 else 0) if line["question"] == "effect": text_a = premise text_b = choice0 text_a2 = premise text_b2 = choice1 elif line["question"] == "cause": text_a = choice0 text_b = premise text_a2 = choice1 text_b2 = premise else: raise ValueError(f'unknowed {line["question"]} type') examples.append(InputExample(guid=guid1, text_a=text_a, text_b=text_b, label=label)) examples.append(InputExample(guid=guid2, text_a=text_a2, text_b=text_b2, label=label2)) return examples def _create_examples_version2(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = f"{set_type}-{i}" if line["question"] == "cause": text_a = line["premise"] + "这是什么原因造成的?" + line["choice0"] text_b = line["premise"] + "这是什么原因造成的?" + line["choice1"] else: text_a = line["premise"] + "这造成了什么影响?" + line["choice0"] text_b = line["premise"] + "这造成了什么影响?" + line["choice1"] label = None if set_type == "test" else str(1 if line["label"] == 0 else 0) examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples clue_tasks_num_labels = { "iflytek": 119, "cmnli": 3, "ocnli": 3, "afqmc": 2, "csl": 2, "wsc": 2, "copa": 2, "tnews": 15, } clue_processors = { "tnews": TnewsProcessor, "iflytek": IflytekProcessor, "cmnli": CmnliProcessor, "ocnli": OcnliProcessor, "afqmc": AfqmcProcessor, "csl": CslProcessor, "wsc": WscProcessor, "copa": CopaProcessor, } clue_output_modes = { "tnews": "classification", "iflytek": "classification", "cmnli": "classification", "ocnli": "classification", "afqmc": "classification", "csl": "classification", "wsc": "classification", "copa": "classification", }