# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import re

import paddle
from utils import decoding, load_dict

from paddlenlp.transformers import (
    SkepForSequenceClassification,
    SkepForTokenClassification,
    SkepTokenizer,
)


def is_aspect_first(text, aspect, opinion_word):
    return text.find(aspect) <= text.find(opinion_word)


def concate_aspect_and_opinion(text, aspect, opinion_words):
    # Join the aspect with each opinion word in their textual order,
    # separated by commas.
    aspect_text = ""
    for opinion_word in opinion_words:
        if is_aspect_first(text, aspect, opinion_word):
            aspect_text += aspect + opinion_word + ","
        else:
            aspect_text += opinion_word + aspect + ","
    aspect_text = aspect_text[:-1]
    return aspect_text


def format_print(results):
    for result in results:
        aspect, opinions, sentiment = result["aspect"], result["opinions"], result["sentiment_polarity"]
        print(f"aspect: {aspect}, opinions: {opinions}, sentiment_polarity: {sentiment}")
    print()


def predict(args, ext_model, cls_model, tokenizer, ext_id2label, cls_id2label):
    ext_model.eval()
    cls_model.eval()

    while True:
        input_text = input("input text: \n")
        # strip and remove all inner spaces before processing
        input_text = re.sub(" +", "", input_text.strip())
        if not input_text:
            continue
        if input_text == "quit" or input_text == "exit":
            break

        # tokenize the input text character by character for the extraction model
        encoded_inputs = tokenizer(list(input_text), is_split_into_words=True, max_seq_len=args.ext_max_seq_len)
        input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
        token_type_ids = paddle.to_tensor([encoded_inputs["token_type_ids"]])

        # extract aspect and opinion words
        logits = ext_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=2).numpy()[0]
        # drop the [CLS] and [SEP] positions before decoding the tag sequence
        tag_seq = [ext_id2label[idx] for idx in predictions][1:-1]
        aps = decoding(input_text[: args.ext_max_seq_len - 2], tag_seq)

        # predict sentiment for each aspect with cls_model
        results = []
        for ap in aps:
            aspect = ap[0]
            opinion_words = list(set(ap[1:]))
            aspect_text = concate_aspect_and_opinion(input_text, aspect, opinion_words)

            encoded_inputs = tokenizer(
                aspect_text, text_pair=input_text, max_seq_len=args.cls_max_seq_len, return_length=True
            )
            input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
            token_type_ids = paddle.to_tensor([encoded_inputs["token_type_ids"]])

            logits = cls_model(input_ids, token_type_ids=token_type_ids)
            prediction = int(logits.argmax(axis=1))
            result = {"aspect": aspect, "opinions": opinion_words, "sentiment_polarity": cls_id2label[prediction]}
            results.append(result)

        format_print(results)


if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser()
    parser.add_argument("--ext_model_path", type=str, default=None, help="The path of the extraction model to load.")
    parser.add_argument("--cls_model_path", type=str, default=None, help="The path of the classification model to load.")
    parser.add_argument("--ext_label_path", type=str, default=None, help="The path of the extraction label dict.")
    parser.add_argument("--cls_label_path", type=str, default=None, help="The path of the classification label dict.")
dict.") parser.add_argument("--ext_max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization for extraction model.") parser.add_argument("--cls_max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization for classification model.") args = parser.parse_args() # yapf: enbale # load dict model_name = "skep_ernie_1.0_large_ch" ext_label2id, ext_id2label = load_dict(args.ext_label_path) cls_label2id, cls_id2label = load_dict(args.cls_label_path) tokenizer = SkepTokenizer.from_pretrained(model_name) print("label dict loaded.") # load ext model ext_state_dict = paddle.load(args.ext_model_path) ext_model = SkepForTokenClassification.from_pretrained(model_name, num_classes=len(ext_label2id)) ext_model.load_dict(ext_state_dict) print("extraction model loaded.") # load cls model cls_state_dict = paddle.load(args.cls_model_path) cls_model = SkepForSequenceClassification.from_pretrained(model_name, num_classes=len(cls_label2id)) cls_model.load_dict(cls_state_dict) print("classification model loaded.") # do predict predict(args, ext_model, cls_model, tokenizer, ext_id2label, cls_id2label)