"tests/vscode:/vscode.git/clone" did not exist on "460b844360131c99d3dd4dbd9c08545ea2e6ac9e"
Commit 10f294ff authored by yuguo-Jack's avatar yuguo-Jack
Browse files

llama_paddle

parent 7c64e6ec
Pipeline #678 failed with stages
in 0 seconds
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
from tqdm import tqdm
from paddlenlp import Taskflow
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--answer_generation_model_path', type=str, default=None, help='the model path to be loaded for answer extraction')
parser.add_argument('--question_generation_model_path', type=str, default=None, help='the model path to be loaded for question generation')
parser.add_argument('--filtration_model_path', type=str, default=None, help='the model path to be loaded for filtration')
parser.add_argument('--source_file_path', type=str, default=None, help='the source file path')
parser.add_argument('--target_file_path', type=str, default=None, help='the target json file path')
parser.add_argument('--batch_size', type=int, default=1, help='the batch size when using taskflow')
parser.add_argument("--do_debug", action='store_true', help="Whether to do debug")
    parser.add_argument('--a_prompt', type=str, default='答案', help='comma-separated prompts used by the answer-extraction taskflow')
parser.add_argument('--a_position_prob', type=float, default=0.01, help='confidence threshold for answer extraction')
parser.add_argument('--a_max_answer_candidates', type=int, default=5, help='the max number of return answer candidate for each input')
    parser.add_argument('--q_num_return_sequences', type=int, default=3, help='the number of returned sequences for each input sample; must not exceed num_beams')
parser.add_argument('--q_max_question_length', type=int, default=50, help='the max decoding length')
parser.add_argument('--q_decode_strategy', type=str, default='sampling', help='the decode strategy')
parser.add_argument('--q_num_beams', type=int, default=6, help='the number of beams when using beam search')
parser.add_argument('--q_num_beam_groups', type=int, default=1, help='the number of beam groups when using diverse beam search')
parser.add_argument('--q_diversity_rate', type=float, default=0.0, help='the diversity_rate when using diverse beam search')
    parser.add_argument('--q_top_k', type=int, default=5, help='the top_k when using sampling decoding strategy')
parser.add_argument('--q_top_p', type=float, default=1.0, help='the top_p when using sampling decoding strategy')
parser.add_argument('--q_temperature', type=float, default=1.0, help='the temperature when using sampling decoding strategy')
parser.add_argument("--do_filtration", action='store_true', help="Whether to do filtration")
parser.add_argument('--f_filtration_position_prob', type=float, default=0.1, help='confidence threshold for filtration')
args = parser.parse_args()
return args
# yapf: enable
def answer_generation_from_paragraphs(
paragraphs, batch_size=16, model=None, max_answer_candidates=5, schema=None, wf=None
):
"""Generate answer from given paragraphs."""
result = []
buffer = []
i = 0
len_paragraphs = len(paragraphs)
for paragraph_tobe in tqdm(paragraphs):
buffer.append(paragraph_tobe)
if len(buffer) == batch_size or (i + 1) == len_paragraphs:
predicts = model(buffer)
paragraph_list = buffer
buffer = []
for predict_dict, paragraph in zip(predicts, paragraph_list):
                answers = []
                probabilities = []
                for prompt in schema:
                    if prompt in predict_dict:
                        answer_dicts = predict_dict[prompt]
                        answers += [answer_dict["text"] for answer_dict in answer_dicts]
                        probabilities += [answer_dict["probability"] for answer_dict in answer_dicts]
                candidates = sorted(set(zip(answers, probabilities)), key=lambda x: -x[1])
if len(candidates) > max_answer_candidates:
candidates = candidates[:max_answer_candidates]
outdict = {
"context": paragraph,
"answer_candidates": candidates,
}
if wf:
wf.write(json.dumps(outdict, ensure_ascii=False) + "\n")
result.append(outdict)
i += 1
return result
def create_fake_question(
json_file_or_pair_list, out_json=None, num_return_sequences=1, all_sample_num=None, batch_size=8
):
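    """Generate synthetic questions for every (context, answer candidate) pair.

    Accepts either an in-memory list of dicts or a path to a JSON-lines file,
    and relies on the globally constructed ``question_generation`` Taskflow.
    """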
if out_json:
wf = open(out_json, "w", encoding="utf-8")
if isinstance(json_file_or_pair_list, list):
all_lines = json_file_or_pair_list
else:
rf = open(json_file_or_pair_list, "r", encoding="utf-8")
all_lines = []
for json_line in rf:
line_dict = json.loads(json_line)
all_lines.append(line_dict)
rf.close()
num_all_lines = len(all_lines)
output = []
context_buffer = []
answer_buffer = []
answer_probability_buffer = []
true_question_buffer = []
i = 0
for index, line_dict in enumerate(tqdm(all_lines)):
if "question" in line_dict:
q = line_dict["question"]
else:
q = ""
c = line_dict["context"]
assert "answer_candidates" in line_dict
answers = line_dict["answer_candidates"]
if not answers:
continue
for j, pair in enumerate(answers):
a, p = pair
context_buffer += [c]
answer_buffer += [a]
answer_probability_buffer += [p]
true_question_buffer += [q]
if (
(i + 1) % batch_size == 0
or (all_sample_num and (i + 1) == all_sample_num)
or ((index + 1) == num_all_lines and j == len(answers) - 1)
):
result_buffer = question_generation(
[{"context": context, "answer": answer} for context, answer in zip(context_buffer, answer_buffer)]
)
context_buffer_temp, answer_buffer_temp, answer_probability_buffer_temp, true_question_buffer_temp = (
[],
[],
[],
[],
)
for context, answer, answer_probability, true_question in zip(
context_buffer, answer_buffer, answer_probability_buffer, true_question_buffer
):
context_buffer_temp += [context] * num_return_sequences
answer_buffer_temp += [answer] * num_return_sequences
answer_probability_buffer_temp += [answer_probability] * num_return_sequences
true_question_buffer_temp += [true_question] * num_return_sequences
result_one_two_buffer = [(one, two) for one, two in zip(result_buffer[0], result_buffer[1])]
for context, answer, answer_probability, true_question, result in zip(
context_buffer_temp,
answer_buffer_temp,
answer_probability_buffer_temp,
true_question_buffer_temp,
result_one_two_buffer,
):
fake_questions_tokens = [result[0]]
fake_questions_scores = [result[1]]
for fake_questions_token, fake_questions_score in zip(
fake_questions_tokens, fake_questions_scores
):
out_dict = {
"context": context,
"synthetic_answer": answer,
"synthetic_answer_probability": answer_probability,
"synthetic_question": fake_questions_token,
"synthetic_question_probability": fake_questions_score,
"true_question": true_question,
}
if out_json:
wf.write(json.dumps(out_dict, ensure_ascii=False) + "\n")
output.append(out_dict)
                context_buffer = []
                answer_buffer = []
                answer_probability_buffer = []
                true_question_buffer = []
if all_sample_num and (i + 1) >= all_sample_num:
break
i += 1
if out_json:
wf.close()
return output
def filtration(paragraphs, batch_size=16, model=None, schema=None, wf=None, wf_debug=None):
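    """Keep only synthetic QA pairs whose synthetic answer is re-extracted from the context by the filtration model."""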
result = []
buffer = []
valid_num, invalid_num = 0, 0
i = 0
len_paragraphs = len(paragraphs)
for paragraph_tobe in tqdm(paragraphs):
buffer.append(paragraph_tobe)
if len(buffer) == batch_size or (i + 1) == len_paragraphs:
model_inputs = []
for d in buffer:
context = d["context"]
synthetic_question = d["synthetic_question"]
prefix = "问题:" + synthetic_question + "上下文:"
content = prefix + context
model_inputs.append(content)
predicts = model(model_inputs)
paragraph_list = buffer
buffer = []
for predict_dict, paragraph in zip(predicts, paragraph_list):
context = paragraph["context"]
synthetic_question = paragraph["synthetic_question"]
synthetic_question_probability = paragraph["synthetic_question_probability"]
synthetic_answer = paragraph["synthetic_answer"]
synthetic_answer_probability = paragraph["synthetic_answer_probability"]
                answers = []
                probabilities = []
                for prompt in schema:
                    if prompt in predict_dict:
                        answer_dicts = predict_dict[prompt]
                        answers += [answer_dict["text"] for answer_dict in answer_dicts]
                        probabilities += [answer_dict["probability"] for answer_dict in answer_dicts]
                candidates = [an for an, pro in sorted(zip(answers, probabilities), key=lambda x: -x[1])]
out_dict = {
"context": context,
"synthetic_answer": synthetic_answer,
"synthetic_answer_probability": synthetic_answer_probability,
"synthetic_question": synthetic_question,
"synthetic_question_probability": synthetic_question_probability,
}
if synthetic_answer in candidates:
if wf:
wf.write(json.dumps(out_dict, ensure_ascii=False) + "\n")
result.append(out_dict)
valid_num += 1
else:
if wf_debug:
wf_debug.write(json.dumps(out_dict, ensure_ascii=False) + "\n")
invalid_num += 1
i += 1
print("valid synthetic question-answer pairs number:", valid_num)
print("invalid synthetic question-answer pairs number:", invalid_num)
return result
if __name__ == "__main__":
args = parse_args()
assert args.a_prompt
schema = args.a_prompt.strip().split(",")
answer_generator = Taskflow(
"information_extraction",
schema=schema,
task_path=args.answer_generation_model_path,
batch_size=args.batch_size,
position_prob=args.a_position_prob,
)
assert args.source_file_path
paragraphs = []
if args.source_file_path.endswith(".json"):
with open(args.source_file_path, "r", encoding="utf-8") as rf:
for json_line in rf:
line_dict = json.loads(json_line)
assert "context" in line_dict or "content" in line_dict
if "context" in line_dict:
paragraphs.append(line_dict["context"].strip())
elif "content" in line_dict:
paragraphs.append(line_dict["content"].strip())
else:
with open(args.source_file_path, "r", encoding="utf-8") as rf:
for line in rf:
paragraphs.append(line.strip())
synthetic_context_answer_pairs = answer_generation_from_paragraphs(
paragraphs,
batch_size=args.batch_size,
model=answer_generator,
max_answer_candidates=args.a_max_answer_candidates,
schema=schema,
wf=None,
)
print("create synthetic answers successfully!")
question_generation = Taskflow(
"question_generation",
task_path=args.question_generation_model_path,
output_scores=True,
max_length=args.q_max_question_length,
is_select_from_num_return_sequences=False,
num_return_sequences=args.q_num_return_sequences,
batch_size=args.batch_size,
decode_strategy=args.q_decode_strategy,
num_beams=args.q_num_beams,
num_beam_groups=args.q_num_beam_groups,
diversity_rate=args.q_diversity_rate,
top_k=args.q_top_k,
top_p=args.q_top_p,
temperature=args.q_temperature,
)
synthetic_answer_question_pairs = create_fake_question(
synthetic_context_answer_pairs,
None if args.do_filtration else args.target_file_path,
args.q_num_return_sequences,
None,
args.batch_size,
)
print("create synthetic question-answer pairs successfully!")
wf = None
wf_debug = None
if args.target_file_path:
        target_dir = os.path.dirname(args.target_file_path)
        if target_dir and not os.path.exists(target_dir):
            os.makedirs(target_dir)
wf = open(args.target_file_path, "w", encoding="utf-8")
if args.do_debug:
wf_debug = open(args.target_file_path + ".debug.json", "w", encoding="utf-8")
if args.do_filtration:
filtration_model = Taskflow(
"information_extraction",
schema=["答案"],
task_path=args.filtration_model_path,
batch_size=args.batch_size,
position_prob=args.f_filtration_position_prob,
)
filtration(
synthetic_answer_question_pairs,
batch_size=16,
model=filtration_model,
schema=["答案"],
wf=wf,
wf_debug=wf_debug,
)
print("filter synthetic question-answer pairs successfully!")
    if wf:
        wf.close()
    if wf_debug:
        wf_debug.close()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
from tqdm import tqdm
from paddlenlp import Taskflow
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--model_path', type=str, default=None, help='the model path to be loaded for question_generation taskflow')
parser.add_argument('--source_file_path', type=str, default=None, help='the source file path')
parser.add_argument('--target_file_path', type=str, default=None, help='the target json file path')
    parser.add_argument('--all_sample_num', type=int, default=None, help='the maximum number of samples to process; if None, process all samples')
    parser.add_argument('--num_return_sequences', type=int, default=3, help='the number of returned sequences for each input sample; must not exceed num_beams')
parser.add_argument('--batch_size', type=int, default=1, help='the batch size when using taskflow')
    parser.add_argument('--position_prob', type=float, default=0.01, help='confidence threshold for answer extraction')
parser.add_argument('--decode_strategy', type=str, default=None, help='the decode strategy')
parser.add_argument('--num_beams', type=int, default=6, help='the number of beams when using beam search')
parser.add_argument('--num_beam_groups', type=int, default=1, help='the number of beam groups when using diverse beam search')
parser.add_argument('--diversity_rate', type=float, default=0.0, help='the diversity_rate when using diverse beam search')
    parser.add_argument('--top_k', type=int, default=0, help='the top_k when using sampling decoding strategy')
parser.add_argument('--top_p', type=float, default=1.0, help='the top_p when using sampling decoding strategy')
parser.add_argument('--temperature', type=float, default=1.0, help='the temperature when using sampling decoding strategy')
args = parser.parse_args()
return args
# yapf: enable
def answer_generation_from_paragraphs(paragraphs, batch_size=16, model=None, wf=None):
"""Generate answer from given paragraphs."""
result = []
buffer = []
    len_paragraphs = len(paragraphs)
    for i, paragraph_tobe in enumerate(tqdm(paragraphs)):
        buffer.append(paragraph_tobe)
        if len(buffer) == batch_size or (i + 1) == len_paragraphs:
predicts = model(buffer)
paragraph_list = buffer
buffer = []
            for predict_dict, paragraph in zip(predicts, paragraph_list):
                if "答案" in predict_dict:
                    answer_dicts = predict_dict["答案"]
                    answers = [answer_dict["text"] for answer_dict in answer_dicts]
                    probabilities = [answer_dict["probability"] for answer_dict in answer_dicts]
                else:
                    answers = []
                    probabilities = []
                outdict = {
                    "context": paragraph,
                    "answer_candidates": sorted(zip(answers, probabilities), key=lambda x: -x[1]),
                }
if wf:
wf.write(json.dumps(outdict, ensure_ascii=False) + "\n")
result.append(outdict)
return result
if __name__ == "__main__":
args = parse_args()
schema = ["答案"]
answer_generator = Taskflow(
"information_extraction",
schema=schema,
task_path=args.model_path,
batch_size=args.batch_size,
position_prob=args.position_prob,
)
assert args.source_file_path
paragraphs = []
if args.source_file_path.endswith(".json"):
with open(args.source_file_path, "r", encoding="utf-8") as rf:
for json_line in rf:
line_dict = json.loads(json_line)
assert "context" in line_dict or "content" in line_dict
if "context" in line_dict:
paragraphs.append(line_dict["context"].strip())
elif "content" in line_dict:
paragraphs.append(line_dict["content"].strip())
else:
with open(args.source_file_path, "r", encoding="utf-8") as rf:
for line in rf:
paragraphs.append(line.strip())
wf = None
if args.target_file_path:
wf = open(args.target_file_path, "w", encoding="utf-8")
answer_generation_from_paragraphs(paragraphs, batch_size=args.batch_size, model=answer_generator, wf=wf)
    if wf:
        wf.close()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
from tqdm import tqdm
from paddlenlp import Taskflow
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--model_path', type=str, default=None, help='the model path to be loaded for question_generation taskflow')
parser.add_argument('--max_length', type=int, default=50, help='the max decoding length')
    parser.add_argument('--num_return_sequences', type=int, default=3, help='the number of returned sequences for each input sample; must not exceed num_beams')
    parser.add_argument('--source_file_path', type=str, default=None, help='the source json file path')
parser.add_argument('--target_file_path', type=str, default=None, help='the target json file path')
    parser.add_argument('--all_sample_num', type=int, default=None, help='the maximum number of samples to process; if None, process all samples')
parser.add_argument('--batch_size', type=int, default=1, help='the batch size when using taskflow')
parser.add_argument('--decode_strategy', type=str, default=None, help='the decode strategy')
parser.add_argument('--num_beams', type=int, default=6, help='the number of beams when using beam search')
parser.add_argument('--num_beam_groups', type=int, default=1, help='the number of beam groups when using diverse beam search')
parser.add_argument('--diversity_rate', type=float, default=0.0, help='the diversity_rate when using diverse beam search')
parser.add_argument('--top_k', type=float, default=0, help='the top_k when using sampling decoding strategy')
parser.add_argument('--top_p', type=float, default=1.0, help='the top_p when using sampling decoding strategy')
parser.add_argument('--temperature', type=float, default=1.0, help='the temperature when using sampling decoding strategy')
args = parser.parse_args()
return args
# yapf: enable
def create_fake_question(json_file, out_json, num_return_sequences, all_sample_num=None, batch_size=8):
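    """Generate synthetic questions for (context, answer) pairs read from json_file and write them to out_json."""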
with open(json_file, "r", encoding="utf-8") as rf, open(out_json, "w", encoding="utf-8") as wf:
all_lines = rf.readlines()
num_all_lines = len(all_lines)
context_buffer = []
answer_buffer = []
true_question_buffer = []
for i, json_line in enumerate(tqdm(all_lines)):
line_dict = json.loads(json_line)
q = line_dict["question"]
a = line_dict["answer"]
c = line_dict["context"]
context_buffer += [c]
answer_buffer += [a]
true_question_buffer += [q]
            if (
                (i + 1) % batch_size == 0
                or (all_sample_num and (i + 1) == all_sample_num)
                or (i + 1) == num_all_lines
            ):
result_buffer = question_generation(
[{"context": context, "answer": answer} for context, answer in zip(context_buffer, answer_buffer)]
)
context_buffer_temp, answer_buffer_temp, true_question_buffer_temp = [], [], []
for context, answer, true_question in zip(context_buffer, answer_buffer, true_question_buffer):
context_buffer_temp += [context] * num_return_sequences
answer_buffer_temp += [answer] * num_return_sequences
true_question_buffer_temp += [true_question] * num_return_sequences
result_one_two_buffer = [(one, two) for one, two in zip(result_buffer[0], result_buffer[1])]
for context, answer, true_question, result in zip(
context_buffer_temp, answer_buffer_temp, true_question_buffer_temp, result_one_two_buffer
):
                    fake_questions_tokens = [result[0]]
                    fake_questions_scores = [result[1]]
                    for fake_questions_token, fake_questions_score in zip(
                        fake_questions_tokens, fake_questions_scores
                    ):
                        out_dict = {
                            "context": context,
                            "answer": answer,
                            "question": fake_questions_token,
                            "true_question": true_question,
                            "score": fake_questions_score,
                        }
}
wf.write(json.dumps(out_dict, ensure_ascii=False) + "\n")
context_buffer = []
answer_buffer = []
true_question_buffer = []
if all_sample_num and (i + 1) >= all_sample_num:
break
if __name__ == "__main__":
args = parse_args()
question_generation = Taskflow(
"question_generation",
task_path=args.model_path,
output_scores=True,
max_length=args.max_length,
is_select_from_num_return_sequences=False,
num_return_sequences=args.num_return_sequences,
batch_size=args.batch_size,
decode_strategy=args.decode_strategy,
num_beams=args.num_beams,
num_beam_groups=args.num_beam_groups,
diversity_rate=args.diversity_rate,
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
)
create_fake_question(
args.source_file_path, args.target_file_path, args.num_return_sequences, args.all_sample_num, args.batch_size
)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--do_create_test_qq_pair", action='store_true', help="Whether to do create_test_qq_pair")
parser.add_argument('--qq_pair_source_ori_file_path', type=str, default=None, help='the original source file path for qq-pair creating')
parser.add_argument('--qq_pair_source_trans_file_path', type=str, default=None, help='the translated source file path for qq-pair creating')
parser.add_argument('--qq_pair_target_file_path', type=str, default=None, help='the target file path for qq-pair creating')
parser.add_argument('--trans_query_answer_path', type=str, default=None, help='the target query-answer file path for extract_trans_from_fake_question')
    parser.add_argument('--dev_sample_num', type=int, default=None, help='the number of dev samples; if None, treat all lines as dev samples')
args = parser.parse_args()
return args
# yapf: enable
def extract_q_from_json_file(json_file, out_file=None, test_sample_num=None, query_answer_path=None):
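    """Extract questions (and optionally tab-separated question-answer pairs) from a JSON-lines file."""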
with open(json_file, "r", encoding="utf-8") as rf:
if out_file:
            wf = open(out_file, "w", encoding="utf-8")
if query_answer_path:
            query_answer_wf = open(query_answer_path, "w", encoding="utf-8")
q_list = []
for i, json_line in enumerate(rf.readlines()):
line_dict = json.loads(json_line)
if isinstance(line_dict["question"], list):
question = line_dict["question"][0]
else:
question = line_dict["question"]
answer = line_dict["answer"]
if not test_sample_num or i < test_sample_num:
if query_answer_path:
                    query_answer_wf.write(
question.replace("\n", " ").replace("\t", " ").strip()
+ "\t"
+ answer.replace("\n", " ").replace("\t", " ").strip()
+ "\n"
)
if out_file:
wf.write(question.replace("\n", " ").replace("\t", " ").strip() + "\n")
q_list.append(question.strip())
else:
break
        if query_answer_path:
            query_answer_wf.close()
        if out_file:
            wf.close()
return q_list
def create_test_qq_pair(
ori_path=None, trans_path=None, write_path=None, trans_query_answer_path=None, test_sample_num=None
):
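    """Write tab-separated (translated question, original question) pairs to write_path."""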
assert trans_path
trans_rf = open(trans_path, "r", encoding="utf-8")
wf = open(write_path, "w", encoding="utf-8")
if trans_path.endswith(".json"):
trans_q_list = extract_q_from_json_file(trans_path, None, test_sample_num, trans_query_answer_path)
else:
trans_q_list = [
line.strip() for i, line in enumerate(trans_rf.readlines()) if not test_sample_num or i < test_sample_num
]
if not ori_path or ori_path in ["NONE", "None", "none"]:
origin_q_list = ["-" for _ in range(len(trans_q_list))]
else:
origin_rf = open(ori_path, "r", encoding="utf-8")
if ori_path.endswith(".json"):
origin_q_list = extract_q_from_json_file(ori_path, None, test_sample_num)
else:
origin_q_list = [
line.strip()
for i, line in enumerate(origin_rf.readlines())
if not test_sample_num or i < test_sample_num
]
for origin, trans in zip(origin_q_list, trans_q_list):
wf.write(
trans.replace("\n", " ").replace("\t", " ").strip()
+ "\t"
+ origin.replace("\n", " ").replace("\t", " ").strip()
+ "\n"
)
    if ori_path and ori_path not in ["NONE", "None", "none"]:
        origin_rf.close()
trans_rf.close()
wf.close()
if __name__ == "__main__":
args = parse_args()
if args.do_create_test_qq_pair:
create_test_qq_pair(
ori_path=args.qq_pair_source_ori_file_path,
trans_path=args.qq_pair_source_trans_file_path,
write_path=args.qq_pair_target_file_path,
trans_query_answer_path=args.trans_query_answer_path,
test_sample_num=args.dev_sample_num,
)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
def json_format_indent(json_file, output_json):
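    """Convert a JSON-lines file into a single indented JSON file of the form {"data": [...]}."""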
with open(output_json, "w", encoding="utf-8") as wf:
with open(json_file, "r", encoding="utf-8") as rf:
all_lines = []
for json_line in rf:
line_dict = json.loads(json_line)
all_lines.append(line_dict)
output_dataset = {"data": all_lines}
json.dump(output_dataset, wf, ensure_ascii=False, indent="\t")
if __name__ == "__main__":
json_format_indent("", "")
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import multiprocessing
import os
import time
from tqdm import tqdm
from tqdm.contrib import tzip
from paddlenlp.metrics import BLEU
from paddlenlp.transformers import BasicTokenizer
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--true_file_path', type=str, default=None, help='the source json file path')
parser.add_argument('--generate_file_path', type=str, default=None, help='the target json file path')
    parser.add_argument('--num_return_sequences', type=int, default=3, help='the number of returned sequences for each input sample; must not exceed num_beams')
parser.add_argument('--all_sample_num', type=int, default=None, help='the number of valid sample')
parser.add_argument('--bleu_n_size', type=int, default=4, help='the bleu n size')
parser.add_argument('--bleu_threshold', type=float, default=0.3, help='the bleu threshold')
parser.add_argument("--do_log_file", action="store_true", help="is log analysis file")
parser.add_argument('--log_dir', type=str, default=None, help='the log dir')
parser.add_argument("--do_multiprocessing", action="store_true", help="is do multiprocessing")
parser.add_argument("--do_map_async", action="store_true", help="is use map_async or apply_async when do multiprocessing")
args = parser.parse_args()
return args
# yapf: enable
def calc_bleu_n(preds, targets, n_size=4):
assert len(preds) == len(targets), (
"The length of pred_responses should be equal to the length of "
"target_responses. But received {} and {}.".format(len(preds), len(targets))
)
bleu = BLEU(n_size=n_size)
tokenizer = BasicTokenizer()
for pred, target in zip(preds, targets):
pred_tokens = tokenizer.tokenize(pred)
target_token = tokenizer.tokenize(target)
bleu.add_inst(pred_tokens, [target_token])
return bleu.score()
def worker_apply_async(true_question, generate_question_group, bleu_n_size, bleu_threshold, i):
first_positive_pair = None
    for generate_question in generate_question_group:
        bleu_score = calc_bleu_n([generate_question], [true_question], bleu_n_size)
        if bleu_score > bleu_threshold:
            first_positive_pair = (generate_question, true_question, i)
            break
if first_positive_pair:
return (True, first_positive_pair)
else:
return (False, (generate_question_group[0], true_question))
def worker_map_async(args):
true_question, generate_question_group, bleu_n_size, bleu_threshold, i = args
first_positive_pair = None
    for generate_question in generate_question_group:
        bleu_score = calc_bleu_n([generate_question], [true_question], bleu_n_size)
        if bleu_score > bleu_threshold:
            first_positive_pair = (generate_question, true_question, i)
            break
if first_positive_pair:
return (True, first_positive_pair)
else:
return (False, (generate_question_group[0], true_question))
def coverage_rate(
true_file_path,
generate_file_path,
bleu_n_size,
bleu_threshold,
num_return_sequences,
all_sample_num=None,
is_log_file=False,
log_dir=None,
is_multiprocessing=True,
is_map_async=True,
):
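    """Return the fraction of true questions matched by at least one generated question whose BLEU score exceeds bleu_threshold."""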
true_questions = []
with open(true_file_path, "r", encoding="utf-8") as rf:
for i, json_line in enumerate(tqdm(rf.readlines())):
            if all_sample_num and i >= all_sample_num:
break
line_dict = json.loads(json_line)
true_questions.append(
line_dict["question"][0] if isinstance(line_dict["question"], list) else line_dict["question"]
)
generate_question_groups = []
with open(generate_file_path, "r", encoding="utf-8") as rf:
group = []
for i, json_line in enumerate(tqdm(rf.readlines())):
            if all_sample_num and i >= all_sample_num * num_return_sequences:
break
line_dict = json.loads(json_line)
group.append(
line_dict["question"][0] if isinstance(line_dict["question"], list) else line_dict["question"]
)
if (i + 1) % num_return_sequences == 0:
generate_question_groups.append(group)
group = []
print("true_questions", len(true_questions))
print("generate_question_groups", len(generate_question_groups))
positive = []
negative = []
if is_multiprocessing:
pool = multiprocessing.Pool(processes=30)
pool_results = []
if is_map_async:
map_async_inputs = []
i = 0
bleu_cal_time_start = time.time()
generate_question_groups = [
[
generate_question if generate_question.strip() != "" else "none"
for generate_question in generate_question_group
]
for generate_question_group in generate_question_groups
]
for true_question, generate_question_group in tzip(true_questions, generate_question_groups):
if is_multiprocessing:
if is_map_async:
map_async_inputs.append((true_question, generate_question_group, bleu_n_size, bleu_threshold, i))
else:
pool_results.append(
pool.apply_async(
worker_apply_async,
args=(true_question, generate_question_group, bleu_n_size, bleu_threshold, i),
)
)
else:
            first_positive_pair = None
            best_pair, best_score = None, 0
            for generate_question in generate_question_group:
                try:
                    bleu_score = calc_bleu_n([generate_question], [true_question], bleu_n_size)
                except BaseException:
                    print("generate_question", generate_question)
                    print("true_question", true_question)
                    continue
                if bleu_score > best_score:
                    best_pair, best_score = (generate_question, true_question), bleu_score
                if bleu_score > bleu_threshold:
                    first_positive_pair = (generate_question, true_question)
                    break
            if first_positive_pair:
                positive.append((best_pair[0], best_pair[1], best_score))
            else:
                negative.append((best_pair[0], best_pair[1], best_score))
i += 1
if is_multiprocessing:
if is_map_async:
pool_results = pool.map_async(worker_map_async, map_async_inputs)
pool.close()
pool.join()
for result in pool_results.get():
is_positive, pair = result
if is_positive:
positive.append(pair)
else:
negative.append(pair)
else:
pool.close()
pool.join()
for result in pool_results:
is_positive, pair = result.get()
if is_positive:
positive.append(pair)
else:
negative.append(pair)
bleu_cal_time_end = time.time()
print("bleu_cal_time_spend:", bleu_cal_time_end - bleu_cal_time_start)
if is_log_file and log_dir:
with open(os.path.join(log_dir, "positive_pair.txt"), "w", encoding="utf-8") as wf:
for pair in positive:
                wf.write(
                    pair[0] + "\t" + pair[1] + "\n"
                    if len(pair) == 2
                    else pair[0] + "\t" + pair[1] + "\t" + str(pair[2]) + "\n"
                )
with open(os.path.join(log_dir, "negative_pair.txt"), "w", encoding="utf-8") as wf:
for pair in negative:
                wf.write(
                    pair[0] + "\t" + pair[1] + "\n"
                    if len(pair) == 2
                    else pair[0] + "\t" + pair[1] + "\t" + str(pair[2]) + "\n"
                )
    if all_sample_num is not None:
        assert len(positive) + len(negative) == all_sample_num, (
            "the number of positive pairs "
            + str(len(positive))
            + " plus the number of negative pairs "
            + str(len(negative))
            + " should be equal to all_sample_num "
            + str(all_sample_num)
        )
return len(positive) / (len(positive) + len(negative))
if __name__ == "__main__":
args = parse_args()
rate = coverage_rate(
true_file_path=args.true_file_path,
generate_file_path=args.generate_file_path,
bleu_n_size=args.bleu_n_size,
bleu_threshold=args.bleu_threshold,
num_return_sequences=args.num_return_sequences,
all_sample_num=args.all_sample_num,
is_log_file=args.do_log_file,
log_dir=args.log_dir,
is_multiprocessing=args.do_multiprocessing,
is_map_async=args.do_map_async,
)
print("coverage rate is", rate)
# Comment Opinion Extraction and Sentiment Analysis
## 1. Scenario Overview
Sentiment analysis aims to analyze, process, summarize, and reason over subjective text that carries emotional color. It is widely used in consumption decisions, public opinion analysis, personalized recommendation, and other fields, and has high commercial value.
Built on Baidu's leading sentiment analysis technology, Shixing Fresh automatically generates dish review tags to assist users' purchase decisions and to guide its operations and procurement departments in adjusting product selection and promotion strategies; Fang.com shows home buyers and developers an intuitive view of each property's word of mouth and pins highly rated properties to the top of its recommendations; Gome built an intelligent service scoring system that cut customer service operating costs by 40% and achieved a 100% handling rate for negative feedback.
Sentiment analysis covers tasks such as sentence-level sentiment analysis, review target extraction, and opinion extraction. The best-known task is sentence-level sentiment analysis, which judges the overall sentiment of a whole sentence at a relatively coarse granularity.
When people write reviews, they often comment on several attributes of a product or service, and their judgments of each attribute may differ. Aspect-level sentiment analysis is therefore more practical in real scenarios and gives enterprise users or merchants more concrete suggestions. Consider this review of potato chips:
> 这个薯片味道真的太好了,口感很脆,只是包装很一般。 (The chips taste really great and are crispy, but the packaging is mediocre.)
The customer evaluated the chips on three attributes: taste, texture, and packaging, praising the taste and texture but criticizing the packaging. Only with this finer-grained analysis can merchants pinpoint problems and improve their products or services.
Based on these considerations, this project provides a fine-grained sentiment analysis capability: for a given text, it first extracts the comment opinions in the text and then analyzes the sentiment polarity of each opinion.
## 2. Product Features
### 2.1 System Highlights
To lower the technical barrier and share leading sentiment analysis technology with developers, the sentiment analysis system open-sourced by PaddleNLP has three highlights:
- Full task coverage
- Integrates comment opinion extraction, aspect-level sentiment classification, and other sentiment analysis capabilities, open-sources the models, and covers the whole pipeline of model training, evaluation, and inference deployment.
- Leading accuracy
- Integrates SKEP, Baidu's sentiment-knowledge-enhanced pre-trained model, which provides a unified and powerful sentiment representation for all kinds of sentiment analysis tasks.
- Strong inference performance
- To address the low inference efficiency of large pre-trained models, it open-sources the small model PP-MiniLM together with quantization strategies, greatly improving inference speed.
### 2.2 Architecture & Features
The sentiment analysis solution proposed in this project is shown in Figure 1. The process consists of two stages: a comment opinion extraction model followed by an aspect-level sentiment classification model. For a given piece of text, the former first extracts the candidate comment aspects and their corresponding opinions; the aspect, opinion, and original text are then concatenated and passed to the aspect-level sentiment classification model, which identifies the sentiment polarity of that aspect.
Note that most publicly available models are trained on general-purpose corpora and may not be particularly sensitive to sentiment information. For this reason, the project uses Baidu's SKEP pre-trained model, whose pre-training stage includes several sentiment-specific objectives. As a sentiment-dedicated model, it is better suited to the comment opinion extraction and aspect-level sentiment classification tasks described above.
In addition, this project uses the Large version of SKEP. Since enterprise users care about inference efficiency in online deployment, the project also provides a general-purpose small model, [PP-MiniLM](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/model_compression/pp-minilm), together with a quantization strategy: users can fine-tune PP-MiniLM on their sentiment datasets and then quantize it for faster inference.
<div align="center">
<img src="./imgs/sentiment_system.png" />
<p>图1 情感分析系统图<p/>
</div>
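As a minimal sketch only, the two-stage flow can be pictured as follows. Here `run_extraction` and `run_classification` are hypothetical wrappers around the two models, while `decoding` and the `tokenizer(..., text_pair=...)` call mirror the project's `demo.py` and `predict.py` scripts:
```python
# Minimal sketch of the two-stage pipeline (hypothetical helpers, for illustration).
def analyze(text, tokenizer, decoding, run_extraction, run_classification, cls_id2label):
    # Stage 1: token classification tags aspect and opinion spans in the text.
    tag_seq = run_extraction(tokenizer(list(text), is_split_into_words=True))
    aspect_opinion_pairs = decoding(text, tag_seq)  # e.g. [["味道", "好"], ...]

    results = []
    for aspect, *opinions in aspect_opinion_pairs:
        # Stage 2: concatenate aspect and opinions (order handling simplified here),
        # pair with the original text, and classify this aspect's polarity.
        aspect_text = ",".join(aspect + o for o in opinions)
        label_id = run_classification(tokenizer(aspect_text, text_pair=text))
        results.append({"aspect": aspect, "opinions": opinions,
                        "sentiment_polarity": cls_id2label[label_id]})
    return results
```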
## 3. Sentiment Analysis Results
As Figure 1 shows, the core modules of this project are comment opinion extraction and aspect-level sentiment classification. Both modules are implemented on top of the sentiment-dedicated SKEP model, and their training and testing scripts are provided in the `extraction` and `classification` directories, respectively.
The following table shows the performance of our trained comment opinion extraction model on the validation set `dev` and the test set `test`:
|Model|Dataset|Precision|Recall|F1|
| ------------ | ------------ | ------------ |-----------|------------ |
|SKEP-Large|dev|0.87095|0.90056|0.88551|
|SKEP-Large|test|0.87125|0.89944|0.88512|
The following table shows the performance of our trained aspect-level sentiment classification model on the validation set `dev` and the test set `test`:
|Model|Dataset|Precision|Recall|F1|
| ------------ | ------------ | ------------ |-----------|------------ |
|SKEP-Large|dev|0.98758|0.99251|0.99004|
|SKEP-Large|test|0.98497|0.99139|0.98817|
Given a piece of text, the full-pipeline prediction script makes it easy to obtain sentiment analysis results, as shown below.
- input_text: 蛋糕味道不错,很好吃,店家很耐心,服务也很好,很棒
- aspect: 蛋糕味道, opinions: ['不错', '好吃'], sentiment_polarity: 正向
- aspect: 店家, opinions: ['耐心'], sentiment_polarity: 正向
- aspect: 服务, opinions: ['好', '棒'], sentiment_polarity: 正向
For more implementation details of the comment opinion extraction model and the aspect-level sentiment classification model, see [extraction](extraction/README.md) and [classification](classification/README.md).
## 4. Sentiment Analysis in Practice
The complete directory structure of this project is as follows:
```
.
├── extraction          # comment opinion extraction model package
├── classification      # fine-grained sentiment classification model package
├── pp_minilm           # PP-MiniLM small-model package
├── deploy              # high-performance inference deployment package
│   ├── predict.py      # high-performance inference script
│   ├── run_predict.py  # high-performance inference command
├── imgs                # image directory
├── demo.py             # demo script for interactively trying out predictions
├── predict.py          # full-pipeline prediction script
├── export_model.py     # dynamic-to-static model export script
├── utils.py            # utility functions
├── run_demo.sh         # run the demo to quickly try out sentiment analysis
├── run_predict.sh      # full-pipeline prediction command
├── run_export_model.sh # dynamic-to-static model export command
└── README.md
```
### 4.1 Environment and Dependencies
(1) Dependencies
- python >= 3.6
- paddlenlp >= 2.2.2
- paddlepaddle-gpu >= 2.2.1
(2) Environment preparation
Before running, create the directories `data` and `checkpoints` in this directory; they hold the datasets and the saved models, respectively.
This project trains models for two stages: the comment opinion extraction model and the aspect-level sentiment classification model. For these two models we provide demo datasets: [ext_data](https://bj.bcebos.com/v1/paddlenlp/data/ext_data.tar.gz) and [cls_data](https://bj.bcebos.com/v1/paddlenlp/data/cls_data.tar.gz).
Download and extract them, then place the data files under `./data/ext_data` and `./data/cls_data`, respectively.
### 4.2 Usage
This project open-sources the trained comment opinion extraction model [ext_model](https://bj.bcebos.com/paddlenlp/models/best_ext.pdparams) and the aspect-level sentiment classification model [cls_model](https://bj.bcebos.com/paddlenlp/models/best_cls.pdparams). If needed, download them, rename each to `best.pdparams`, and place them under `./checkpoints/ext_checkpoints` and `./checkpoints/cls_checkpoints`, respectively.
Considering that different users have different needs, the project can be explored and used in the following ways.
**(1) Quick demo**
To quickly try out the sentiment analysis capability, use the provided `run_demo.sh` script for an interactive experience.
```shell
sh run_demo.sh
```
**Note**: before trying the demo, make sure the `ext_model` and `cls_model` mentioned above have been downloaded, renamed, and placed in the corresponding directories.
**(2) Batch prediction from a file**
If you have a batch of data that is inconvenient to enter sentence by sentence, use the prediction script `predict.py`, which takes a file as input. The script saves its results to the same directory as the input file; the default result file name is `sentiment_results.json`.
This mode requires the path of a test set file; you can name the file `test.txt` and place it under the `./data` directory. Note that each line of the test set file is one sentence to be predicted, for example:
- 蛋糕味道不错,很好吃,店家很耐心,服务也很好,很棒
- 酒店干净整洁,性价比很高
- 酒店环境不错,非常安静,性价比还可以
- 房间很大,环境不错
Run the following command to perform batch sentiment analysis:
```shell
sh run_predict.sh
```
**Note**: before running, make sure the `ext_model` and `cls_model` mentioned above have been downloaded, renamed, and placed in the corresponding directories.
**(3) High-performance inference**
To deploy this project in a production environment, we recommend the high-performance inference script `deploy/predict.py`, which is built on Paddle Inference.
Before using it, the saved dynamic-graph models must be converted to static graphs. The following commands convert the comment opinion extraction model and the aspect-level sentiment classification model:
```shell
sh run_export_model.sh extraction
sh run_export_model.sh classification
```
Make sure the corresponding dynamic-graph models have been downloaded or trained into the directory specified by `model_path`; the static-graph models are generated automatically at the location specified by `save_path`.
As above, high-performance inference also reads from and writes to files by default, specified by `test_path` and `save_path`. Run the following commands to perform high-performance inference with Paddle Inference:
```shell
cd deploy
sh run_predict.sh
```
**(4) Custom model training**
To train your own comment opinion extraction model, use the `ext_data` demo dataset from Section 4.1 or your own labeled business data. The training and testing code for the extraction model lives in the `extraction` directory; run training there. For more implementation details and usage, see [here](extraction/README.md).
To train your own aspect-level sentiment classification model, use the `cls_data` demo dataset from Section 4.1 or your own labeled business data. The training and testing code for the classification model lives in the `classification` directory; run training there. For more implementation details and usage, see [here](classification/README.md).
After training, if high-performance inference is needed, follow (3) to convert the models to static graphs and run inference with Paddle Inference.
### 4.3 Data Annotation
If you want to label your own business data and retrain the models with it, we recommend [doccano](https://github.com/doccano/doccano) as the annotation platform. This project also bridges annotation and training: data exported from doccano can be converted into the format the models expect with the [doccano.py](./doccano.py) script. To do so, label the data on the doccano platform according to the following rules:
<div align="center">
    <img src="./imgs/labeling_example.png" />
    <p>Figure 2: Data annotation example</p>
</div>
- On the doccano platform, define the labels Pos-Aspect, Neg-Aspect, and Opinion, where Pos-Aspect marks an aspect with positive sentiment polarity, Neg-Aspect marks an aspect with negative sentiment polarity, and Opinion marks the corresponding opinion words.
- Label the data with these labels; Figure 2 shows an annotation example.
- When annotation is finished, export a `jsonl` file from doccano, rename it `doccano.json`, and place it under the `./data` directory.
- Convert the data with the [doccano.py](./doccano.py) script, after which model training can begin.
```shell
python doccano.py \
--doccano_file ./data/doccano.json \
--save_ext_dir ./data/ext_data \
--save_cls_dir ./data/cls_data
```
**Notes:**
- By default, the [doccano.py](./doccano.py) script splits the data into train/dev/test sets by a fixed ratio.
- Each run of the [doccano.py](./doccano.py) script overwrites existing data files with the same names.
## 5. Small-Model Optimization Strategy
The experiments above use the Large version of SKEP for both the comment opinion extraction model and the aspect-level sentiment classification model. Since enterprise users care about inference efficiency in online deployment, this project also provides a solution based on [PP-MiniLM](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/model_compression/pp-minilm), a small model tailored for Chinese. PP-MiniLM offers a complete small-model optimization pipeline: task-agnostic model distillation first, followed by model pruning and quantization via [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim), effectively shrinking the model and speeding it up.
In this project, the aspect-level sentiment classification model is fine-tuned from PP-MiniLM and then quantized with PaddleSlim.
The table below compares SKEP-Large, PP-MiniLM, and quantized PP-MiniLM in both speed and accuracy. The three models score almost identically on this task's dataset, but PP-MiniLM runs about 4x faster than SKEP-Large, and the quantized PP-MiniLM nearly 8x faster. For more details, see [here](./pp_minilm/README.md).
|Model|Relative speed|Precision|Recall|F1|
| ------------ | ------------ | ------------ |-----------|------------ |
|SKEP-Large|1.00x|0.98497|0.99139|0.98817|
|PP-MiniLM|4.95x|0.98379|0.98859|0.98618|
|Quantized PP-MiniLM|8.93x|0.98312|0.98953|0.98631|
## 6. References
[1] H. Tian et al., "SKEP: Sentiment Knowledge Enhanced Pre-training for Sentiment Analysis," arXiv:2005.05635 [cs], May 2020, Accessed: Nov. 11, 2021.
# Fine-Grained Sentiment Classification Model
## 1. Design
This project performs aspect-level sentiment classification: for a given piece of text, after the comment opinion extraction model has extracted the opinions for each aspect, the sentiment polarity of each aspect can be classified individually. Concretely, the extracted aspect and opinion are concatenated, then paired with the original sentence to form one independent training example.
As shown in Figure 1, the aspect and opinion word are first concatenated into "味道好" (taste good), which is then paired with the original text and fed into the SKEP model; the vector at the "CLS" position is used for fine-grained sentiment classification. A minimal sketch of this input construction follows the figure.
<div align="center">
    <img src="../imgs/design_cls_model.png" />
    <p>Figure 1: Fine-grained sentiment classification model</p>
</div>
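The sketch below shows how such an input pair might be built with the SKEP tokenizer, mirroring the tokenizer call used in this project's training and demo scripts; the example sentence is assumed for illustration:
```python
from paddlenlp.transformers import SkepTokenizer

tokenizer = SkepTokenizer.from_pretrained("skep_ernie_1.0_large_ch")

text = "这个薯片味道真的太好了,口感很脆,只是包装很一般。"  # example sentence (assumed)
aspect_text = "味道" + "好"  # aspect concatenated with its opinion word -> "味道好"

# The aspect-opinion string is the first segment and the original sentence the
# second, encoded as: [CLS] aspect_text [SEP] text [SEP].
encoded = tokenizer(aspect_text, text_pair=text, max_seq_len=256)
print(encoded["input_ids"])
print(encoded["token_type_ids"])
```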
## 2. Project Structure
The complete directory structure of this project is as follows:
```shell
.
├── data.py          # data processing script
├── model.py         # model definition script
├── train.py         # model training script
├── evaluate.py      # model evaluation script
├── run_train.sh     # model training command
├── run_evaluate.sh  # model evaluation command
└── README.md
```
## 3. Data
The dataset for this experiment has three columns per line: label, comment opinion (aspect-opinion text), and original text. Some sample lines are shown below; a small parsing sketch follows the list.
- 1 口味清淡 口味很清淡,价格也比较公道
- 1 经济实惠 经济实惠,环境好,套餐划算
- 0 设施一般 房间大,设施一般
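As a small illustration of how one such tab-separated line is split into its three fields (mirroring `convert_example_to_feature` in `data.py`; the sample line is taken from the list above):
```python
# Split one tab-separated training line into (label, aspect_text, text).
line = "1\t口味清淡\t口味很清淡,价格也比较公道"
label, aspect_text, text = line.rstrip().split("\t")
print(int(label), aspect_text, text)
```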
Click [cls_data](https://bj.bcebos.com/v1/paddlenlp/data/cls_data.tar.gz) to download the demo data; extract it into the `data/cls_data/` folder of the parent directory.
## 4. Model Performance
The classification model was trained for 10 epochs in total, and the checkpoint with the highest evaluation F1 was selected as the best model. The table below lists the training hyperparameters. We also open-source the resulting model: click `cls_model` in the table to download it, rename it `best.pdparams`, and place it in the parent directory's `checkpoints/cls_checkpoints`.
|Model|Training configuration|MD5|
| ------------ | ------------ |-----------|
|[cls_model](https://bj.bcebos.com/paddlenlp/models/best_cls.pdparams)|<div style="width: 150pt"> learning_rate: 3e-5, batch_size: 16, max_seq_len: 256, epochs: 10 </div>|3de6ddf581e665d9b1d035c29b49778a|
We evaluated the best model on the validation set `dev` and the test set `test`; the results are shown below:
|Model|Dataset|Precision|Recall|F1|
| ------------ | ------------ | ------------ |-----------|------------ |
|SKEP-Large|dev|0.98758|0.99251|0.99004|
|SKEP-Large|test|0.98497|0.99139|0.98817|
**Note**: the numbers above come from training and testing on the full dataset, not the demo dataset.
## 5. Model Training
Run the following command to train the classification model:
```shell
sh run_train.sh
```
## 6. Model Evaluation
Run the following command to evaluate the classification model:
```shell
sh run_evaluate.sh
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def load_dict(dict_path):
with open(dict_path, "r", encoding="utf-8") as f:
words = [word.strip() for word in f.readlines()]
word2id = dict(zip(words, range(len(words))))
id2word = dict((v, k) for k, v in word2id.items())
return word2id, id2word
def convert_example_to_feature(example, tokenizer, label2id, max_seq_len=512, is_test=False):
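    """Convert one tab-separated example (label<TAB>aspect_text<TAB>text, or aspect_text<TAB>text at test time) into tokenized features."""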
example = example["text"].rstrip().split("\t")
if not is_test:
label = int(example[0])
aspect_text = example[1]
text = example[2]
encoded_inputs = tokenizer(aspect_text, text_pair=text, max_seq_len=max_seq_len, return_length=True)
encoded_inputs["label"] = label
else:
aspect_text = example[0]
text = example[1]
encoded_inputs = tokenizer(aspect_text, text_pair=text, max_seq_len=max_seq_len, return_length=True)
return encoded_inputs
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from functools import partial
import paddle
from data import convert_example_to_feature, load_dict
from datasets import load_dataset
from tqdm import tqdm
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.metrics.glue import AccuracyAndF1
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer
def evaluate(model, data_loader, metric):
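    """Evaluate the model on data_loader and return (accuracy, precision, recall, f1)."""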
model.eval()
metric.reset()
for batch_data in tqdm(data_loader):
input_ids, token_type_ids, labels = batch_data["input_ids"], batch_data["token_type_ids"], batch_data["labels"]
logits = model(input_ids, token_type_ids=token_type_ids)
correct = metric.compute(logits, labels)
metric.update(correct)
accuracy, precision, recall, f1, _ = metric.accumulate()
return accuracy, precision, recall, f1
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default=None, help="The path of saved model that you want to load.")
parser.add_argument('--test_path', type=str, default=None, help="The path of test set.")
parser.add_argument("--label_path", type=str, default=None, help="The path of label dict.")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size per GPU/CPU for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
args = parser.parse_args()
    # yapf: enable
# load dev data
model_name = "skep_ernie_1.0_large_ch"
label2id, id2label = load_dict(args.label_path)
datasets = load_dataset("text", data_files={"test": args.test_path})
tokenizer = SkepTokenizer.from_pretrained(model_name)
trans_func = partial(convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=args.max_seq_len)
test_ds = datasets["test"].map(trans_func, batched=False, remove_columns=["text"])
data_collator = DataCollatorWithPadding(tokenizer)
test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False)
test_loader = paddle.io.DataLoader(test_ds, batch_sampler=test_batch_sampler, collate_fn=data_collator)
# load model
loaded_state_dict = paddle.load(args.model_path)
model = SkepForSequenceClassification.from_pretrained(model_name, num_classes=len(label2id))
model.load_dict(loaded_state_dict)
metric = AccuracyAndF1()
# evaluate on dev data
accuracy, precision, recall, f1 = evaluate(model, test_loader, metric)
print(f'evaluation result: accuracy:{accuracy:.5f} precision: {precision:.5f}, recall: {recall:.5f}, F1: {f1:.5f}')
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
python evaluate.py \
--model_path "../checkpoints/cls_checkpoints/best.pdparams" \
--test_path "../data/cls_data/test.txt" \
--label_path "../data/cls_data/label.dict" \
--batch_size 16 \
--max_seq_len 256
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
python train.py \
--train_path "../data/cls_data/train.txt" \
--dev_path "../data/cls_data/dev.txt" \
--label_path "../data/cls_data/label.dict" \
--num_epochs 5 \
--batch_size 16 \
--max_seq_len 256 \
--learning_rate 3e-5 \
--weight_decay 0.01 \
--max_grad_norm 1.0 \
--warmup_proportion 0.1 \
--log_steps 50 \
--eval_steps 100 \
--seed 1000 \
--device "gpu" \
--checkpoints "../checkpoints/cls_checkpoints"
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import warnings
from functools import partial
import numpy as np
import paddle
from data import convert_example_to_feature, load_dict
from datasets import load_dataset
from evaluate import evaluate
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.metrics.glue import AccuracyAndF1
from paddlenlp.transformers import (
LinearDecayWithWarmup,
SkepForSequenceClassification,
SkepTokenizer,
)
warnings.filterwarnings("ignore")
def set_seed(seed):
paddle.seed(seed)
random.seed(seed)
np.random.seed(seed)
def train():
# set running envir
model_name = "skep_ernie_1.0_large_ch"
paddle.set_device(args.device)
set_seed(args.seed)
if not os.path.exists(args.checkpoints):
os.mkdir(args.checkpoints)
    # load and process data
label2id, id2label = load_dict(args.label_path)
datasets = load_dataset("text", data_files={"train": args.train_path, "dev": args.dev_path})
tokenizer = SkepTokenizer.from_pretrained(model_name)
trans_func = partial(
convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=args.max_seq_len
)
train_ds = datasets["train"].map(trans_func, batched=False, remove_columns=["text"])
dev_ds = datasets["dev"].map(trans_func, batched=False, remove_columns=["text"])
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
train_batch_sampler = paddle.io.BatchSampler(train_ds, batch_size=args.batch_size, shuffle=True)
dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False)
train_loader = paddle.io.DataLoader(train_ds, batch_sampler=train_batch_sampler, collate_fn=data_collator)
dev_loader = paddle.io.DataLoader(dev_ds, batch_sampler=dev_batch_sampler, collate_fn=data_collator)
# configure model training
model = SkepForSequenceClassification.from_pretrained(model_name, num_classes=len(label2id))
num_training_steps = len(train_loader) * args.num_epochs
lr_scheduler = LinearDecayWithWarmup(
learning_rate=args.learning_rate, total_steps=num_training_steps, warmup=args.warmup_proportion
)
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=grad_clip,
)
metric = AccuracyAndF1()
# start to train model
global_step, best_f1 = 1, 0.0
model.train()
for epoch in range(1, args.num_epochs + 1):
for batch_data in train_loader():
input_ids, token_type_ids, labels = (
batch_data["input_ids"],
batch_data["token_type_ids"],
batch_data["labels"],
)
loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=labels)
loss.backward()
lr_scheduler.step()
optimizer.step()
optimizer.clear_grad()
if global_step > 0 and global_step % args.log_steps == 0:
print(f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss:{loss.item():.6f}")
if (global_step > 0 and global_step % args.eval_steps == 0) or global_step == num_training_steps:
accuracy, precision, recall, f1 = evaluate(model, dev_loader, metric)
model.train()
if f1 > best_f1:
print(f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}")
best_f1 = f1
paddle.save(model.state_dict(), f"{args.checkpoints}/best.pdparams")
print(
f"evaluation result: accuracy:{accuracy:.5f} precision: {precision:.5f}, recall: {recall:.5f}, F1: {f1:.5f}"
)
global_step += 1
paddle.save(model.state_dict(), f"{args.checkpoints}/final.pdparams")
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epochs", type=int, default=3, help="Number of epoches for training.")
parser.add_argument("--train_path", type=str, default=None, help="The path of train set.")
parser.add_argument("--dev_path", type=str, default=None, help="The path of dev set.")
parser.add_argument("--label_path", type=str, default=None, help="The path of label dict.")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size per GPU/CPU for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="The initial learning rate for optimizer.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Max grad norm to clip gradient.")
parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Linear warmup proportion over the training process.")
parser.add_argument("--log_steps", type=int, default=50, help="Frequency of printing log.")
parser.add_argument("--eval_steps", type=int, default=500, help="Frequency of performing evaluation.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--checkpoints", type=str, default=None, help="Directory to save checkpoint.")
args = parser.parse_args()
# yapf: enable
train()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
import paddle
from utils import decoding, load_dict
from paddlenlp.transformers import (
SkepForSequenceClassification,
SkepForTokenClassification,
SkepTokenizer,
)
def is_aspect_first(text, aspect, opinion_word):
return text.find(aspect) <= text.find(opinion_word)
def concate_aspect_and_opinion(text, aspect, opinion_words):
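    """Join the aspect with each opinion word (ordered by their appearance in text), comma-separated."""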
aspect_text = ""
for opinion_word in opinion_words:
if is_aspect_first(text, aspect, opinion_word):
aspect_text += aspect + opinion_word + ","
else:
aspect_text += opinion_word + aspect + ","
aspect_text = aspect_text[:-1]
return aspect_text
def format_print(results):
for result in results:
aspect, opinions, sentiment = result["aspect"], result["opinions"], result["sentiment_polarity"]
print(f"aspect: {aspect}, opinions: {opinions}, sentiment_polarity: {sentiment}")
print()
def predict(args, ext_model, cls_model, tokenizer, ext_id2label, cls_id2label):
ext_model.eval()
cls_model.eval()
while True:
input_text = input("input text: \n")
input_text = re.sub(" +", "", input_text.strip())
if not input_text:
continue
if input_text == "quit" or input_text == "exit":
break
input_text = input_text.strip().replace(" ", "")
# processing input text
encoded_inputs = tokenizer(list(input_text), is_split_into_words=True, max_seq_len=args.ext_max_seq_len)
input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
token_type_ids = paddle.to_tensor([encoded_inputs["token_type_ids"]])
# extract aspect and opinion words
logits = ext_model(input_ids, token_type_ids=token_type_ids)
predictions = logits.argmax(axis=2).numpy()[0]
tag_seq = [ext_id2label[idx] for idx in predictions][1:-1]
aps = decoding(input_text[: args.ext_max_seq_len - 2], tag_seq)
# predict sentiment for aspect with cls_model
results = []
for ap in aps:
aspect = ap[0]
opinion_words = list(set(ap[1:]))
aspect_text = concate_aspect_and_opinion(input_text, aspect, opinion_words)
encoded_inputs = tokenizer(
aspect_text, text_pair=input_text, max_seq_len=args.cls_max_seq_len, return_length=True
)
input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
token_type_ids = paddle.to_tensor([encoded_inputs["token_type_ids"]])
logits = cls_model(input_ids, token_type_ids=token_type_ids)
prediction = int(logits.argmax(axis=1))
result = {"aspect": aspect, "opinions": opinion_words, "sentiment_polarity": cls_id2label[prediction]}
results.append(result)
format_print(results)
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--ext_model_path", type=str, default=None, help="The path of extraction model path that you want to load.")
parser.add_argument("--cls_model_path", type=str, default=None, help="The path of classification model path that you want to load.")
parser.add_argument("--ext_label_path", type=str, default=None, help="The path of extraction label dict.")
parser.add_argument("--cls_label_path", type=str, default=None, help="The path of classification label dict.")
parser.add_argument("--ext_max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization for extraction model.")
parser.add_argument("--cls_max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization for classification model.")
args = parser.parse_args()
    # yapf: enable
# load dict
model_name = "skep_ernie_1.0_large_ch"
ext_label2id, ext_id2label = load_dict(args.ext_label_path)
cls_label2id, cls_id2label = load_dict(args.cls_label_path)
tokenizer = SkepTokenizer.from_pretrained(model_name)
print("label dict loaded.")
# load ext model
ext_state_dict = paddle.load(args.ext_model_path)
ext_model = SkepForTokenClassification.from_pretrained(model_name, num_classes=len(ext_label2id))
ext_model.load_dict(ext_state_dict)
print("extraction model loaded.")
# load cls model
cls_state_dict = paddle.load(args.cls_model_path)
cls_model = SkepForSequenceClassification.from_pretrained(model_name, num_classes=len(cls_label2id))
cls_model.load_dict(cls_state_dict)
print("classification model loaded.")
# do predict
predict(args, ext_model, cls_model, tokenizer, ext_id2label, cls_id2label)