Unverified commit 4dd9a3fc authored by Leymore, committed by GitHub

[Sync] sync with internal codes 20231019 (#488)

parent 2737249f
@@ -3,7 +3,9 @@ exclude: |
         tests/data/|
         opencompass/models/internal/|
         opencompass/utils/internal/|
-        opencompass/openicl/icl_evaluator/hf_metrics/
+        opencompass/openicl/icl_evaluator/hf_metrics/|
+        opencompass/datasets/lawbench/utils|
+        opencompass/datasets/lawbench/evaluation_functions/
     )
 repos:
   - repo: https://gitee.com/openmmlab/mirrors-flake8
...
from ..utils.function_utils import compute_rouge

# scene-based article prediction (情景法条识别)
def compute_cjft(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference.
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the per-example ROUGE-L F1 scores
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}

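# Illustrative usage sketch (not part of the commit): every compute_* function in
# these evaluation files reads the same three keys per example, so a minimal call
# looks like the toy input below. The example text is invented, and running it
# assumes the lawbench utils (compute_rouge) are importable from this package.
def _example_compute_cjft_usage():
    toy_data = [{
        "origin_prompt": "请根据情景写出相关法条。",                # prompt shown to the model
        "prediction": "当事人一方不履行合同义务的,应当承担违约责任。",  # model output
        "refr": "当事人一方不履行合同义务的,应当承担违约责任。",        # reference answer
    }]
    return compute_cjft(toy_data)  # -> {"score": ...}, expected to be near 1.0 for an exact match
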
from ..utils.function_utils import compute_rouge

# legal consultation (法律咨询)
def compute_flzx(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference.
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the per-example ROUGE-L F1 scores
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}

from ..utils.function_utils import compute_rouge

# statute recitation QA (法条记忆问答)
def compute_ftcs(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference.
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        answer = answer.replace("答案:", "")
        predictions.append(prediction)
        references.append(answer)

    # average the per-example ROUGE-L F1 scores
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}

from ..utils.function_utils import multi_choice_judge

"""
multi-choice single-label selection
metric: accuracy
争议焦点 (dispute focus): identify the dispute focus involved in a case
"""

def compute_jdzy(data_dict):
    """
    Compute the Accuracy
    The jdzy task has 16 possible answers for each question, stored in the option_list
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["诉讼主体", "租金情况", "利息", "本金争议", "责任认定", "责任划分", "损失认定及处理",
                   "原审判决是否适当", "合同效力", "财产分割", "责任承担", "鉴定结论采信问题", "诉讼时效", "违约", "合同解除", "肇事逃逸"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        if answer[7:-1] == "赔偿":
            # TODO: dataset imperfection
            continue
        assert answer.startswith("争议焦点类别:") and answer[7:-1] in option_list, \
            f"answer: {answer} \n question: {question}"

        answer_letter = answer[7:-1]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}

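# Hedged sketch (not part of the commit): multi_choice_judge lives in
# ..utils.function_utils; the functions in these files only rely on it returning a
# dict with "score" and "abstention". Based on the scoring rule stated in the
# docstrings, a minimal stand-in consistent with that contract could look like this.
def _multi_choice_judge_sketch(prediction, option_list, answer):
    appearing = [opt for opt in option_list if opt in prediction]
    if len(appearing) == 0:
        # no option mentioned at all: count as an abstention, score 0
        return {"score": 0, "abstention": 1}
    # correct only if the gold option appears and no other option does
    score = 1 if appearing == [answer] else 0
    return {"score": score, "abstention": 0}
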
from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
司法考试-案例分析 (judicial examination, case analysis)
"""

def compute_jec_ac(data_dict):
    """
    Compute the Accuracy
    The JEC dataset has 4 options for each question: A, B, C, D
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}"

        answer_letter = answer[5]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}

from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
司法考试 (judicial examination, knowledge questions)
"""

def compute_jec_kd(data_dict):
    """
    Compute the Accuracy
    The JEC_KD dataset has 4 options for each question: A, B, C, D
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}"

        answer_letter = answer[5]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}

import re

"""
number prediction
metric: accuracy
金额提取 (amount extraction)
"""

def compute_jetq(data_dict):
    """
    Compute the Accuracy
    We extract the total amount of money involved in the crime from the prediction and compare it with the reference.
    The prediction is correct if the amount given in the reference appears in the prediction.
    """
    score_list, abstentions = [], 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("上文涉及到的犯罪金额:"), f"answer: {answer}, question: {question}"
        assert answer.endswith("元。"), f"answer: {answer}, question: {question}"
        answer = answer.replace("上文涉及到的犯罪金额:", "")

        assert "千元" not in answer, f"answer: {answer}, question: {question}"
        assert "万" not in answer, f"answer: {answer}, question: {question}"
        # remove "元"
        answer = answer.replace("元。", "")
        answer = float(answer)

        prediction_digits = re.findall(r"\d+\.?\d*", prediction)
        prediction_digits = [float(digit) for digit in prediction_digits]

        if len(prediction_digits) == 0:
            abstentions += 1
        if answer in prediction_digits:
            score_list.append(1)
        else:
            score_list.append(0)

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}

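# Illustrative sketch (not part of the commit): how the number-matching rule above
# behaves on an invented prediction. re.findall(r"\d+\.?\d*", ...) pulls out every
# integer or decimal, so the reference amount only needs to appear somewhere in the text.
def _example_jetq_matching():
    prediction = "被告人王某共骗取人民币12000.5元,另有3名受害人。"
    digits = [float(d) for d in re.findall(r"\d+\.?\d*", prediction)]
    # digits == [12000.5, 3.0]; a reference of 12000.5 would be scored as correct,
    # while 12000.0 would not, since the check is an exact float membership test.
    return 12000.5 in digits
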
from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
论辩挖掘 (argument mining)
"""

def compute_lblj(data_dict):
    """
    Compute the Accuracy
    The LBLJ dataset has 5 options for each question: A, B, C, D, E
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D", "E"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("[正确答案]") and answer[6] in option_list, f"answer[6]: {answer}, question: {question}"

        answer_letter = answer[6]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}

from ..utils.function_utils import compute_f1_two_sets

"""
task: legal accusation prediction
metric: f1 score
法律判决预测-罪名预测 (legal judgment prediction, charge prediction)
"""

option_list = ["侮辱", "违法发放贷款", "失火", "票据诈骗", "帮助犯罪分子逃避处罚", "重大责任事故", "对非国家工作人员行贿",
               "非法制造、销售非法制造的注册商标标识", "非法制造、买卖、运输、邮寄、储存枪支、弹药、爆炸物", "非法获取公民个人信息",
               "扰乱无线电通讯管理秩序", "非法持有、私藏枪支、弹药", "拒不执行判决、裁定", "虚开发票", "巨额财产来源不明",
               "组织、领导、参加黑社会性质组织", "非法获取国家秘密", "以危险方法危害公共安全", "非法持有毒品",
               "聚众扰乱公共场所秩序、交通秩序", "包庇毒品犯罪分子", "滥伐林木", "伪造公司、企业、事业单位、人民团体印章",
               "非法占用农用地", "走私废物", "串通投标", "非法采伐、毁坏国家重点保护植物", "冒充军人招摇撞骗", "玩忽职守",
               "重婚", "招收公务员、学生徇私舞弊", "组织、领导传销活动", "非法猎捕、杀害珍贵、濒危野生动物", "侵犯著作权",
               "非法种植毒品原植物", "伪造、变造、买卖武装部队公文、证件、印章", "倒卖文物", "伪造、变造居民身份证", "滥用职权",
               "诽谤", "猥亵儿童", "非法转让、倒卖土地使用权", "挪用公款", "污染环境", "出售、购买、运输假币", "敲诈勒索",
               "高利转贷", "故意伤害", "持有、使用假币", "单位受贿", "强奸", "引诱、容留、介绍卖淫", "虐待",
               "生产、销售伪劣农药、兽药、化肥、种子", "妨害公务", "容留他人吸毒", "拐骗儿童", "强制猥亵、侮辱妇女",
               "非法处置查封、扣押、冻结的财产", "骗取贷款、票据承兑、金融票证", "强迫他人吸毒", "非法拘禁",
               "非法携带枪支、弹药、管制刀具、危险物品危及公共安全", "绑架", "聚众斗殴", "破坏计算机信息系统",
               "制造、贩卖、传播淫秽物品", "虐待被监管人", "贷款诈骗", "赌博", "徇私舞弊不征、少征税款",
               "盗窃、抢夺枪支、弹药、爆炸物、危险物质", "故意杀人", "介绍贿赂", "提供侵入、非法控制计算机信息系统程序、工具",
               "编造、故意传播虚假恐怖信息", "妨害作证", "强迫卖淫", "走私、贩卖、运输、制造毒品", "伪证", "拐卖妇女、儿童",
               "过失损坏武器装备、军事设施、军事通信", "破坏广播电视设施、公用电信设施", "洗钱", "职务侵占", "倒卖车票、船票",
               "抢劫", "侵占", "掩饰、隐瞒犯罪所得、犯罪所得收益", "徇私舞弊不移交刑事案件", "引诱、教唆、欺骗他人吸毒", "遗弃",
               "生产、销售伪劣产品", "放火", "非法采矿", "对单位行贿", "盗窃、抢夺枪支、弹药、爆炸物", "破坏易燃易爆设备",
               "妨害信用卡管理", "制作、复制、出版、贩卖、传播淫秽物品牟利", "金融凭证诈骗", "私分国有资产",
               "走私国家禁止进出口的货物、物品", "假冒注册商标", "危险物品肇事", "走私普通货物、物品", "经济犯", "虚报注册资本",
               "盗掘古文化遗址、古墓葬", "传播淫秽物品", "窝藏、包庇", "拒不支付劳动报酬", "行贿", "开设赌场", "传授犯罪方法",
               "协助组织卖淫", "保险诈骗", "破坏生产经营", "破坏交通设施", "打击报复证人", "非法侵入住宅", "非国家工作人员受贿",
               "过失致人重伤", "伪造、变造金融票证", "窝藏、转移、隐瞒毒品、毒赃", "帮助毁灭、伪造证据", "走私珍贵动物、珍贵动物制品",
               "生产、销售假药", "逃税", "挪用特定款物", "聚众扰乱社会秩序", "组织、强迫、引诱、容留、介绍卖淫", "合同诈骗",
               "非法生产、销售间谍专用器材", "破坏交通工具", "传播性病", "强迫交易", "隐匿、故意销毁会计凭证、会计帐簿、财务会计报告",
               "非法组织卖血", "强迫劳动", "破坏电力设备", "销售假冒注册商标的商品", "收买被拐卖的妇女、儿童", "诬告陷害", "脱逃",
               "非法经营", "徇私枉法", "信用卡诈骗", "生产、销售不符合安全标准的食品", "非法行医", "伪造货币", "动植物检疫徇私舞弊",
               "单位行贿", "破坏监管秩序", "盗窃", "盗伐林木", "重大劳动安全事故", "非法吸收公众存款",
               "非法制造、出售非法制造的发票", "非法狩猎", "组织卖淫", "非法买卖、运输、携带、持有毒品原植物种子、幼苗", "挪用资金",
               "诈骗", "伪造、变造、买卖国家机关公文、证件、印章", "持有伪造的发票", "贪污", "非法生产、买卖警用装备",
               "投放危险物质", "伪造、倒卖伪造的有价票证", "集资诈骗", "抢夺", "生产、销售有毒、有害食品", "非法捕捞水产品",
               "过失致人死亡", "非法买卖制毒物品", "虚开增值税专用发票、用于骗取出口退税、抵扣税款发票", "寻衅滋事", "危险驾驶",
               "故意毁坏财物", "招摇撞骗", "盗窃、侮辱尸体", "走私武器、弹药",
               "非法收购、运输、加工、出售国家重点保护植物、国家重点保护植物制品", "非法出售发票", "劫持船只、汽车",
               "受贿", "聚众哄抢", "交通肇事"]

def compute_ljp_accusation(data_dict):
    """
    Compute the F1-score
    The LJP_Accusation dataset covers a set of 189 different accusation types.
    A question may involve one or more accusation types.
    Given a list of accusation types from both the ground truth and the prediction, we compute the F1-score between
    these two lists.
    """
    score_list, abstentions = [], 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("罪名:"), f"answer: {answer} \n question: {question}"
        answer = answer.replace("罪名:", "")
        answers = answer.split(";")

        prediction_list = []
        for option in option_list:
            if option in prediction:
                prediction_list.append(option)
        if len(prediction_list) == 0:
            abstentions += 1
        gt_set = set(answers)
        pred_set = set(prediction_list)
        score = compute_f1_two_sets(gt_set, pred_set)
        score_list.append(score)

    f1_score_average = sum(score_list) / len(score_list)
    return {"score": f1_score_average, "abstention_rate": abstentions / len(data_dict)}

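# Hedged sketch (not part of the commit): compute_f1_two_sets is imported from
# ..utils.function_utils. The charge-prediction metric above only needs a set-level
# F1, so a stand-in consistent with that usage would be:
def _f1_two_sets_sketch(gt_set, pred_set):
    if len(gt_set) == 0 or len(pred_set) == 0:
        return 0
    overlap = len(gt_set & pred_set)
    precision = overlap / len(pred_set)
    recall = overlap / len(gt_set)
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
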
import re
import cn2an

"""
task: law article prediction
metric: F1 score
法律判决预测-法条预测 (legal judgment prediction, article prediction)
"""

def replace_match(match):
    return match.group(1)

def compute_ljp_article(data_dict):
    """
    Compute the F1-score
    A reference contains a list of articles of the Criminal Law of the People's Republic of China.
    We compute the F1-score between the prediction and the reference.
    """
    score_list, abstentions = [], 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("法条:刑法第"), f"answer: {answer}"
        assert answer.endswith("条"), f"answer: {answer}"

        answer = answer.replace("法条:刑法第", "")
        answer = answer.replace("条", "")

        answer_law_indices = answer.split("、")
        answer_law_index_digit_list = []

        for answer_law_index in answer_law_indices:
            assert answer_law_index.isdigit(), f"answer_law_index: {answer_law_index}"
            answer_law_index_digit = int(answer_law_index)
            assert answer_law_index_digit <= 490, "刑法总共只有490条"
            answer_law_index_digit_list.append(answer_law_index_digit)

        prediction_law_chunks = prediction.split("、")
        prediction_law_index_digit_list = []

        for prediction_law_chunk in prediction_law_chunks:
            prediction_law_chunk = prediction_law_chunk.replace("万元", "元")

            # delete phrases that start with "第" and end with "款"; they are irrelevant to the answer
            prediction_law_chunk = re.sub(r'第(.*?)款', "", prediction_law_chunk)
            # strip the surrounding "第...条" so only the article number remains, otherwise cn2an may fail to convert
            prediction_law_chunk = re.sub(r'第(.*?)条', replace_match, prediction_law_chunk)
            prediction_law_chunk = cn2an.transform(prediction_law_chunk, "cn2an")
            # find digits in prediction_law_chunk
            prediction_law_section_numbers = re.findall(r"\d+", prediction_law_chunk)
            if len(prediction_law_section_numbers) == 0:
                continue
            if len(prediction_law_section_numbers) != 1:
                # in this case, we only take the first number and discard the others
                pass
            prediction_law_index_digit = int(prediction_law_section_numbers[0])
            prediction_law_index_digit_list.append(prediction_law_index_digit)

        gt_set = set(answer_law_index_digit_list)
        pred_set = set(prediction_law_index_digit_list)
        if len(pred_set) == 0:
            abstentions += 1
        precision = len(gt_set.intersection(pred_set)) / len(pred_set) if len(pred_set) != 0 else 0
        recall = len(gt_set.intersection(pred_set)) / len(gt_set) if len(gt_set) != 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
        score_list.append(f1_score)

    # average the per-example F1 scores
    average_f1 = sum(score_list) / len(score_list)
    return {'score': average_f1, 'abstention_rate': abstentions / len(data_dict)}

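# Illustrative sketch (not part of the commit): the article extraction above relies
# on cn2an.transform(text, "cn2an") rewriting Chinese numerals inside a string into
# Arabic digits. The invented chunk below walks through the same steps used in the loop.
def _example_article_normalisation():
    chunk = "依照刑法第二百三十四条"
    chunk = re.sub(r'第(.*?)款', "", chunk)              # no "第...款" clause here, unchanged
    chunk = re.sub(r'第(.*?)条', replace_match, chunk)   # -> "依照刑法二百三十四"
    chunk = cn2an.transform(chunk, "cn2an")              # expected to become "依照刑法234"
    return re.findall(r"\d+", chunk)                     # expected: ["234"]
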
import math
import cn2an
import re

# legal judgment prediction - prison term prediction (法律判决预测-刑期预测)
def compute_ljp_imprison(data_dict):
    score_list, abstentions = [], 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        # get the answer digit, which is the number between "刑期:" and "个月"
        if "死刑" in answer or "无期" in answer:
            # TODO: data imperfection
            continue

        assert answer.startswith("刑期:") and answer.endswith("个月"), f"answer: {answer}, question: {question}"
        answer = answer.replace("刑期:", "")
        answer = answer.replace("个月", "")
        answer_digit = int(answer)
        prediction = cn2an.transform(prediction, "cn2an")

        # use regular expressions to extract digits from the prediction; only consider digits before "个月" or "月"
        prediction_digit_month_list = re.findall(r"\d+个月", prediction)
        prediction_digit_month_list = [int(digit.replace("个月", "")) for digit in prediction_digit_month_list]
        prediction_digit_month_list2 = re.findall(r"\d+月", prediction)
        prediction_digit_month_list2 = [int(digit.replace("月", "")) for digit in prediction_digit_month_list2]
        prediction_digit_month_list.extend(prediction_digit_month_list2)
        # catch the digits before "年"
        prediction_digit_year_list = re.findall(r"\d+年", prediction)
        prediction_digit_year_list = [int(digit.replace("年", "")) for digit in prediction_digit_year_list]

        if len(prediction_digit_month_list) > 0:
            prediction_digit_month = int(prediction_digit_month_list[0])
        elif len(prediction_digit_year_list) > 0:
            prediction_digit_month = int(prediction_digit_year_list[0]) * 12
        else:
            abstentions += 1
            prediction_digit_month = -1

        if prediction_digit_month != -1:
            score_list.append(abs(math.log(answer_digit + 1) - math.log(prediction_digit_month + 1)))
        else:
            score_list.append(math.log(216))

    # compute the average of score_list (log distance)
    log_distance = sum(score_list) / len(score_list)
    # normalize the score to be between 0 and 1
    log_distance = (math.log(216) - log_distance) / math.log(216)
    return {"score": log_distance, "abstention_rate": abstentions / len(data_dict)}

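# Worked example (not part of the commit): the prison-term score above is a
# normalised log distance. With a reference of 36 months and a prediction parsed as
# 36 months, the per-example distance is |log(37) - log(37)| = 0, which maps to a
# score of (log(216) - 0) / log(216) = 1.0. An abstention contributes log(216),
# which maps to a score of 0; 216 months (18 years) is the cap used for scaling.
def _example_imprison_score(answer_months=36, predicted_months=36):
    distance = abs(math.log(answer_months + 1) - math.log(predicted_months + 1))
    return (math.log(216) - distance) / math.log(216)
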
from ..utils.function_utils import compute_f1_two_sets
from ..utils.rc_f1 import CJRCEvaluator

"""
task: event detection
metric: F1 score
事件检测 (event detection)
"""

option_list = ["支付/给付", "欺骗", "搜查/扣押", "要求/请求", "卖出", "买入", "获利", "拘捕", "鉴定", "同意/接受",
               "供述", "联络", "帮助/救助", "租用/借用", "受伤", "伪造", "卖淫", "伤害人身", "赔偿", "归还/偿还"]

def compute_sjjc(data_dict):
    """
    Compute the F1-score
    The sjjc task covers 20 event types.
    A question may involve one or more event types.
    Given a list of event types from both the ground truth and the prediction, we compute the F1-score between
    these two lists.
    """
    score_list, abstentions = [], 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        answers = answer.split(";")

        prediction_list = []
        for option in option_list:
            if option in prediction:
                prediction_list.append(option)
        if len(prediction_list) == 0:
            abstentions += 1
        gt_set = set(answers)
        pred_set = set(prediction_list)
        score = compute_f1_two_sets(gt_set, pred_set)
        score_list.append(score)

    f1_score_average = sum(score_list) / len(score_list)
    return {"score": f1_score_average, "abstention_rate": abstentions / len(data_dict)}

"""
task: trigger word extraction
metric: F1 score
触发词抽取 (trigger word extraction)
"""

def compute_cfcy(data_dict):
    scores = 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        answers = answer.split(";")
        predictions = prediction.split(";")
        intersected = [CJRCEvaluator.compute_f1(r, h) for r, h in zip(answers, predictions)]

        prec = sum(intersected) / len(predictions) if len(predictions) > 0 else 0
        rec = sum(intersected) / len(answers) if len(answers) > 0 else 0
        # print(prec, rec, intersected)
        scores += 2 * prec * rec / (prec + rec + 1e-10)

    f1_score_average = scores / len(data_dict)
    return {"score": f1_score_average}

"""
task: multiple choice classification
metric: F1 score
婚姻文本分类 (marriage-related text classification)
"""

def compute_wbfl(data_dict):
    """
    A reference (R) contains a list of options, each option is from the option_list.
    We will extract the options appearing in the prediction and convert them into a set (P).
    We compute the F1 score between the prediction (P) and the reference (R).
    """
    score_list, abstentions = [], 0
    option_list = ["婚后有子女", "限制行为能力子女抚养", "有夫妻共同财产", "支付抚养费", "不动产分割", "婚后分局",
                   "二次起诉离婚", "按月给付抚养费", "准予离婚", "有夫妻共同债务", "婚前个人财产", "法定离婚", "不履行家庭义务",
                   "存在非婚生子", "适当帮助", "不履行离婚协议", "损害赔偿", "感情不和分居满二年", "子女随非抚养权人生活", "婚后个人财产"]

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("类别:") and answer.endswith("。"), f"answer: {answer}, question: {question}"

        gt_list = (answer[3:-1].split("、"))
        for gt in gt_list:
            assert gt in option_list, f"gt: {gt}, question: {question}"
        gt_set = set(gt_list)

        prediction_list = []
        for option in option_list:
            if option in prediction:
                prediction_list.append(option)
        if len(prediction_list) == 0:
            abstentions += 1
        predict_set = set(prediction_list)
        precision = len(gt_set.intersection(predict_set)) / len(predict_set) if len(predict_set) != 0 else 0
        recall = len(gt_set.intersection(predict_set)) / len(gt_set) if len(gt_set) != 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
        score_list.append(f1_score)

    # average the per-example F1 scores
    final_f1_score = sum(score_list) / len(score_list)
    return {'score': final_f1_score, 'abstention_rate': abstentions / len(data_dict)}

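# Worked example (not part of the commit): the per-example F1 above is computed over
# option sets. If the reference is {"婚后有子女", "支付抚养费"} and the prediction text
# mentions "婚后有子女" and "准予离婚", then precision = 1/2, recall = 1/2 and F1 = 0.5;
# predictions that mention none of the options are counted as abstentions.
def _example_wbfl_f1():
    gt_set = {"婚后有子女", "支付抚养费"}
    predict_set = {"婚后有子女", "准予离婚"}
    precision = len(gt_set & predict_set) / len(predict_set)
    recall = len(gt_set & predict_set) / len(gt_set)
    return 2 * precision * recall / (precision + recall)   # 0.5
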
import re
import os
import subprocess

"""
Task: legal document grammar correction
Metric: F0.5 score
文书校对 (legal document proofreading)
"""

def compute_wsjd(data_dict):
    origins, references, predictions = [], [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        if isinstance(question, list):
            question = question[0]['prompt']
        start = question.index('句子:\n') + 4
        origins.append(re.sub(r'\n|\t', '', question[start:].split('\n')[0]))
        # truncate predictions that are more than 5 characters longer than the reference
        prediction = re.sub(r'\n|\t', '', prediction)
        if len(prediction) - len(answer) > 5:
            prediction = prediction[:len(answer) + 5]
        if len(prediction) == 0:
            prediction = "无内容"
        predictions.append(prediction)
        references.append(re.sub(r'\n|\t', '', answer))

    # generate input files for ChERRANT
    preds = [f'{i} \t {origin} \t {prediction} \n' for i, (origin, prediction) in enumerate(zip(origins, predictions))]
    golds = [f'{i} \t {origin} \t {reference} \n' for i, (origin, reference) in enumerate(zip(origins, references))]

    now_path = os.path.abspath(os.getcwd())
    utils_path = os.path.abspath(os.path.join(__file__, '..', '..', 'utils'))
    uid = os.getuid()
    os.chdir(utils_path)
    with open(f'/tmp/tmp_pred_{uid}.para', 'w') as f:
        f.writelines(preds)
    with open(f'/tmp/tmp_gold_{uid}.para', 'w') as f:
        f.writelines(golds)
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    os.system(f'python3 parallel_to_m2.py -f /tmp/tmp_pred_{uid}.para -o /tmp/tmp_pred_{uid}.para.m2 -g char')
    os.system(f'python3 parallel_to_m2.py -f /tmp/tmp_gold_{uid}.para -o /tmp/tmp_gold_{uid}.para.m2 -g char')
    output = subprocess.check_output(
        f"python3 compare_m2_for_evaluation.py -hyp /tmp/tmp_pred_{uid}.para.m2 -ref /tmp/tmp_gold_{uid}.para.m2",
        shell=True)
    score = float(output.decode().split('\t')[-1].split('\n')[0])

    # remove temporary files
    os.remove(f'/tmp/tmp_pred_{uid}.para')
    os.remove(f'/tmp/tmp_gold_{uid}.para')
    os.remove(f'/tmp/tmp_pred_{uid}.para.m2')
    os.remove(f'/tmp/tmp_gold_{uid}.para.m2')
    os.chdir(now_path)
    return {"score": score}

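# Note (not part of the commit): the temporary *.para files written above appear to
# follow the "id \t source \t target" layout consumed by the ChERRANT scripts shipped
# under ../utils (parallel_to_m2.py, compare_m2_for_evaluation.py), and the final F0.5
# is parsed from the last tab-separated field of the comparison output. A single input
# line is assembled exactly like this:
def _example_para_line(i=0, origin="原句", prediction="校对后句"):
    return f'{i} \t {origin} \t {prediction} \n'   # "0 \t 原句 \t 校对后句 \n"
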
from ..utils.comprehension_scores import compute_ie_f1

"""
task: information extraction
metric: F1 score
信息抽取 (information extraction)
"""

def compute_xxcq(data_dict):
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    return compute_ie_f1(predictions, references, {"犯罪嫌疑人", "受害人", "被盗货币", "物品价值", "盗窃获利",
                                                   "被盗物品", "作案工具", "时间", "地点", "组织机构"})

from ..utils.comprehension_scores import compute_rc_f1

"""
Task: machine reading comprehension
Metric: F1 score
法律阅读理解 (legal reading comprehension)
"""

def compute_ydlj(data_dict):
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        answer = answer.replace("回答:", "")
        predictions.append(prediction)
        references.append(answer)

    f1_score = compute_rc_f1(predictions, references)
    return f1_score

from ..utils.function_utils import compute_rouge

# public opinion summarization (舆情摘要)
def compute_yqzy(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference.
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the per-example ROUGE-L F1 scores
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}

from ..utils.function_utils import multi_choice_judge

"""
task: multiple choice classification
metric: accuracy
咨询分类 (consultation classification)
"""

def compute_zxfl(data_dict):
    """
    A reference (R) is a single category from the option_list.
    We check which options appear in the prediction and count it as correct only if the
    reference category appears and no other option does (via multi_choice_judge).
    """
    score_list, abstentions = [], 0
    option_list = ['婚姻家庭', '劳动纠纷', '交通事故', '债权债务', '刑事辩护', '合同纠纷', '房产纠纷', '侵权', '公司法', '医疗纠纷',
                   '拆迁安置', '行政诉讼', '建设工程', '知识产权', '综合咨询', '人身损害', '涉外法律', '海事海商', '消费权益', '抵押担保']

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        judge = multi_choice_judge(prediction, option_list, answer)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    final_accuracy_score = sum(score_list) / len(score_list)
    return {'score': final_accuracy_score, 'abstention_rate': abstentions / len(data_dict)}