Unverified commit 3f37c40a authored by philipwangOvO, committed by GitHub

[Dataset] Refactor LEval

parent 60c2d3d7
......@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalTopicRetrievalDataset
from opencompass.datasets.leval import LEvalTopicRetrievalDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi, general_postprocess
LEval_tr_reader_cfg = dict(
......@@ -16,8 +16,11 @@ LEval_tr_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt='Below is a record of our previous conversation on many different topics. You are the ASSISTANT, and I am the USER. At the beginning of each topic, the USER will say \'I would like to discuss the topic of <TOPIC>\'. Memorize each <TOPIC>. At the end of the record, I will ask you to retrieve the first/second/third topic names. Now the record start.'),
],
round=[
dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
dict(role='HUMAN', prompt='Document is as follows.\n{context}\nQuestion:{question}\nAnswer:'),
dict(role='BOT', prompt=''),
], )),
retriever=dict(type=ZeroRetriever),
......
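For readers unfamiliar with the `begin`/`round` template layout used above, the sketch below shows roughly how such a structure could expand into a chat-style message list. It is a stand-alone illustration, not OpenCompass's actual `PromptTemplate` implementation; the `render_template` helper and the sample entry are invented for demonstration.

```python
# Illustrative sketch only: how a begin/round template could be rendered into
# chat messages. Not the OpenCompass PromptTemplate implementation.

def render_template(template: dict, entry: dict) -> list:
    """Fill {placeholders} in each prompt and return a flat message list."""
    messages = []
    for section in ('begin', 'round'):
        for msg in template.get(section, []):
            messages.append({
                'role': msg['role'],
                'fallback_role': msg.get('fallback_role'),
                'content': msg['prompt'].format(**entry),
            })
    return messages

template = dict(
    begin=[dict(role='SYSTEM', fallback_role='HUMAN',
                prompt='Below is a record of our previous conversation on many different topics. ...')],
    round=[dict(role='HUMAN',
                prompt='Document is as follows.\n{context}\nQuestion:{question}\nAnswer:'),
           dict(role='BOT', prompt='')],
)

entry = {'context': 'I would like to discuss the topic of ...',
         'question': 'What is the first topic we discussed?'}
for m in render_template(template, entry):
    print(m['role'], ':', m['content'][:60])
```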
from mmengine.config import read_base
with read_base():
from .LEval_tpo_gen_bd35f4 import LEval_tpo_datasets
from .leval_tpo_gen_36a006 import LEval_tpo_datasets
......@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalTPODataset
from opencompass.datasets.leval import LEvalTPODataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi
LEval_tpo_reader_cfg = dict(
......@@ -16,8 +16,11 @@ LEval_tpo_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt='Now you are given a very long document. Please follow the instruction based on this document. For multi-choice questions, there could be a single correct option or multiple correct options. Please only provide the letter corresponding to the answer (like A or AB) when answering.'),
],
round=[
dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
dict(role='HUMAN', prompt='Document is as follows.\n{context}\nQuestion:{question}\nAnswer:'),
dict(role='BOT', prompt=''),
], )),
retriever=dict(type=ZeroRetriever),
......
from mmengine.config import read_base
with read_base():
from .leval_tvshow_summ_gen_b03798 import LEval_tvshow_summ_datasets # noqa: F401, F403
......@@ -2,10 +2,10 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalTVShowSummDataset
from opencompass.datasets.leval import LEvalGPTEvaluator, LEvalTVShowSummDataset
LEval_tvshow_summ_reader_cfg = dict(
input_columns=['context', 'question'],
input_columns=['context', 'question', 'length'],
output_column='answer',
train_split='test',
test_split='test'
......@@ -15,8 +15,11 @@ LEval_tvshow_summ_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt='Now you are given a very long document. Please follow the instruction after this document. These instructions may include summarizing a document, answering questions based on the document, or writing a required paragraph.'),
],
round=[
dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nTL;DR:'),
dict(role='HUMAN', prompt='Document is as follows. {context}\nInstruction: {question}\nAnswer this question with {length} words.'),
dict(role='BOT', prompt=''),
], )),
retriever=dict(type=ZeroRetriever),
......@@ -24,7 +27,7 @@ LEval_tvshow_summ_infer_cfg = dict(
)
LEval_tvshow_summ_eval_cfg = dict(
evaluator=dict(type=RougeEvaluator),
evaluator=dict(type=LEvalGPTEvaluator),
pred_role='BOT'
)
......
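For context, the three fragments above (reader config, inference config, and the new GPT-based evaluator config) are normally combined into a single dataset entry in the same file. The assembly below is only a sketch of what that entry might look like; the `abbr`, `path`, and `name` values are assumptions for illustration and may not match the committed config.

```python
# Hypothetical assembly of the TV-show summarization dataset entry.
# Assumes LEvalTVShowSummDataset and the *_cfg dicts are defined earlier in the file;
# abbr/path/name are placeholder values.
LEval_tvshow_summ_datasets = [
    dict(
        type=LEvalTVShowSummDataset,
        abbr='LEval_tvshow_summ',      # assumed abbreviation
        path='L4NLP/LEval',            # assumed dataset path
        name='tv_show_summ',           # assumed subset name
        reader_cfg=LEval_tvshow_summ_reader_cfg,
        infer_cfg=LEval_tvshow_summ_infer_cfg,
        eval_cfg=LEval_tvshow_summ_eval_cfg,
    )
]
```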
from mmengine.config import read_base
with read_base():
from .longbench2wikimqa.longbench_2wikimqa_gen import LongBench_2wikimqa_datasets
from .longbenchhotpotqa.longbench_hotpotqa_gen import LongBench_hotpotqa_datasets
from .longbenchmusique.longbench_musique_gen import LongBench_musique_datasets
from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets
from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets
from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets
from .longbenchnq.longbench_nq_gen import LongBench_nq_datasets
from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets
from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets
from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets
from .longbenchqmsum.longbench_qmsum_gen import LongBench_qmsum_datasets
from .longbenchvcsum.longbench_vcsum_gen import LongBench_vcsum_datasets
from .longbenchdureader.longbench_dureader_gen import LongBench_dureader_datasets
from .longbenchlcc.longbench_lcc_gen import LongBench_lcc_datasets
from .longbenchrepobench.longbench_repobench_gen import LongBench_repobench_datasets
from .longbenchpassage_retrieval_en.longbench_passage_retrieval_en_gen import LongBench_passage_retrieval_en_datasets
from .longbenchpassage_retrieval_zh.longbench_passage_retrieval_zh_gen import LongBench_passage_retrieval_zh_datasets
from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets
from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets
from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets
longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
\ No newline at end of file
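The final line of this file aggregates every imported `*_datasets` list into one flat list by scanning `locals()`. A stand-alone illustration of the same idiom, with made-up variable names, is shown below.

```python
# Stand-alone illustration of the locals()-scanning aggregation idiom.
a_datasets = [{'abbr': 'a'}]
b_datasets = [{'abbr': 'b1'}, {'abbr': 'b2'}]
unrelated = 'ignored'  # does not end with '_datasets', so it is skipped

# sum(..., []) concatenates the lists; the generator picks every local variable
# whose name ends with '_datasets'.
all_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in all_datasets])  # ['a', 'b1', 'b2']
```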
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
import torch

# long-context evaluation tasks
with read_base():
from .datasets.LEvalNaturalQuestion.LEval_naturalquestion_gen import LEval_nq_datasets
from .datasets.LEvalNarrativeQA.LEval_narrativeqa_gen import LEval_narrativeqa_datasets
from .datasets.LEvalMultidocQA.LEval_multidocqa_gen import LEval_multidocqa_datasets
from .datasets.LEvalCoursera.LEval_coursera_gen import LEval_coursera_datasets
from .datasets.LEvalTPO.LEval_tpo_gen import LEval_tpo_datasets
from .datasets.LEvalQuality.LEval_quality_gen import LEval_quality_datasets
from .datasets.LEvalGSM100.LEval_gsm100_gen import LEval_gsm100_datasets
from .datasets.LEvalTopicRetrieval.LEval_topic_retrieval_gen import LEval_tr_datasets
from .datasets.LEvalFinancialQA.LEval_financialqa_gen import LEval_financialqa_datasets
from .datasets.LEvalGovReportSumm.LEval_gov_report_summ_gen import LEval_govreport_summ_datasets
from .datasets.LEvalLegalContractQA.LEval_legalcontractqa_gen import LEval_legalqa_datasets
from .datasets.LEvalMeetingSumm.LEval_meetingsumm_gen import LEval_meetingsumm_datasets
from .datasets.LEvalNewsSumm.LEval_newssumm_gen import LEval_newssumm_datasets
from .datasets.LEvalPaperAssistant.LEval_paper_assistant_gen import LEval_ps_summ_datasets
from .datasets.LEvalPatentSumm.LEval_patent_summ_gen import LEval_patent_summ_datasets
from .datasets.LEvalTVShowSumm.LEval_tvshow_summ_gen import LEval_tvshow_summ_datasets
from .datasets.LEvalScientificQA.LEval_scientificqa_gen import LEval_scientificqa_datasets
from .datasets.LEvalReviewSumm.LEval_review_summ_gen import LEval_review_summ_datasets
# choose a model of interest
# internlm as an example
from .models.hf_internlm_7b import models
# and output the results in a chosen format
from .summarizers.LEval import summarizer
datasets = [*LEval_coursera_datasets,
*LEval_tpo_datasets,
*LEval_quality_datasets,
*LEval_gsm100_datasets,
*LEval_tr_datasets,
*LEval_financialqa_datasets,
*LEval_govreport_summ_datasets,
*LEval_legalqa_datasets,
*LEval_meetingsumm_datasets,
*LEval_multidocqa_datasets,
*LEval_narrativeqa_datasets,
*LEval_nq_datasets,
*LEval_newssumm_datasets,
*LEval_patent_summ_datasets,
*LEval_tvshow_summ_datasets,
*LEval_scientificqa_datasets,
*LEval_review_summ_datasets,
*LEval_ps_summ_datasets]
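As a design note, the same `locals()`-scanning idiom used in the LongBench config could replace this hand-written concatenation and avoid accidentally listing a dataset twice or omitting one. A sketch of that alternative (not what this config actually does):

```python
# Alternative aggregation (sketch only): collect every imported *_datasets list
# automatically instead of enumerating each one by hand.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
```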
......@@ -37,24 +37,7 @@ from .iwslt2017 import * # noqa: F401, F403
from .jigsawmultilingual import * # noqa: F401, F403
from .lambada import * # noqa: F401, F403
from .lcsts import * # noqa: F401, F403
from .LEval_coursera import * # noqa: F401, F403
from .LEval_financial_qa import * # noqa: F401, F403
from .LEval_gov_report_summ import * # noqa: F401, F403
from .LEval_gsm100 import * # noqa: F401, F403
from .LEval_legal_contract_qa import * # noqa: F401, F403
from .LEval_meeting_summ import * # noqa: F401, F403
from .LEval_multidoc_qa import * # noqa: F401, F403
from .LEval_narrattive_qa import * # noqa: F401, F403
from .LEval_natural_question import * # noqa: F401, F403
from .LEval_news_summ import * # noqa: F401, F403
from .LEval_paper_assistant import * # noqa: F401, F403
from .LEval_patent_summ import * # noqa: F401, F403
from .LEval_quality import * # noqa: F401, F403
from .LEval_review_summ import * # noqa: F401, F403
from .LEval_scientific_qa import * # noqa: F401, F403
from .LEval_topic_retrieval import * # noqa: F401, F403
from .LEval_tpo import * # noqa: F401, F403
from .LEval_tvshow_summ import * # noqa: F401, F403
from .leval import * # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .math import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
......@@ -82,7 +65,7 @@ from .triviaqa import * # noqa: F401, F403
from .triviaqarc import * # noqa: F401, F403
from .truthfulqa import * # noqa: F401, F403
from .tydiqa import * # noqa: F401, F403
from .wic import * # noqa: F401, F4
from .wic import * # noqa: F401, F403
from .winograd import * # noqa: F401, F403
from .winogrande import * # noqa: F401, F403
from .wsc import * # noqa: F401, F403
......
from .evaluators import LEvalGPTEvaluator # noqa: F401, F403
from .leval_coursera import * # noqa: F401, F403
from .leval_financial_qa import * # noqa: F401, F403
from .leval_gov_report_summ import * # noqa: F401, F403
from .leval_gsm100 import * # noqa: F401, F403
from .leval_legal_contract_qa import * # noqa: F401, F403
from .leval_meeting_summ import * # noqa: F401, F403
from .leval_multidoc_qa import * # noqa: F401, F403
from .leval_narrattive_qa import * # noqa: F401, F403
from .leval_natural_question import * # noqa: F401, F403
from .leval_news_summ import * # noqa: F401, F403
from .leval_paper_assistant import * # noqa: F401, F403
from .leval_patent_summ import * # noqa: F401, F403
from .leval_quality import * # noqa: F401, F403
from .leval_review_summ import * # noqa: F401, F403
from .leval_scientific_qa import * # noqa: F401, F403
from .leval_topic_retrieval import * # noqa: F401, F403
from .leval_tpo import * # noqa: F401, F403
from .leval_tvshow_summ import * # noqa: F401, F403
import json
from typing import List
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
from opencompass.utils.prompt import PromptList
@ICL_EVALUATORS.register_module()
class LEvalGPTEvaluator(BaseEvaluator):
"""Use OpenAI's models to evaluate prediction.
Args:
battle_model (str): The rival model name in evaluate module. Defaults
to 'turbo-16k-0613'.
evaluator_path (str): The judge model name in evaluate module. Note
that the key will be fetched from the environment variable
$OPENAI_API_KEY, as how openai defaults to be.
Defaults to 'gpt-4-0613'.
"""
def __init__(self,
battle_model: str = 'turbo-16k-0613',
evaluator_path: str = 'gpt-4-0613') -> None:
self.battle_model = battle_model
self.evaluator_path = evaluator_path
super().__init__()
def run_judge_pair(self, prompt_template, system_prompt, question,
answer_a, answer_b, reference):
from opencompass.models import OpenAI
user_prompt = prompt_template.format(question=question,
answer_a=answer_a,
answer_b=answer_b,
reference=reference)
messages = PromptList([{
'role': 'SYSTEM',
'fallback_role': 'HUMAN',
'prompt': system_prompt
}, {
'role': 'HUMAN',
'prompt': user_prompt
}])
model = OpenAI(path=self.evaluator_path,
max_seq_len=16384,
query_per_second=1,
retry=5,
temperature=0.0)
response = model._generate(input=messages,
max_out_len=2048,
temperature=0.0)
if '[[A]]' in response:
winner = 'A'
elif '[[B]]' in response:
winner = 'B'
elif '[[C]]' in response:
winner = 'tie'
else:
winner = 'error'
return winner
def score(self, predictions: List, references: List) -> dict:
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question about the content of a long document. You will be given a reference answer written by human, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Additional details or information that are not mentioned in reference answer cannot be considered as advantages and do not let them sway your judgment. Your evaluation should also consider the relevance to user's question but it is more important to avoid factual errors according to the reference answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie." # noqa
prompt_template = "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{reference}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]" # noqa
battle_samples = []
with open(
'opencompass/datasets/leval/' + self.battle_model +
'.pred.jsonl', 'r') as f:
for i, line in enumerate(f):
battle_samples.append(json.loads(line))
score = 0.
bad_case = 0
num_samples = 0
for i in range(len(predictions)):
prediction = predictions[i]
reference = references[i]
for sample in battle_samples:
if reference == sample['gt']:
question = sample['query']
battle_answer = sample[self.battle_model + '_pred']
winner = self.run_judge_pair(prompt_template,
system_prompt, question,
prediction, battle_answer,
reference)
if winner == 'A':
score += 1
elif winner == 'tie':
score += 0.5
elif winner == 'error':
bad_case += 1
winner = self.run_judge_pair(prompt_template,
system_prompt, question,
battle_answer, prediction,
reference)
if winner == 'B':
score += 1
elif winner == 'tie':
score += 0.5
elif winner == 'error':
bad_case += 1
num_samples += 2
score = score / (num_samples - bad_case) * 100
return {'score': score}
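A minimal sketch of exercising the evaluator directly is given below; in practice OpenCompass drives it through `eval_cfg` with `pred_role='BOT'`. It assumes `OPENAI_API_KEY` is set and the baseline prediction file `turbo-16k-0613.pred.jsonl` is present under `opencompass/datasets/leval/`; the prediction and reference strings are placeholders.

```python
# Hypothetical direct usage of LEvalGPTEvaluator (sketch, not the standard entry point).
from opencompass.datasets.leval import LEvalGPTEvaluator

evaluator = LEvalGPTEvaluator()  # battles 'turbo-16k-0613', judged by 'gpt-4-0613'

# The references must be LEval ground-truth answers so they match the 'gt' field
# of the baseline prediction file; the strings below are placeholders only.
predictions = ['<model output for one LEval sample>']
references = ['<ground-truth answer for the same sample>']
print(evaluator.score(predictions=predictions, references=references))
# -> {'score': ...}; each sample is judged twice with the A/B positions swapped.
```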
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......@@ -21,6 +21,7 @@ class LEvalFinancialQADataset(BaseDataset):
raw_data.append({
'question': question,
'context': context,
'length': len(answer.split()),
'answer': answer
})
dataset[split] = Dataset.from_list(raw_data)
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......@@ -21,6 +21,7 @@ class LEvalGovReportSummDataset(BaseDataset):
raw_data.append({
'question': question,
'context': context,
'length': len(answer.split()),
'answer': answer
})
dataset[split] = Dataset.from_list(raw_data)
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
from ..base import BaseDataset
@TEXT_POSTPROCESSORS.register_module('gsm100_dataset')
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......@@ -21,6 +21,7 @@ class LEvalLegalContractQADataset(BaseDataset):
raw_data.append({
'question': question,
'context': context,
'length': len(answer.split()),
'answer': answer
})
dataset[split] = Dataset.from_list(raw_data)
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......@@ -21,6 +21,7 @@ class LEvalMeetingSummDataset(BaseDataset):
raw_data.append({
'question': question,
'context': context,
'length': len(answer.split()),
'answer': answer
})
dataset[split] = Dataset.from_list(raw_data)
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......@@ -21,6 +21,7 @@ class LEvalMultidocQADataset(BaseDataset):
raw_data.append({
'question': question,
'context': context,
'length': len(answer.split()),
'answer': answer
})
dataset[split] = Dataset.from_list(raw_data)
......
......@@ -2,7 +2,7 @@ from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
from ..base import BaseDataset
@LOAD_DATASET.register_module()
......@@ -21,6 +21,7 @@ class LEvalNarrativeQADataset(BaseDataset):
raw_data.append({
'question': question,
'context': context,
'length': len(answer.split()),
'answer': answer
})
dataset[split] = Dataset.from_list(raw_data)
......
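Each loader hunk above adds a word-count `length` field next to the answer, which feeds the "Answer this question with {length} words." instruction in the summarization prompts. The sketch below shows the overall `load` pattern these classes appear to share; the class name, column names, and split are assumptions, not the exact LEval schema.

```python
# Illustrative loader following the pattern in the hunks above (sketch only).
# Class name and column names ('instructions', 'input', 'outputs') are assumed.
from datasets import Dataset, load_dataset

from opencompass.datasets.base import BaseDataset
from opencompass.registry import LOAD_DATASET


@LOAD_DATASET.register_module()
class LEvalExampleQADataset(BaseDataset):  # hypothetical class for illustration

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            question = dataset[split]['instructions'][i]  # assumed column name
            context = dataset[split]['input'][i]          # assumed column name
            answer = dataset[split]['outputs'][i]         # assumed column name
            raw_data.append({
                'question': question,
                'context': context,
                'length': len(answer.split()),  # word count used by the prompt template
                'answer': answer,
            })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
```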