Unverified commit d4d1330a, authored by Fengzhe Zhou, committed by GitHub

[Sync] Fix cmnli, fix vicuna meta template, fix longbench postprocess and other minor fixes (#625)

parent 5329724b
Three cmnli dataset configs replace the generic HFDataset JSON loader with the new dedicated cmnliDataset:

```diff
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset
+from opencompass.datasets import cmnliDataset

 cmnli_reader_cfg = dict(
     input_columns=['sentence1', 'sentence2'],
@@ -25,11 +25,9 @@ cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
 cmnli_datasets = [
     dict(
-        type=HFDataset,
-        abbr='cmnli',
-        path='json',
-        split='train',
-        data_files='./data/CLUE/cmnli/cmnli_public/dev.json',
+        abbr="cmnli",
+        type=cmnliDataset,
+        path='./data/CLUE/cmnli/cmnli_public/dev.json',
         reader_cfg=cmnli_reader_cfg,
         infer_cfg=cmnli_infer_cfg,
         eval_cfg=cmnli_eval_cfg)
```

The same replacement appears in two more cmnli configs, where the second hunk starts at lines 41 and 45 (@@ -41,11 +41,9 @@ and @@ -45,11 +45,9 @@ respectively).
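For reference, a minimal sketch of what the old and new configs load, assuming the old HFDataset entry wrapped datasets.load_dataset('json', ...) and the new cmnliDataset parses JSON lines directly (its definition appears later in this commit):

```python
import json
from datasets import Dataset, load_dataset

path = './data/CLUE/cmnli/cmnli_public/dev.json'

# old config: HF 'json' builder over data_files, exposed under a 'train' split
old_ds = load_dataset('json', data_files=path, split='train')

# new config: cmnliDataset.load reads one JSON object per line
with open(path, 'r', encoding='utf-8') as f:
    new_ds = Dataset.from_list([json.loads(line) for line in f])
```

Both should yield the same rows for a JSON-lines file; the dedicated loader simply removes the path/split/data_files indirection of the generic builder.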
Four LongBench dataset configs import and wire the new first-line postprocessors. The lsht config is representative:

```diff
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import LongBenchClassificationEvaluator, LongBenchlshtDataset
+from opencompass.datasets import LongBenchClassificationEvaluator, LongBenchlshtDataset, lsht_postprocess

 LongBench_lsht_reader_cfg = dict(
     input_columns=['context', 'input'],
@@ -23,7 +23,8 @@ LongBench_lsht_infer_cfg = dict(
 LongBench_lsht_eval_cfg = dict(
     evaluator=dict(type=LongBenchClassificationEvaluator),
-    pred_role='BOT'
+    pred_role='BOT',
+    pred_postprocessor=dict(type=lsht_postprocess),
 )

 LongBench_lsht_datasets = [
```

The samsum, trec, and triviaqa configs receive the identical edit with their own names: samsum_postprocess (evaluator LongBenchRougeEvaluator), trec_postprocess (LongBenchClassificationEvaluator), and triviaqa_postprocess (LongBenchF1Evaluator).
Seven model configs (presumably the Vicuna family, per the commit title) gain use_fastchat_template=True; the files differ only in num_gpus (2, 1, 2, 4, 1, 1, 1):

```diff
@@ -17,6 +17,7 @@ models = [
         batch_size=8,
         model_kwargs=dict(device_map='auto'),
         batch_padding=False,  # if false, inference with for-loop without batch padding
+        use_fastchat_template=True,
         run_cfg=dict(num_gpus=2, num_procs=1)
     )
 ]
```
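A minimal sketch of what use_fastchat_template=True implies: prompts are wrapped in FastChat's conversation template for the model rather than sent raw. The strings below mirror the Vicuna v1.1 layout and are illustrative only; the actual template comes from the fastchat package at runtime.

```python
# Illustrative only: approximate Vicuna-style prompt assembly.
SYSTEM = ("A chat between a curious user and an artificial intelligence "
          "assistant. The assistant gives helpful, detailed, and polite "
          "answers to the user's questions.")

def vicuna_wrap(user_msg: str) -> str:
    # Vicuna v1.1-style single-turn layout: system preamble, then USER/ASSISTANT roles
    return f"{SYSTEM} USER: {user_msg} ASSISTANT:"

print(vicuna_wrap('Summarize the plot of Hamlet in one sentence.'))
```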
A typo fix in the tydiqa summary group ('tyidqa' -> 'tydiqa'):

```diff
 tydiqa_summary_groups = []

 _tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
-_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
+_tydiqa = ['tydiqa-goldp_' + s for s in _tydiqa]
 tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})
```
CMBDataset now attaches the 'NULL' answer placeholder to the val split instead of the test split:

```diff
@@ -18,6 +18,7 @@ class CMBDataset(BaseDataset):
         for d in val_data:
             d['option_str'] = '\n'.join(
                 [f'{k}. {v}' for k, v in d['option'].items() if len(v) > 1])
+            d['answer'] = 'NULL'
         val_dataset = Dataset.from_list(val_data)

         with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f:
@@ -25,7 +26,6 @@ class CMBDataset(BaseDataset):
         for d in test_data:
             d['option_str'] = '\n'.join(
                 [f'{k}. {v}' for k, v in d['option'].items() if len(v) > 1])
-            d['answer'] = 'NULL'
         test_dataset = Dataset.from_list(test_data)
         return DatasetDict({'val': val_dataset, 'test': test_dataset})
```
The new cmnliDataset loader is registered alongside the existing cmnliDataset_V2:

```diff
@@ -7,6 +7,19 @@ from opencompass.registry import LOAD_DATASET
 from .base import BaseDataset


+@LOAD_DATASET.register_module()
+class cmnliDataset(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        data = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                data.append(line)
+        return Dataset.from_list(data)
+
+
 @LOAD_DATASET.register_module()
 class cmnliDataset_V2(BaseDataset):
```
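A hedged usage sketch of the new loader; the call follows the staticmethod signature above, and the example path is the dev file referenced by the configs:

```python
from opencompass.datasets import cmnliDataset

# load() reads one JSON object per line and returns a datasets.Dataset
ds = cmnliDataset.load(path='./data/CLUE/cmnli/cmnli_public/dev.json')
print(len(ds), ds.column_names)  # expect columns like sentence1/sentence2/label
```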
In the LongBench dataset modules, the lsht file gains a registered first-line postprocessor:

```diff
 from datasets import Dataset, load_dataset

-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS

 from ..base import BaseDataset
@@ -28,3 +28,9 @@ class LongBenchlshtDataset(BaseDataset):
                 })
             dataset[split] = Dataset.from_list(raw_data)
         return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module()
+def lsht_postprocess(text: str) -> str:
+    text = text.lstrip('\n').split('\n')[0]
+    return text
```

The samsum and trec modules receive the identical import change and identically-bodied samsum_postprocess and trec_postprocess functions (hunks @@ -24,3 +24,9 @@ and @@ -28,3 +28,9 @@ respectively).
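A quick behavioral check of these postprocessors: each strips leading newlines and keeps only the first line of the model's reply, so trailing explanations are dropped before scoring. A minimal sketch with a hypothetical prediction string:

```python
def lsht_postprocess(text: str) -> str:
    # identical body to samsum_postprocess and trec_postprocess
    text = text.lstrip('\n').split('\n')[0]
    return text

raw = '\nsports\nBecause the article discusses a football match.'
assert lsht_postprocess(raw) == 'sports'
```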