Unverified Commit 689ffe5b authored by Fengzhe Zhou, committed by GitHub

[Feature] Use dataset in local path (#570)

* update commonsenseqa

* update drop

* update flores_first100

* update gsm8k

* update humaneval

* update lambada

* update obqa

* update piqa

* update race

* update siqa

* update story_cloze

* update strategyqa

* update tydiqa

* update winogrande

* update doc

* update hellaswag

* fix obqa

* update collections

* update .zip name
parent d6aaac22
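
All the loaders below share one pattern: read raw files from a caller-supplied local path and build a datasets.Dataset/DatasetDict by hand, instead of pulling from the HuggingFace hub via load_dataset. A minimal usage sketch, assuming the GSM8K loader is importable as opencompass.datasets.gsm8k and that ./data/gsm8k holds the two JSONL files it reads (both assumptions for illustration, not prescribed by this commit):

from opencompass.datasets.gsm8k import GSM8KDataset

# './data/gsm8k' must contain train.jsonl and test.jsonl, one JSON object per line.
ds = GSM8KDataset.load(path='./data/gsm8k')
print(ds)             # DatasetDict with 'train' and 'test' splits
print(ds['test'][0])  # one raw GSM8K record (a dict with 'question'/'answer')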
-from datasets import load_dataset
+import json
+import os
+
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -9,14 +12,33 @@ from .base import BaseDataset
 class commonsenseqaDataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def pre_process(example):
-            for i in range(5):
-                example[chr(ord('A') + i)] = example['choices']['text'][i]
-            return example
-
-        dataset = dataset.map(pre_process).remove_columns(
-            ['question_concept', 'id', 'choices'])
-        return dataset
+    def load(path):
+        dataset = {}
+        for split, stub in [
+            ['train', 'train_rand_split.jsonl'],
+            ['validation', 'dev_rand_split.jsonl'],
+        ]:
+            data_path = os.path.join(path, stub)
+            dataset_list = []
+            with open(data_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = json.loads(line)
+                    dataset_list.append({
+                        'question': line['question']['stem'],
+                        'A': line['question']['choices'][0]['text'],
+                        'B': line['question']['choices'][1]['text'],
+                        'C': line['question']['choices'][2]['text'],
+                        'D': line['question']['choices'][3]['text'],
+                        'E': line['question']['choices'][4]['text'],
+                        'answerKey': line['answerKey'],
+                    })
+            dataset[split] = Dataset.from_list(dataset_list)
+        return DatasetDict(dataset)
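
Judging from the field accesses in the new loader, each input line follows the official CommonsenseQA JSONL layout; a sketch with invented values (only the structure is grounded in the code above):

import json

record = json.loads(
    '{"answerKey": "A",'
    ' "question": {"stem": "Where would you find a shelf of books?",'
    ' "choices": [{"label": "A", "text": "library"},'
    ' {"label": "B", "text": "garage"},'
    ' {"label": "C", "text": "river"},'
    ' {"label": "D", "text": "desert"},'
    ' {"label": "E", "text": "oven"}]}}')
print(record['question']['stem'])                # -> 'question' column
print(record['question']['choices'][0]['text'])  # -> column 'A'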
-from datasets import DatasetDict, load_dataset
+import json
+
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -9,21 +11,37 @@ from .base import BaseDataset
 class dropDataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs, split='validation')
-
-        def pre_process(example):
-            example['answers'] = example['answers_spans']['spans']
-            example['prompt'] = example.pop('passage')
-            return example
-
-        def only_number(example):
-            for i in example['answers_spans']['types']:
-                if i == 'number':
-                    return True
-            return False
-
-        dataset = dataset.filter(only_number)
-        dataset = dataset.map(pre_process).remove_columns(
-            ['section_id', 'query_id'])
-        return DatasetDict({'validation': dataset})
+    def get_answers(validated_answers):
+        answers = []
+        for answer_item in validated_answers:
+            if answer_item['number']:
+                answers.append(answer_item['number'])
+            elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
+                d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
+                answers.append(' '.join(d).strip())
+            else:
+                for span in answer_item['spans']:
+                    answers.append(span)
+        answers = list(set(answers))
+        return answers
+
+    @staticmethod
+    def load(path, only_number=True):
+        with open(path, 'r', encoding='utf-8') as f:
+            lines = json.load(f)
+        dataset_list = []
+        for line in lines.values():
+            for qa_pair in line['qa_pairs']:
+                validated_answers = qa_pair['validated_answers']
+                if only_number and not any(i['number'] for i in validated_answers):
+                    continue
+                item = {
+                    'prompt': line['passage'],
+                    'question': qa_pair['question'],
+                    'answers': dropDataset.get_answers(validated_answers),
+                }
+                dataset_list.append(item)
+        dataset_list = Dataset.from_list(dataset_list)
+        return DatasetDict({'validation': dataset_list})
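
A quick sketch of what get_answers yields for a hand-built validated_answers list; the number/date/spans entry structure comes from the code above, the values are invented, and the import path opencompass.datasets.drop is an assumption:

from opencompass.datasets.drop import dropDataset

validated_answers = [
    {'number': '3', 'date': {'day': '', 'month': '', 'year': ''}, 'spans': []},
    {'number': '', 'date': {'day': '5', 'month': 'May', 'year': ''}, 'spans': []},
    {'number': '', 'date': {'day': '', 'month': '', 'year': ''}, 'spans': ['three']},
]
# A number wins over a date, which wins over plain spans; set() removes duplicates.
print(dropDataset.get_answers(validated_answers))  # ['3', '5 May', 'three'] in any order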
+import os
 import re

-from datasets import DatasetDict, load_dataset
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS

@@ -11,15 +12,30 @@ from .base import BaseDataset
 class FloresFirst100Dataset(BaseDataset):

     @staticmethod
-    def load(name):
-        return DatasetDict({
-            'dev':
-            load_dataset(path='facebook/flores', name=name, split='dev'),
-            'devtest':
-            load_dataset(path='facebook/flores',
-                         name=name,
-                         split='devtest[:100]')
-        })
+    def load_single(src_path, tgt_path, src_lang, tgt_lang):
+        with open(src_path, 'r', encoding='utf-8') as f:
+            src_lines = f.readlines()
+        with open(tgt_path, 'r', encoding='utf-8') as f:
+            tgt_lines = f.readlines()
+        assert len(src_lines) == len(tgt_lines)
+        dataset_list = [{
+            f'sentence_{src_lang}': src_lines[i].strip(),
+            f'sentence_{tgt_lang}': tgt_lines[i].strip(),
+        } for i in range(len(src_lines))]
+        return Dataset.from_list(dataset_list)
+
+    @staticmethod
+    def load(path, name):
+        src_lang, tgt_lang = name.split('-')
+        dev_dataset = FloresFirst100Dataset.load_single(
+            os.path.join(path, 'dev', f'{src_lang}.dev'),
+            os.path.join(path, 'dev', f'{tgt_lang}.dev'), src_lang, tgt_lang)
+        devtest_dataset = FloresFirst100Dataset.load_single(
+            os.path.join(path, 'devtest', f'{src_lang}.devtest'),
+            os.path.join(path, 'devtest', f'{tgt_lang}.devtest'), src_lang,
+            tgt_lang)
+        return DatasetDict({'dev': dev_dataset, 'devtest': devtest_dataset})

 @TEXT_POSTPROCESSORS.register_module('flores')
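
Usage-wise, name keeps the 'src-tgt' form and is split on the hyphen; the files are then expected at dev/{lang}.dev and devtest/{lang}.devtest under path. A sketch, assuming the class is importable from opencompass.datasets.flores and using FLORES-200 style codes purely as an example:

from opencompass.datasets.flores import FloresFirst100Dataset

# Expects ./data/flores_first100/dev/eng_Latn.dev, dev/zho_Hans.dev,
# devtest/eng_Latn.devtest and devtest/zho_Hans.devtest.
ds = FloresFirst100Dataset.load(path='./data/flores_first100',
                                name='eng_Latn-zho_Hans')
print(ds['dev'][0]['sentence_eng_Latn'])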
+import json
+import os
+
+from datasets import Dataset, DatasetDict
+
 from opencompass.openicl import BaseEvaluator
-from opencompass.registry import TEXT_POSTPROCESSORS
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class GSM8KDataset(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        datasets = {}
+        for split in ['train', 'test']:
+            split_path = os.path.join(path, split + '.jsonl')
+            dataset = []
+            with open(split_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = json.loads(line.strip())
+                    line['answer']  # raises KeyError early if a record lacks an answer
+                    dataset.append(line)
+            datasets[split] = Dataset.from_list(dataset)
+        return DatasetDict(datasets)

 @TEXT_POSTPROCESSORS.register_module('gsm8k_dataset')
-from datasets import Dataset, load_dataset
+import json
+
+from datasets import Dataset

 from opencompass.registry import LOAD_DATASET

@@ -11,15 +11,20 @@ from .base import BaseDataset
 class hellaswagDataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            for i in range(4):
-                example[chr(ord('A') + i)] = example['endings'][i]
-            return example
-
-        dataset = dataset.map(preprocess).remove_columns(['endings'])
+    def load(path):
+        dataset = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                data = json.loads(line)
+                dataset.append({
+                    'ctx': data['query'].split(': ', 2)[-1],
+                    'A': data['choices'][0],
+                    'B': data['choices'][1],
+                    'C': data['choices'][2],
+                    'D': data['choices'][3],
+                    'label': data['gold'],
+                })
+        dataset = Dataset.from_list(dataset)
         return dataset

@@ -27,19 +32,20 @@ class hellaswagDataset_V2(BaseDataset):
 class hellaswagDataset_V2(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            for i in range(4):
-                example[chr(ord('A') + i)] = example['endings'][i]
-            if example['label']:
-                example['label'] = 'ABCD'[int(example['label'])]
-            else:
-                example['label'] = 'NULL'
-            return example
-
-        dataset = dataset.map(preprocess).remove_columns(['endings'])
+    def load(path):
+        dataset = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                data = json.loads(line)
+                dataset.append({
+                    'ctx': data['query'].split(': ', 1)[-1],
+                    'A': data['choices'][0],
+                    'B': data['choices'][1],
+                    'C': data['choices'][2],
+                    'D': data['choices'][3],
+                    'label': 'ABCD'[data['gold']],
+                })
+        dataset = Dataset.from_list(dataset)
         return dataset
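
The two hellaswag loaders differ only in the maxsplit applied to query and in whether gold is mapped to a letter ('ABCD'[3] == 'D'). Assuming queries look like 'Activity label: <topic>: <ctx>' (an assumption about the preprocessed file, not stated in the diff), the splits behave like this:

query = 'Activity label: Removing ice from car: Then, the man writes over the windshield.'
print(query.split(': ', 2)[-1])  # V1 ctx: 'Then, the man writes over the windshield.'
print(query.split(': ', 1)[-1])  # V2 ctx: 'Removing ice from car: Then, the man ...'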
+import json
 import os.path as osp
 import re
 import tempfile
 from typing import List

+from datasets import Dataset
+
 from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class HumanevalDataset(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        dataset = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                dataset.append(json.loads(line.strip()))
+        return Dataset.from_list(dataset)

 class HumanEvaluator(BaseEvaluator):
+import json
 import re
 import string

-from datasets import DatasetDict, load_dataset
+from datasets import Dataset, DatasetDict

 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

@@ -14,16 +15,12 @@ from .base import BaseDataset
 class lambadaDataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs, split='test')
-
-        def preprocess(example):
-            prompt, target = example['text'].strip().rsplit(' ', 1)
-            example['prompt'] = prompt
-            example['label'] = target
-            return example
-
-        dataset = dataset.map(preprocess)
+    def load(path):
+        dataset = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                dataset.append(json.loads(line))
+        dataset = Dataset.from_list(dataset)
         return DatasetDict({'test': dataset})
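
The old lambada loader derived prompt/label by splitting 'text' on its final word; the new one appends records verbatim, so the local JSONL presumably already materializes those fields. An illustrative record (invented values, assumed layout):

import json

rec = json.loads('{"prompt": "He glanced nervously at his", "label": "watch"}')
assert {'prompt', 'label'} <= set(rec)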
-from datasets import load_dataset
+import json
+
+from datasets import Dataset

 from opencompass.registry import LOAD_DATASET

@@ -9,33 +11,46 @@ from .base import BaseDataset
 class OBQADataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def pre_process(example):
-            for i in range(4):
-                example[chr(ord('A') + i)] = example['choices']['text'][i]
-            return example
-
-        dataset = dataset.map(pre_process).remove_columns(['id', 'choices'])
-        return dataset
+    def load(path):
+        dataset_list = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                item = {
+                    'A': line['question']['choices'][0]['text'],
+                    'B': line['question']['choices'][1]['text'],
+                    'C': line['question']['choices'][2]['text'],
+                    'D': line['question']['choices'][3]['text'],
+                    'question_stem': line['question']['stem'],
+                    'answerKey': line['answerKey'],
+                }
+                if 'fact1' in line:
+                    item['fact1'] = line['fact1']
+                dataset_list.append(item)
+        return Dataset.from_list(dataset_list)

 @LOAD_DATASET.register_module()
 class OBQADataset_V2(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def pre_process(example):
-            example['A'] = example['choices']['text'][0]
-            example['B'] = example['choices']['text'][1]
-            example['C'] = example['choices']['text'][2]
-            example['D'] = example['choices']['text'][3]
-            if not example['question_stem'].endswith('?'):
-                example['question_stem'] += ' what?'
-            return example
-
-        dataset = dataset.map(pre_process).remove_columns(['id', 'choices'])
-        return dataset
+    def load(path):
+        dataset_list = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                question = line['question']['stem']
+                if not question.endswith('?'):
+                    question += ' what?'
+                item = {
+                    'A': line['question']['choices'][0]['text'],
+                    'B': line['question']['choices'][1]['text'],
+                    'C': line['question']['choices'][2]['text'],
+                    'D': line['question']['choices'][3]['text'],
+                    'question_stem': question,
+                    'answerKey': line['answerKey'],
+                }
+                if 'fact1' in line:
+                    item['fact1'] = line['fact1']
+                dataset_list.append(item)
+        return Dataset.from_list(dataset_list)
-from datasets import load_dataset
+import json
+import os
+
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -6,45 +9,100 @@ from .base import BaseDataset
 @LOAD_DATASET.register_module()
-class piqaDataset_V2(BaseDataset):
+class piqaDataset(BaseDataset):
+
+    @staticmethod
+    def load_single(path, data_filename, label_filename):
+        data_path = os.path.join(path, data_filename)
+        label_path = os.path.join(path, label_filename)
+        dataset = []
+        with open(data_path, 'r', encoding='utf-8') as f:
+            data_lines = f.readlines()
+        with open(label_path, 'r', encoding='utf-8') as f:
+            label_lines = f.readlines()
+        assert len(data_lines) == len(label_lines)
+        for data, label in zip(data_lines, label_lines):
+            i = json.loads(data.strip())
+            i['label'] = int(label.strip())
+            dataset.append(i)
+        return Dataset.from_list(dataset)

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            assert isinstance(example['label'], int)
-            if example['label'] < 0:
-                example['answer'] = 'NULL'
-            else:
-                example['answer'] = 'AB'[example['label']]
-            example.pop('label')
-            return example
-
-        dataset = dataset.map(preprocess)
-        return dataset
+    def load(path):
+        train_dataset = piqaDataset.load_single(path, 'train.jsonl',
+                                                'train-labels.lst')
+        val_dataset = piqaDataset.load_single(path, 'dev.jsonl',
+                                              'dev-labels.lst')
+        return DatasetDict({'train': train_dataset, 'validation': val_dataset})
+
+
+@LOAD_DATASET.register_module()
+class piqaDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load_single(path, data_filename, label_filename):
+        data_path = os.path.join(path, data_filename)
+        label_path = os.path.join(path, label_filename)
+        dataset = []
+        with open(data_path, 'r', encoding='utf-8') as f:
+            data_lines = f.readlines()
+        with open(label_path, 'r', encoding='utf-8') as f:
+            label_lines = f.readlines()
+        assert len(data_lines) == len(label_lines)
+        for data, label in zip(data_lines, label_lines):
+            i = json.loads(data.strip())
+            label = int(label.strip())
+            if label < 0:
+                i['answer'] = 'NULL'
+            else:
+                i['answer'] = 'AB'[label]
+            dataset.append(i)
+        return Dataset.from_list(dataset)
+
+    @staticmethod
+    def load(path):
+        train_dataset = piqaDataset_V2.load_single(path, 'train.jsonl',
+                                                   'train-labels.lst')
+        val_dataset = piqaDataset_V2.load_single(path, 'dev.jsonl',
+                                                 'dev-labels.lst')
+        return DatasetDict({'train': train_dataset, 'validation': val_dataset})

 @LOAD_DATASET.register_module()
 class piqaDataset_V3(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            example['goal'] = example['goal'][0].upper() + example['goal'][1:]
-            if example['goal'].endswith('?') or example['goal'].endswith('.'):
-                example['sol1'] = example['sol1'][0].upper() + example['sol1'][1:]
-                example['sol2'] = example['sol2'][0].upper() + example['sol2'][1:]
-            else:
-                example['sol1'] = example['sol1'][0].lower() + example['sol1'][1:]
-                example['sol2'] = example['sol2'][0].lower() + example['sol2'][1:]
-            return example
-
-        dataset = dataset.map(preprocess)
-        return dataset
+    def load_single(path, data_filename, label_filename):
+        data_path = os.path.join(path, data_filename)
+        label_path = os.path.join(path, label_filename)
+        dataset = []
+        with open(data_path, 'r', encoding='utf-8') as f:
+            data_lines = f.readlines()
+        with open(label_path, 'r', encoding='utf-8') as f:
+            label_lines = f.readlines()
+        assert len(data_lines) == len(label_lines)
+        for data, label in zip(data_lines, label_lines):
+            i = json.loads(data.strip())
+            i['label'] = int(label.strip())
+            # some preprocessing
+            i['goal'] = i['goal'][0].upper() + i['goal'][1:]
+            if i['goal'].endswith('?') or i['goal'].endswith('.'):
+                i['sol1'] = i['sol1'][0].upper() + i['sol1'][1:]
+                i['sol2'] = i['sol2'][0].upper() + i['sol2'][1:]
+            else:
+                i['sol1'] = i['sol1'][0].lower() + i['sol1'][1:]
+                i['sol2'] = i['sol2'][0].lower() + i['sol2'][1:]
+            dataset.append(i)
+        return Dataset.from_list(dataset)
+
+    @staticmethod
+    def load(path):
+        train_dataset = piqaDataset_V3.load_single(path, 'train.jsonl',
+                                                   'train-labels.lst')
+        val_dataset = piqaDataset_V3.load_single(path, 'dev.jsonl',
+                                                 'dev-labels.lst')
+        return DatasetDict({'train': train_dataset, 'validation': val_dataset})
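
Each line of dev.jsonl pairs positionally with one line of dev-labels.lst. A toy sketch of the V2 mapping (field values invented; the logic mirrors load_single above):

import json

data_line = '{"goal": "open a jar", "sol1": "twist the lid", "sol2": "push the lid"}'
label_line = '0'

i = json.loads(data_line)
label = int(label_line.strip())
i['answer'] = 'NULL' if label < 0 else 'AB'[label]
print(i['answer'])  # 'A' -- sol1 is correct; a -1 label would give 'NULL'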
-from datasets import load_dataset
+import json
+import os
+
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -10,12 +13,21 @@ class RaceDataset(BaseDataset):

     @staticmethod
     def load(path: str, name: str):
-        dataset = load_dataset(path, name)
-
-        def preprocess(x):
-            for ans, option in zip(['A', 'B', 'C', 'D'], x['options']):
-                x[ans] = option
-            del x['options']
-            return x
-
-        return dataset.map(preprocess)
+        dataset = {}
+        for split in ['validation', 'test']:
+            jsonl_path = os.path.join(path, split, f'{name}.jsonl')
+            dataset_list = []
+            with open(jsonl_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = json.loads(line)
+                    dataset_list.append({
+                        'article': line['article'],
+                        'question': line['question'],
+                        'A': line['options'][0],
+                        'B': line['options'][1],
+                        'C': line['options'][2],
+                        'D': line['options'][3],
+                        'answer': line['answer'],
+                    })
+            dataset[split] = Dataset.from_list(dataset_list)
+        return DatasetDict(dataset)
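
Here path is the dataset root and name picks the subset file, giving the layout <path>/<split>/<name>.jsonl. A usage sketch, assuming the class is importable from opencompass.datasets.race and that the subsets are RACE's usual 'middle' and 'high' (neither is spelled out in this diff):

from opencompass.datasets.race import RaceDataset

# Reads ./data/race/validation/high.jsonl and ./data/race/test/high.jsonl.
ds = RaceDataset.load(path='./data/race', name='high')
print(ds['validation'][0]['question'])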
-from datasets import load_dataset
+import json
+import os
+
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -6,24 +9,72 @@ from .base import BaseDataset
 @LOAD_DATASET.register_module()
-class siqaDataset_V2(BaseDataset):
+class siqaDataset(BaseDataset):
+    """Disconnect from HuggingFace version of HFDataset."""

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            example['all_labels'] = {
-                'candidates': [
-                    f'A. {example["answerA"]}',
-                    f'B. {example["answerB"]}',
-                    f'C. {example["answerC"]}',
-                ],
-                'label':
-                int(example['label']) - 1
-            }
-            example['label'] = ' ABC'[int(example['label'])]
-            return example
-
-        dataset = dataset.map(preprocess)
-        return dataset
+    def load_single(path, data_filename, label_filename):
+        data_path = os.path.join(path, data_filename)
+        label_path = os.path.join(path, label_filename)
+        dataset = []
+        with open(data_path, 'r', encoding='utf-8') as f:
+            data_lines = f.readlines()
+        with open(label_path, 'r', encoding='utf-8') as f:
+            label_lines = f.readlines()
+        assert len(data_lines) == len(label_lines)
+        for data, label in zip(data_lines, label_lines):
+            i = json.loads(data.strip())
+            i['label'] = int(label.strip())
+            dataset.append(i)
+        return Dataset.from_list(dataset)
+
+    @staticmethod
+    def load(path):
+        train_dataset = siqaDataset.load_single(path, 'train.jsonl',
+                                                'train-labels.lst')
+        val_dataset = siqaDataset.load_single(path, 'dev.jsonl',
+                                              'dev-labels.lst')
+        return DatasetDict({'train': train_dataset, 'validation': val_dataset})
+
+
+@LOAD_DATASET.register_module()
+class siqaDataset_V2(BaseDataset):
+    """Disconnect from HuggingFace version of siqaDataset_V2."""
+
+    @staticmethod
+    def load_single(path, data_filename, label_filename):
+        data_path = os.path.join(path, data_filename)
+        label_path = os.path.join(path, label_filename)
+        dataset = []
+        with open(data_path, 'r', encoding='utf-8') as f:
+            data_lines = f.readlines()
+        with open(label_path, 'r', encoding='utf-8') as f:
+            label_lines = f.readlines()
+        assert len(data_lines) == len(label_lines)
+        for data, label in zip(data_lines, label_lines):
+            i = json.loads(data.strip())
+            label = int(label.strip())
+            # some preprocessing
+            i['all_labels'] = {
+                'candidates': [
+                    [f'A. {i["answerA"]}', 'A', i['answerA']],
+                    [f'B. {i["answerB"]}', 'B', i['answerB']],
+                    [f'C. {i["answerC"]}', 'C', i['answerC']],
+                ],
+                'label': label - 1
+            }
+            i['label'] = ' ABC'[label]
+            dataset.append(i)
+        return Dataset.from_list(dataset)
+
+    @staticmethod
+    def load(path):
+        train_dataset = siqaDataset_V2.load_single(path, 'train.jsonl',
+                                                   'train-labels.lst')
+        val_dataset = siqaDataset_V2.load_single(path, 'dev.jsonl',
+                                                 'dev-labels.lst')
+        return DatasetDict({'train': train_dataset, 'validation': val_dataset})
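
Besides moving to local files, V2 also changes the candidate format: each entry in all_labels['candidates'] is now a three-element list instead of a single formatted string, so downstream templates can pick whichever form they need. A tiny illustration (answer text invented):

answerA = 'go back to sleep'
candidate = [f'A. {answerA}', 'A', answerA]
# The old format kept only candidate[0]; the list adds the bare letter
# and the raw answer text.
print(candidate)  # ['A. go back to sleep', 'A', 'go back to sleep']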
-from datasets import DatasetDict, load_dataset
+import json
+import os
+
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -9,38 +12,39 @@ from .base import BaseDataset
 class storyclozeDataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        # special process
-        dataset = load_dataset(**kwargs, split='train+eval')
-
-        def preprocess(example):
-            example['context'] = ' '.join([
-                example['input_sentence_1'], example['input_sentence_2'],
-                example['input_sentence_3'], example['input_sentence_4']
-            ])
-            return example
-
-        dataset = dataset.map(preprocess)
-        return DatasetDict({'test': dataset})
+    def load(path, lang):
+        dataset_list = []
+        for split in ['train', 'eval']:
+            split_path = os.path.join(path, f'{lang}_{split}.jsonl')
+            with open(split_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = json.loads(line)
+                    line['context'] = ' '.join([
+                        line['input_sentence_1'], line['input_sentence_2'],
+                        line['input_sentence_3'], line['input_sentence_4']
+                    ])
+                    dataset_list.append(line)
+        dataset_list = Dataset.from_list(dataset_list)
+        return DatasetDict({'test': dataset_list})

 @LOAD_DATASET.register_module()
 class storyclozeDataset_V2(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        # special process
-        dataset = load_dataset(**kwargs, split='train+eval')
-
-        def preprocess(example):
-            example['context'] = ' '.join([
-                example['input_sentence_1'], example['input_sentence_2'],
-                example['input_sentence_3'], example['input_sentence_4']
-            ])
-            example['answer_right_ending'] = ' AB'[
-                example['answer_right_ending']]
-            return example
-
-        dataset = dataset.map(preprocess)
-        return dataset
+    def load(path, lang):
+        dataset_list = []
+        for split in ['train', 'eval']:
+            split_path = os.path.join(path, f'{lang}_{split}.jsonl')
+            with open(split_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = json.loads(line)
+                    line['context'] = ' '.join([
+                        line['input_sentence_1'], line['input_sentence_2'],
+                        line['input_sentence_3'], line['input_sentence_4']
+                    ])
+                    line['answer_right_ending'] = ' AB'[
+                        line['answer_right_ending']]
+                    dataset_list.append(line)
+        dataset_list = Dataset.from_list(dataset_list)
+        return dataset_list
+import json
 import re

-from opencompass.registry import TEXT_POSTPROCESSORS
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+
+from .base import BaseDataset

 @TEXT_POSTPROCESSORS.register_module('strategyqa')
@@ -16,3 +21,13 @@ def strategyqa_pred_postprocess(text: str) -> str:
 @TEXT_POSTPROCESSORS.register_module('strategyqa_dataset')
 def strategyqa_dataset_postprocess(text: str) -> str:
     return 'yes' if str(text) == 'True' else 'no'
+
+
+@LOAD_DATASET.register_module()
+class StrategyQADataset(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        with open(path, 'r', encoding='utf-8') as f:
+            dataset = json.load(f)
+        return Dataset.from_list(dataset)
+import json
+import os
 import re
 from collections import Counter

-from datasets import load_dataset
+from datasets import Dataset

 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.utils.text_postprocessors import general_postprocess

@@ -12,15 +14,16 @@ from .base import BaseDataset
 class TydiQADataset(BaseDataset):

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def pre_process(example):
-            example['answer'] = example['answers']['text']
-            return example
-
-        dataset = dataset.map(pre_process).remove_columns(['id', 'answers'])
-        return dataset
+    def load(path, lang):
+        path = os.path.join(path, 'dev', f'{lang}-dev.jsonl')
+        dataset_list = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                answer = list(set([i['text'] for i in line['answers']]))
+                line['answer'] = answer
+                dataset_list.append(line)
+        return Dataset.from_list(dataset_list)

 class TydiQAEvaluator(BaseEvaluator):
-from datasets import load_dataset
+import json
+import os
+
+from datasets import Dataset

 from opencompass.registry import LOAD_DATASET

@@ -7,38 +10,49 @@ from .base import BaseDataset
 @LOAD_DATASET.register_module()
 class winograndeDataset(BaseDataset):
+    """Disconnect from Huggingface, winograndeDataset."""

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            prompt = example.pop('sentence')
-            example['opt1'] = prompt.replace('_', example.pop('option1'))
-            example['opt2'] = prompt.replace('_', example.pop('option2'))
-            return example
-
-        return dataset.map(preprocess)
+    def load(path):
+        path = os.path.join(path, 'dev.jsonl')
+        dataset_list = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                prompt = line['sentence']
+                dataset_list.append({
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': line['answer']
+                })
+        dataset_list = Dataset.from_list(dataset_list)
+        return dataset_list

 @LOAD_DATASET.register_module()
 class winograndeDataset_V2(BaseDataset):
+    """Disconnect from Huggingface, winograndeDataset_V2."""

     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
-
-        def preprocess(example):
-            prompt = example.pop('sentence')
-            example['opt1'] = prompt.replace('_', example.pop('option1'))
-            example['opt2'] = prompt.replace('_', example.pop('option2'))
-            answer = example.pop('answer')
-            if answer == '':
-                example['label'] = 'NULL'
-            else:
-                example['label'] = ' AB'[int(answer)]
-            return example
-
-        return dataset.map(preprocess)
+    def load(path):
+        path = os.path.join(path, 'dev.jsonl')
+        dataset_list = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                prompt = line['sentence']
+                answer = line['answer']
+                answer = ' AB'[int(answer)] if answer != '' else 'NULL'
+                dataset_list.append({
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': answer
+                })
+        dataset_list = Dataset.from_list(dataset_list)
+        return dataset_list
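
The '_' substitution is the heart of both winogrande loaders; a sketch on a stock Winograd-style sentence (record values invented, field names from the loader):

line = {
    'sentence': 'The trophy does not fit in the suitcase because _ is too small.',
    'option1': 'the trophy',
    'option2': 'the suitcase',
    'answer': '2',
}
print(line['sentence'].replace('_', line['option2']))  # the opt2 reading
print(' AB'[int(line['answer'])])  # 'B' -- V2 mapping; '' maps to 'NULL'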