Commit 36f11110 authored by cky's avatar cky Committed by gaotong
Browse files

update datasets

parent 3cfe73de
from opencompass.models import HuggingFaceCausalLM
# Model roster for this evaluation run: a single LLaMA-7B checkpoint served
# through OpenCompass's HuggingFace causal-LM wrapper.
models = [
    dict(
        # LLaMA 7B from the community HuggingFace mirror.
        type=HuggingFaceCausalLM,
        path="decapoda-research/llama-7b-hf",
        tokenizer_path='decapoda-research/llama-7b-hf',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        # With batch_padding disabled, inference runs sample-by-sample in a
        # for-loop instead of padding inputs into a batch.
        batch_padding=False,
        run_cfg=dict(num_gpus=2, num_procs=1),
    ),
]
# Language-family groupings for the FLORES-100 benchmark. Each family maps to
# the FLORES language codes it contains; from these we derive one summary
# group per translation direction (X -> English and English -> X).
_flores_lang_map = {
    'Indo-European-Germanic': ['afr', 'dan', 'deu', 'isl', 'ltz', 'nld', 'nob', 'swe'],
    'Indo-European-Romance': ['ast', 'cat', 'fra', 'glg', 'oci', 'por', 'ron', 'spa'],
    'Indo-European-Slavic': ['bel', 'bos', 'bul', 'ces', 'hrv', 'mkd', 'pol', 'rus', 'slk', 'slv', 'srp', 'ukr'],
    'Indo-European-Indo-Aryan': ['asm', 'ben', 'guj', 'hin', 'mar', 'npi', 'ory', 'pan', 'snd', 'urd'],
    'Indo-European-Other': ['ckb', 'cym', 'ell', 'fas', 'gle', 'hye', 'ita', 'lav', 'lit', 'pus', 'tgk'],
    'Austronesian': ['ceb', 'ind', 'jav', 'mri', 'msa', 'tgl'],
    'Atlantic-Congo': ['ibo', 'kam', 'kea', 'lin', 'lug', 'nso', 'nya', 'sna', 'swh', 'umb', 'wol', 'xho', 'yor', 'zul'],
    'Afro-Asiatic': ['amh', 'ara', 'ful', 'mlt', 'orm', 'som'],
    'Turkic': ['azj', 'kaz', 'kir', 'tur', 'uzb'],
    'Dravidian': ['kan', 'mal', 'tam', 'tel'],
    'Sino-Tibetan': ['mya', 'zho_simpl', 'zho_trad'],
    'Other': ['est', 'fin', 'hau', 'heb', 'hun', 'jpn', 'kat', 'khm', 'kor', 'lao', 'luo', 'mon', 'tha', 'vie'],
}

flores_summary_groups = []
for _family, _codes in _flores_lang_map.items():
    # X -> English direction for every language in this family.
    flores_summary_groups.append({
        'name': f'flores_100_{_family}_English',
        'subsets': [f'flores_100_{_code}-eng' for _code in _codes],
    })
    # English -> X direction.
    flores_summary_groups.append({
        'name': f'flores_100_English_{_family}',
        'subsets': [f'flores_100_eng-{_code}' for _code in _codes],
    })
# Meta-Prompt
\ No newline at end of file
{# 404 error page: extends the shared site layout and fills its "body"
   block with a short "page not found" message plus a link back to the
   documentation homepage (pathto(root_doc) resolves the relative URL). #}
{% extends "layout.html" %}
{% block body %}
<h1>Page Not Found</h1>
<p>
The page you are looking for cannot be found.
</p>
<p>
If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
the content table left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
</p>
{# The issue-reporting paragraph below is intentionally disabled: the
   href is empty, so it is kept as an HTML comment until a tracker URL
   exists. #}
<!-- <p>
If you cannot find documentation you want, please <a
href="">open an issue</a> to tell us!
</p> -->
{% endblock %}
# Prompt template
\ No newline at end of file
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class cmnliDataset_V2(BaseDataset):
    """CMNLI loader that maps NLI labels onto option letters (A/B/C).

    Input is a JSON-lines file; samples labeled '-' (no annotator
    consensus) are dropped.
    """

    @staticmethod
    def load(path):
        """Read the JSONL file at *path* and return a ``datasets.Dataset``.

        Raises:
            KeyError: if a line carries a label outside
                {'entailment', 'contradiction', 'neutral', '-'}.
        """
        data = []
        # Explicit UTF-8 so decoding does not depend on the platform's
        # default locale encoding (fix: original relied on the default).
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                # '-' marks unannotated examples; skip them.
                if line['label'] == '-':
                    continue
                line['label'] = {
                    'entailment': 'A',
                    'contradiction': 'B',
                    'neutral': 'C',
                }[line['label']]
                data.append(line)
        return Dataset.from_list(data)
from datasets import concatenate_datasets, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class XLSUMDataset(BaseDataset):
    """Loads the validation split of every XL-Sum language config and
    concatenates them into one dataset."""

    @staticmethod
    def load(**kwargs):
        # Dataset path/name for load_dataset; None if the caller omitted it.
        path = kwargs.get('path', None)
        # All language configurations fetched from the hub.
        lans = [
            'oromo', 'french', 'amharic', 'arabic', 'azerbaijani', 'bengali',
            'burmese', 'chinese_simplified', 'chinese_traditional', 'welsh',
            'english', 'kirundi', 'gujarati', 'hausa', 'hindi', 'igbo',
            'indonesian', 'japanese', 'korean', 'kyrgyz', 'marathi', 'spanish',
            'scottish_gaelic', 'nepali', 'pashto', 'persian', 'pidgin',
            'portuguese', 'punjabi', 'russian', 'serbian_cyrillic',
            'serbian_latin', 'sinhala', 'somali', 'swahili', 'tamil', 'telugu',
            'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek',
            'vietnamese', 'yoruba'
        ]
        # One validation split per language, merged into a single dataset.
        per_language = [load_dataset(path, lan)['validation'] for lan in lans]
        return concatenate_datasets(per_language)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
@LOAD_DATASET.register_module()
class XsumDataset(BaseDataset):
    """JSON-lines summarization loader.

    NOTE(review): despite the Xsum name, each line is read for 'dialogue'
    and 'summary' fields (a SAMSum-like schema) -- confirm this matches
    the data files actually used.
    """

    @staticmethod
    def load(path: str, max_samples: int = 1000):
        """Load up to *max_samples* rows from the JSONL file at *path*.

        Args:
            path: Path to a JSON-lines file.
            max_samples: Cap on the number of lines read. The default of
                1000 preserves the previous hard-coded limit.

        Returns:
            A ``datasets.Dataset`` with 'dialogue' and 'summary' columns.
        """
        rows = []
        # Explicit UTF-8 (fix: original used the platform default);
        # errors='ignore' keeps the original best-effort tolerance for
        # malformed bytes.
        with open(path, 'r', encoding='utf-8', errors='ignore') as in_f:
            for i, line in enumerate(in_f):
                if i == max_samples:
                    break
                sample = json.loads(line.strip())
                dialogue = sample['dialogue']
                summary = sample['summary']
                # Non-string values (floats, presumably NaN placeholders
                # for missing fields) are skipped -- verify against the
                # data-preparation pipeline.
                if isinstance(dialogue, float) or isinstance(summary, float):
                    continue
                rows.append({'dialogue': dialogue, 'summary': summary})
        return Dataset.from_dict({
            'dialogue': [row['dialogue'] for row in rows],
            'summary': [row['summary'] for row in rows]
        })
@TEXT_POSTPROCESSORS.register_module('Xsum')
def Xsum_postprocess(text: str) -> str:
    """Keep only the first line of the model output, trimmed of
    surrounding whitespace on both the whole text and the line."""
    first_line, _, _ = text.strip().partition('\n')
    return first_line.strip()
"""Base Evaluator."""
from typing import List
class BaseEvaluator:
    """Abstract parent for evaluators; concrete subclasses implement
    :meth:`score`."""

    def __init__(self) -> None:
        # No shared state; the base class exists only to define the
        # evaluator interface.
        pass

    def score(self):
        """Compute the evaluation result. Must be overridden."""
        raise NotImplementedError("Method hasn't been implemented yet")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment