Commit 36f11110 authored by cky's avatar cky Committed by gaotong
Browse files

update datasets

parent 3cfe73de
from opencompass.models import HuggingFaceCausalLM
# Model roster for this evaluation run: a single LLaMA-7B checkpoint served
# through OpenCompass's HuggingFace causal-LM wrapper.
models = [
    dict(
        # LLaMA 7B from the community HuggingFace mirror.
        type=HuggingFaceCausalLM,
        path="decapoda-research/llama-7b-hf",
        tokenizer_path='decapoda-research/llama-7b-hf',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(device_map='auto'),
        # With batch_padding disabled, inference runs sample-by-sample in a
        # for-loop instead of padding inputs into a batch.
        batch_padding=False,
        run_cfg=dict(num_gpus=2, num_procs=1),
    ),
]
# Language-family groupings for the FLORES-100 benchmark. Each family maps to
# the FLORES language codes it contains; from these we derive one summary
# group per translation direction (X -> English and English -> X).
_flores_lang_map = {
    'Indo-European-Germanic': ['afr', 'dan', 'deu', 'isl', 'ltz', 'nld', 'nob', 'swe'],
    'Indo-European-Romance': ['ast', 'cat', 'fra', 'glg', 'oci', 'por', 'ron', 'spa'],
    'Indo-European-Slavic': ['bel', 'bos', 'bul', 'ces', 'hrv', 'mkd', 'pol', 'rus', 'slk', 'slv', 'srp', 'ukr'],
    'Indo-European-Indo-Aryan': ['asm', 'ben', 'guj', 'hin', 'mar', 'npi', 'ory', 'pan', 'snd', 'urd'],
    'Indo-European-Other': ['ckb', 'cym', 'ell', 'fas', 'gle', 'hye', 'ita', 'lav', 'lit', 'pus', 'tgk'],
    'Austronesian': ['ceb', 'ind', 'jav', 'mri', 'msa', 'tgl'],
    'Atlantic-Congo': ['ibo', 'kam', 'kea', 'lin', 'lug', 'nso', 'nya', 'sna', 'swh', 'umb', 'wol', 'xho', 'yor', 'zul'],
    'Afro-Asiatic': ['amh', 'ara', 'ful', 'mlt', 'orm', 'som'],
    'Turkic': ['azj', 'kaz', 'kir', 'tur', 'uzb'],
    'Dravidian': ['kan', 'mal', 'tam', 'tel'],
    'Sino-Tibetan': ['mya', 'zho_simpl', 'zho_trad'],
    'Other': ['est', 'fin', 'hau', 'heb', 'hun', 'jpn', 'kat', 'khm', 'kor', 'lao', 'luo', 'mon', 'tha', 'vie'],
}

flores_summary_groups = []
for _family, _codes in _flores_lang_map.items():
    # X -> English direction for every language in this family.
    flores_summary_groups.append({
        'name': f'flores_100_{_family}_English',
        'subsets': [f'flores_100_{_code}-eng' for _code in _codes],
    })
    # English -> X direction.
    flores_summary_groups.append({
        'name': f'flores_100_English_{_family}',
        'subsets': [f'flores_100_eng-{_code}' for _code in _codes],
    })
# Meta-Prompt
\ No newline at end of file
{# 404 error page: extends the shared site layout and fills its "body"
   block with a short "page not found" message plus a link back to the
   documentation homepage (pathto(root_doc) resolves the relative URL). #}
{% extends "layout.html" %}
{% block body %}
<h1>Page Not Found</h1>
<p>
The page you are looking for cannot be found.
</p>
<p>
If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
the content table left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
</p>
{# The issue-reporting paragraph below is intentionally disabled: the
   href is empty, so it is kept as an HTML comment until a tracker URL
   exists. #}
<!-- <p>
If you cannot find documentation you want, please <a
href="">open an issue</a> to tell us!
</p> -->
{% endblock %}
# Prompt template
\ No newline at end of file
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class cmnliDataset_V2(BaseDataset):
    """CMNLI loader that maps NLI labels onto option letters (A/B/C).

    Input is a JSON-lines file; samples labeled '-' (no annotator
    consensus) are dropped.
    """

    @staticmethod
    def load(path):
        """Read the JSONL file at *path* and return a ``datasets.Dataset``.

        Raises:
            KeyError: if a line carries a label outside
                {'entailment', 'contradiction', 'neutral', '-'}.
        """
        data = []
        # Explicit UTF-8 so decoding does not depend on the platform's
        # default locale encoding (fix: original relied on the default).
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                # '-' marks unannotated examples; skip them.
                if line['label'] == '-':
                    continue
                line['label'] = {
                    'entailment': 'A',
                    'contradiction': 'B',
                    'neutral': 'C',
                }[line['label']]
                data.append(line)
        return Dataset.from_list(data)
from datasets import concatenate_datasets, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class XLSUMDataset(BaseDataset):
    """Loads the validation split of every XL-Sum language config and
    concatenates them into one dataset."""

    @staticmethod
    def load(**kwargs):
        # Dataset path/name for load_dataset; None if the caller omitted it.
        path = kwargs.get('path', None)
        # All language configurations fetched from the hub.
        lans = [
            'oromo', 'french', 'amharic', 'arabic', 'azerbaijani', 'bengali',
            'burmese', 'chinese_simplified', 'chinese_traditional', 'welsh',
            'english', 'kirundi', 'gujarati', 'hausa', 'hindi', 'igbo',
            'indonesian', 'japanese', 'korean', 'kyrgyz', 'marathi', 'spanish',
            'scottish_gaelic', 'nepali', 'pashto', 'persian', 'pidgin',
            'portuguese', 'punjabi', 'russian', 'serbian_cyrillic',
            'serbian_latin', 'sinhala', 'somali', 'swahili', 'tamil', 'telugu',
            'thai', 'tigrinya', 'turkish', 'ukrainian', 'urdu', 'uzbek',
            'vietnamese', 'yoruba'
        ]
        # One validation split per language, merged into a single dataset.
        per_language = [load_dataset(path, lan)['validation'] for lan in lans]
        return concatenate_datasets(per_language)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
@LOAD_DATASET.register_module()
class XsumDataset(BaseDataset):
    """JSON-lines summarization loader.

    NOTE(review): despite the Xsum name, each line is read for 'dialogue'
    and 'summary' fields (a SAMSum-like schema) -- confirm this matches
    the data files actually used.
    """

    @staticmethod
    def load(path: str, max_samples: int = 1000):
        """Load up to *max_samples* rows from the JSONL file at *path*.

        Args:
            path: Path to a JSON-lines file.
            max_samples: Cap on the number of lines read. The default of
                1000 preserves the previous hard-coded limit.

        Returns:
            A ``datasets.Dataset`` with 'dialogue' and 'summary' columns.
        """
        rows = []
        # Explicit UTF-8 (fix: original used the platform default);
        # errors='ignore' keeps the original best-effort tolerance for
        # malformed bytes.
        with open(path, 'r', encoding='utf-8', errors='ignore') as in_f:
            for i, line in enumerate(in_f):
                if i == max_samples:
                    break
                sample = json.loads(line.strip())
                dialogue = sample['dialogue']
                summary = sample['summary']
                # Non-string values (floats, presumably NaN placeholders
                # for missing fields) are skipped -- verify against the
                # data-preparation pipeline.
                if isinstance(dialogue, float) or isinstance(summary, float):
                    continue
                rows.append({'dialogue': dialogue, 'summary': summary})
        return Dataset.from_dict({
            'dialogue': [row['dialogue'] for row in rows],
            'summary': [row['summary'] for row in rows]
        })
@TEXT_POSTPROCESSORS.register_module('Xsum')
def Xsum_postprocess(text: str) -> str:
    """Keep only the first line of the model output, trimmed of
    surrounding whitespace on both the whole text and the line."""
    first_line, _, _ = text.strip().partition('\n')
    return first_line.strip()
"""Base Evaluator."""
from typing import List
class BaseEvaluator:
    """Abstract parent for evaluators; concrete subclasses implement
    :meth:`score`."""

    def __init__(self) -> None:
        # No shared state; the base class exists only to define the
        # evaluator interface.
        pass

    def score(self):
        """Compute the evaluation result. Must be overridden."""
        raise NotImplementedError("Method hasn't been implemented yet")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment