Fix bugs in subjective evaluation (#589)

* rename
* fix sub bugs and update docs
* update
* update

Fix bugs in subjective evaluation (#589)
* rename
* fix sub bugs and update docs
* update
* update
14e6fe6f · Wei Jueqi · GitHub · c8cb38e8 · 14e6fe6f · 14e6fe6f
Unverified commit 14e6fe6f, authored Nov 14, 2023 by Wei Jueqi; committed by GitHub on Nov 14, 2023.
4 changed files
--- a/configs/subjective.py
+++ b/configs/subjective.py
 from mmengine.config import read_base
 with read_base():
+    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
+    from .models.chatglm.hf_chatglm2_6b import models as hf_chatglm2_6b
+    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b
    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
    from .summarizers.subjective import summarizer

@@ -10,79 +13,7 @@ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask

-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
-        dict(
-            role="BOT",
-            begin="\n<|im_start|>assistant\n",
-            end='<|im_end|>',
-            generate=True),
-    ], )
-
-_meta_template2 = dict(
-    round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
-        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
-    ], )
-
-models = [
-    dict(
-        type=HuggingFace,
-        abbr='chatglm2-6b-hf',
-        path='THUDM/chatglm2-6b',
-        tokenizer_path='THUDM/chatglm2-6b',
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True),
-        max_out_len=100,
-        max_seq_len=2048,
-        batch_size=8,
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto'),
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),
-    dict(
-        type=HuggingFaceCausalLM,
-        abbr='qwen-7b-chat-hf',
-        path="Qwen/Qwen-7B-Chat",
-        tokenizer_path='Qwen/Qwen-7B-Chat',
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-            use_fast=False,
-        ),
-        pad_token_id=151643,
-        max_out_len=100,
-        max_seq_len=2048,
-        batch_size=8,
-        meta_template=_meta_template,
-        model_kwargs=dict(device_map='auto', trust_remote_code=True),
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),
-    dict(
-        type=HuggingFaceCausalLM,
-        abbr='internlm-chat-7b-hf',
-        path="internlm/internlm-chat-7b",
-        tokenizer_path='internlm/internlm-chat-7b',
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            use_fast=False,
-            trust_remote_code=True),
-        max_out_len=100,
-        max_seq_len=2048,
-        batch_size=8,
-        meta_template=_meta_template2,
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto'),
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+models = [*hf_qwen_7b_chat, *hf_chatglm2_6b, *hf_internlm_chat_7b]

 api_meta_template = dict(
    round=[

--- a/docs/zh_cn/advanced_guides/subjective_evaluation.md
+++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md
@@ -93,7 +93,7 @@ eval = dict(
 ## 启动评测

 ```shell
-python run.py config/subjective.py -r
+python run.py configs/subjective.py -r
 ```

 `-r` 参数支持复用模型推理和 GPT4 评估结果。

--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -68,6 +68,7 @@ from .siqa import *  # noqa: F401, F403
 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator  # noqa: F401, F403
 from .storycloze import *  # noqa: F401, F403
 from .strategyqa import *  # noqa: F401, F403
+from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403

--- a/opencompass/datasets/subjective_cmp.py
+++ b/opencompass/datasets/subjective_cmp.py
@@ -195,7 +195,7 @@ subjective_reader_cfg = dict(input_columns=[
                             train_split='test')

 subjective_all_sets = [
-    'sub_test',
+    'subjective_demo',
 ]