[Fix] Quick fix (#995)

0665bb91 · bittersweet1999 · GitHub · 1d319855 · 0665bb91 · 0665bb91
Unverified Commit 0665bb91 authored Mar 22, 2024 by bittersweet1999 Committed by GitHub Mar 22, 2024
6 changed files
--- a/configs/eval_subjective_alignbench.py
+++ b/configs/eval_subjective_alignbench.py
@@ -3,7 +3,7 @@ from mmengine.config import read_base
 with read_base():
    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets

-from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.models.openai_api import OpenAIAllesAPIN
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -51,26 +51,14 @@ models = [

 datasets = [*subjective_datasets]

-infer = dict(
-    partitioner=dict(type=NaivePartitioner),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=OpenICLInferTask),
-    ),
-)
-
 # -------------Evaluation Stage ----------------------------------------

 ## ------------- JudgeLLM Configuration
 judge_model = dict(
    abbr='GPT4-Turbo',
-    type=OpenAIAllesAPIN,
+    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
-    url='xxxx',
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,

--- a/configs/eval_subjective_alpacaeval.py
+++ b/configs/eval_subjective_alpacaeval.py
@@ -68,16 +68,7 @@ gpt4 = dict(
    temperature=1,
 )  # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions

-infer = dict(
-    partitioner=dict(type=NaivePartitioner),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=OpenICLInferTask),
-    ),
-)
+

 # -------------Evaluation Stage ----------------------------------------


--- a/configs/eval_subjective_compassarena.py
+++ b/configs/eval_subjective_compassarena.py
@@ -69,17 +69,6 @@ gpt4 = dict(
    temperature=1,
 )  # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions

-infer = dict(
-    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llm_dev2',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=OpenICLInferTask),
-    ),
-)
-
 # -------------Evaluation Stage ----------------------------------------

 ## ------------- JudgeLLM Configuration

--- a/configs/eval_subjective_corev2.py
+++ b/configs/eval_subjective_corev2.py
-from mmengine.config import read_base
-
-with read_base():
-    from .datasets.subjective.subjective_cmp.subjective_corev2 import subjective_datasets
-
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
-from opencompass.partitioners import NaivePartitioner, SizePartitioner
-from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
-from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
-from opencompass.runners import LocalRunner
-from opencompass.runners import SlurmSequentialRunner
-from opencompass.tasks import OpenICLInferTask
-from opencompass.tasks.subjective_eval import SubjectiveEvalTask
-from opencompass.summarizers import Corev2Summarizer
-
-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True),
-    ],
-    reserved_roles=[
-        dict(role='SYSTEM', api_role='SYSTEM'),
-    ],
-)
-
-# -------------Inference Stage ----------------------------------------
-
-# For subjective evaluation, we often set do sample for models
-models = [
-    dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=1,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
-
-datasets = [*subjective_datasets]
-
-gpt4 = dict(
-    abbr='gpt4-turbo',
-    type=OpenAI,
-    path='gpt-4-1106-preview',
-    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
-    meta_template=api_meta_template,
-    query_per_second=1,
-    max_out_len=2048,
-    max_seq_len=4096,
-    batch_size=4,
-    retry=20,
-    temperature=1,
-)  # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions
-
-infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=500),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llm_dev2',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=OpenICLInferTask),
-    ),
-)
-
-# -------------Evalation Stage ----------------------------------------
-
-## ------------- JudgeLLM Configuration
-judge_model = dict(
-    abbr='GPT4-Turbo',
-    type=OpenAI,
-    path='gpt-4-1106-preview',
-    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
-    meta_template=api_meta_template,
-    query_per_second=1,
-    max_out_len=1024,
-    max_seq_len=4096,
-    batch_size=2,
-    retry=20,
-    temperature=0,
-)
-
-## ------------- Evaluation Configuration
-eval = dict(
-    partitioner=dict(
-        type=SubjectiveSizePartitioner, mode='m2n', max_task_size=500, base_models=[gpt4], compare_models=models
-    ),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llm_dev2',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
-    ),
-)
-
-summarizer = dict(type=Corev2Summarizer, match_method='smart')
-
-work_dir = 'outputs/corev2/'
--- a/configs/eval_subjective_creationbench.py
+++ b/configs/eval_subjective_creationbench.py
@@ -3,7 +3,7 @@ from mmengine.config import read_base
 with read_base():
    from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets

-from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.models.openai_api import OpenAIAllesAPIN
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -51,26 +51,14 @@ models = [

 datasets = [*subjective_datasets]

-infer = dict(
-    partitioner=dict(type=NaivePartitioner),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=OpenICLInferTask),
-    ),
-)
-
 # -------------Evaluation Stage ----------------------------------------

 ## ------------- JudgeLLM Configuration
 judge_model = dict(
    abbr='GPT4-Turbo',
-    type=OpenAIAllesAPIN,
+    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
-    url='xxxx',
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,

--- a/configs/eval_subjective_mtbench.py
+++ b/configs/eval_subjective_mtbench.py
@@ -4,7 +4,7 @@ with read_base():
    from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
    # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets

-from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.models.openai_api import OpenAIAllesAPIN
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -59,26 +59,14 @@ models = [

 datasets = [*subjective_datasets]

-infer = dict(
-    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
-    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llm_dev2',
-        quotatype='auto',
-        max_num_workers=256,
-        task=dict(type=OpenICLInferTask),
-    ),
-)
-
 # -------------Evaluation Stage ----------------------------------------

 ## ------------- JudgeLLM Configuration
 judge_model = dict(
    abbr='GPT4-Turbo',
-    type=OpenAIAllesAPIN,
+    type=OpenAI,
    path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
-    url='xxxx',
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,