OpenDAS / opencompass · commit 32f40a8f

[Sync] Sync with internal codes 2023.01.08 (#777)

Unverified commit 32f40a8f, authored Jan 08, 2024 by Fengzhe Zhou, committed by GitHub on Jan 08, 2024. Parent: 8194199d
Showing 20 changed files with 783 additions and 12 deletions (+783, -12).
Changed files:
- configs/models/wizardlm/hf_wizardlm_70b_v1_0.py (+33, -0)
- configs/models/wizardlm/hf_wizardlm_7b_v1_0.py (+33, -0)
- configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py (+24, -0)
- configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py (+25, -0)
- configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py (+24, -0)
- configs/models/yi/hf_yi_34b_200k.py (+10, -10)
- configs/models/yi/hf_yi_34b_chat.py (+32, -0)
- configs/models/yi/hf_yi_6b_200k.py (+33, -0)
- configs/models/yi/hf_yi_6b_chat.py (+32, -0)
- configs/models/zephyr/hf_zephyr_7b_beta.py (+32, -0)
- configs/models/zephyr/vllm_zephyr_7b_beta.py (+23, -0)
- configs/summarizers/agent_bench.py (+61, -0)
- configs/summarizers/cibench.py (+33, -0)
- configs/summarizers/code_passk.py (+51, -0)
- configs/summarizers/compass_knowledge.py (+38, -0)
- configs/summarizers/compass_math.py (+42, -0)
- configs/summarizers/compassbench_v1_language.py (+72, -0)
- configs/summarizers/compassbench_v1_reason.py (+44, -0)
- configs/summarizers/groups/cibench.py (+107, -2)
- configs/summarizers/groups/plugineval.py (+34, -0)
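For orientation before the per-file diffs: model fragments like the ones added here are consumed by a top-level evaluation config plus the standard OpenCompass entry point. A minimal sketch, assuming the usual layout (the file name `eval_wizardlm.py` and the GSM8K dataset pick are illustrative, not part of this commit):

# configs/eval_wizardlm.py (hypothetical)
from mmengine.config import read_base

with read_base():
    from .models.wizardlm.vllm_wizardlm_7b_v1_0 import models as wizardlm_7b
    from .datasets.gsm8k.gsm8k_gen import gsm8k_datasets

models = [*wizardlm_7b]
datasets = [*gsm8k_datasets]

# then run: python run.py configs/eval_wizardlm.py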
configs/models/wizardlm/hf_wizardlm_70b_v1_0.py (new file, mode 100644)

from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='USER: ', end=' '),
        dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardlm-70b-v1.0-hf',
        path='WizardLM/WizardLM-70B-V1.0',
        tokenizer_path='WizardLM/WizardLM-70B-V1.0',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
        end_str='</s>',
    )
]
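The `_meta_template` above is what stitches a dialogue into the Vicuna-style prompt this model expects: each round becomes begin + content + end, and the BOT entry with generate=True marks where decoding starts, stopping at end_str. A stand-alone illustration of that wrapping (a sketch of the idea, not OpenCompass's actual renderer):

def render(meta_template, history, bot_role='BOT'):
    """Wrap each (role, content) turn as begin + content + end,
    then open the generating role's begin so the model continues."""
    specs = {r['role']: r for r in meta_template['round']}
    parts = []
    for role, content in history:
        spec = specs[role]
        parts.append(spec.get('begin', '') + content + spec.get('end', ''))
    return ''.join(parts) + specs[bot_role].get('begin', '')

render(_meta_template, [('HUMAN', 'What is 2 + 2?')])
# -> "USER: What is 2 + 2? ASSISTANT: "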
configs/models/wizardlm/hf_wizardlm_7b_v1_0.py (new file, mode 100644)

from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", end='\n\n'),
        dict(role="BOT", begin="### Response:", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardlm-7b-v1.0-hf',
        path='WizardLM/WizardLM-7B-V1.0',
        tokenizer_path='WizardLM/WizardLM-7B-V1.0',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    )
]
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py (new file, mode 100644)

from opencompass.models import VLLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='USER: ', end=' '),
        dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=VLLM,
        abbr='wizardlm-13b-v1.2-vllm',
        path='WizardLM/WizardLM-13B-V1.2',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py (new file, mode 100644)

from opencompass.models import VLLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='USER: ', end=' '),
        dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=VLLM,
        abbr='wizardlm-70b-v1.0-vllm',
        path='WizardLM/WizardLM-70B-V1.0',
        model_kwargs=dict(tensor_parallel_size=4),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]
configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py (new file, mode 100644)

from opencompass.models import VLLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", end='\n\n'),
        dict(role="BOT", begin="### Response:", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=VLLM,
        abbr='wizardlm-7b-v1.0-vllm',
        path='WizardLM/WizardLM-7B-V1.0',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
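All three VLLM configs decode greedily (temperature=0), emit at most 100 new tokens, and stop at '</s>'; the 70B variant additionally shards the model with tensor_parallel_size=4, matching its run_cfg of num_gpus=4. Roughly the same behavior against the public vLLM API directly, as a hedged sketch (prompt text is illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model='WizardLM/WizardLM-13B-V1.2')  # 70B would pass tensor_parallel_size=4
params = SamplingParams(temperature=0, max_tokens=100, stop=['</s>'])
outputs = llm.generate(['USER: Hello! ASSISTANT: '], params)
print(outputs[0].outputs[0].text)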
configs/models/wizardlm/hf_wizardlm_7b.py → configs/models/yi/hf_yi_34b_200k.py (renamed)

-from opencompass.models import HuggingFaceCausalLM
+from opencompass.models import HuggingFace

 models = [
     dict(
-        type=HuggingFaceCausalLM,
-        abbr='wizardlm-7b-hf',
-        path='TheBloke/wizardLM-7B-HF',
-        tokenizer_path='TheBloke/wizardLM-7B-HF',
+        type=HuggingFace,
+        abbr='yi-34b-200k-hf',
+        path='01-ai/Yi-34B-200K',
+        tokenizer_path='01-ai/Yi-34B-200K',
+        model_kwargs=dict(
+            trust_remote_code=True,
+            device_map='auto',
+        ),
         tokenizer_kwargs=dict(
             padding_side='left',
             truncation_side='left',
@@ -15,10 +19,6 @@ models = [
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
-        model_kwargs=dict(device_map='auto', trust_remote_code=True),
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=4, num_procs=1),
     )
 ]
configs/models/yi/hf_yi_34b_chat.py (new file, mode 100644)

from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='yi-34b-chat-hf',
        path='01-ai/Yi-34B-Chat',
        tokenizer_path='01-ai/Yi-34B-Chat',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
        end_str='<|im_end|>',
    )
]
configs/models/yi/hf_yi_6b_200k.py (new file, mode 100644)

from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", end='\n\n'),
        dict(role="BOT", begin="### Response:", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='yi-6b-200k-hf',
        path='01-ai/Yi-6B-200K',
        tokenizer_path='01-ai/Yi-6B-200K',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    )
]
configs/models/yi/hf_yi_6b_chat.py (new file, mode 100644)

from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='yi-6b-chat-hf',
        path='01-ai/Yi-6B-Chat',
        tokenizer_path='01-ai/Yi-6B-Chat',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
    )
]
configs/models/zephyr/hf_zephyr_7b_beta.py (new file, mode 100644)

from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<|user|>\n', end='</s>'),
        dict(role="BOT", begin="<|assistant|>\n", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='zephyr-7b-beta-hf',
        path='HuggingFaceH4/zephyr-7b-beta',
        tokenizer_path='HuggingFaceH4/zephyr-7b-beta',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    )
]
configs/models/zephyr/vllm_zephyr_7b_beta.py (new file, mode 100644)

from opencompass.models import VLLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<|user|>\n', end='</s>'),
        dict(role="BOT", begin="<|assistant|>\n", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=VLLM,
        abbr='zephyr-7b-beta-vllm',
        path='HuggingFaceH4/zephyr-7b-beta',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
configs/summarizers/agent_bench.py (new file, mode 100644)

from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups
    from .groups.plugineval import plugineval_summary_groups

agent_summary_groups = [
    dict(
        name='math_acc_1_and_fill_in_blank-native',
        subsets=[
            ['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'],
            ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'],
            ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'],
            ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'],
            ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'],
            ['compassbench_v1_math-primary-cloze_en-native', 'accuracy'],
        ]),
    dict(
        name='math_perf_4_and_fill_in_blank-native',
        subsets=[
            ['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'],
            ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'],
            ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'],
            ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'],
            ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'],
            ['compassbench_v1_math-primary-cloze_en-native', 'accuracy'],
        ]),
    dict(
        name='math_acc_1_and_fill_in_blank-agent',
        subsets=[
            ['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'],
            ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'],
            ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'],
            ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'],
            ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'],
            ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy'],
        ]),
    dict(
        name='math_perf_4_and_fill_in_blank-agent',
        subsets=[
            ['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'],
            ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'],
            ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'],
            ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'],
            ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'],
            ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy'],
        ]),
    dict(
        name='agent',
        subsets=[
            'math_perf_4_and_fill_in_blank-agent',
            'cibench_template_wo_nltk:executable',
            'cibench_template_wo_nltk:numeric_correct',
            'cibench_template_wo_nltk:vis_sim',
            'cibench_template_cn_wo_nltk:executable',
            'cibench_template_cn_wo_nltk:numeric_correct',
            'cibench_template_cn_wo_nltk:vis_sim',
            'plugin_eval-p10',
        ],
        weights={
            'math_perf_4_and_fill_in_blank-agent': 1,
            'cibench_template_wo_nltk:executable': 0.5,
            'cibench_template_wo_nltk:numeric_correct': 0.25,
            'cibench_template_wo_nltk:vis_sim': 0.25,
            'cibench_template_cn_wo_nltk:executable': 0.5,
            'cibench_template_cn_wo_nltk:numeric_correct': 0.25,
            'cibench_template_cn_wo_nltk:vis_sim': 0.25,
            'plugin_eval-p10': 1,
        }),
]

summarizer = dict(
    dataset_abbrs=[
        'agent',
        'math_acc_1_and_fill_in_blank-native',
        'math_perf_4_and_fill_in_blank-native',
        # '######## MathBench-Agent Accuracy ########', # category
        'math_acc_1_and_fill_in_blank-agent',
        'math_perf_4_and_fill_in_blank-agent',
        # '######## CIBench Template ########', # category
        'cibench_template:executable',
        'cibench_template:numeric_correct',
        'cibench_template:text_score',
        'cibench_template:vis_sim',
        # '######## CIBench Template Chinese ########', # category
        'cibench_template_cn:executable',
        'cibench_template_cn:numeric_correct',
        'cibench_template_cn:text_score',
        'cibench_template_cn:vis_sim',
        # '######## CIBench Template w/o NLTK ########', # category; no text_score because it is only for nltk
        'cibench_template_wo_nltk:executable',
        'cibench_template_wo_nltk:numeric_correct',
        'cibench_template_wo_nltk:vis_sim',
        # '######## CIBench Template Chinese w/o NLTK ########', # category
        'cibench_template_cn_wo_nltk:executable',
        'cibench_template_cn_wo_nltk:numeric_correct',
        'cibench_template_cn_wo_nltk:vis_sim',
        # '######## T-Eval ########', # category
        ['plugin_eval-p10', 'naive_average'],
        ['plugin_eval-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-p10-reason_str_v2', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
        ['plugin_eval-p10-retrieve_str_v2', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
        ['plugin_eval-p10-understand_str_v2', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
        ['plugin_eval-p10-review_str_v6', 'review_quality'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
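The closing `summary_groups=sum([...], [])` line is a recurring OpenCompass config idiom: it gathers every variable in the config's namespace whose name ends with `_summary_groups` (here `agent_summary_groups` plus the two lists imported under read_base) and concatenates them into one flat list. A minimal illustration of the idiom in isolation:

foo_summary_groups = [{'name': 'foo'}]
bar_summary_groups = [{'name': 'bar'}]
other_setting = 42  # ignored: name does not end with _summary_groups

merged = sum([v for k, v in locals().items() if k.endswith('_summary_groups')], [])
assert merged == [{'name': 'foo'}, {'name': 'bar'}]  # sum with [] start flattens the lists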
configs/summarizers/cibench.py (new file, mode 100644)

from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups

summarizer = dict(
    dataset_abbrs=[
        '######## CIBench Generation ########', # category
        ['cibench', 'executable'],
        ['cibench', 'general_correct'],
        ['cibench', 'vis_sim'],
        '######## CIBench Template ########', # category
        'cibench_template:executable',
        'cibench_template:numeric_correct',
        'cibench_template:text_score',
        'cibench_template:vis_sim',
        '######## CIBench Template Chinese ########', # category
        'cibench_template_cn:executable',
        'cibench_template_cn:numeric_correct',
        'cibench_template_cn:text_score',
        'cibench_template_cn:vis_sim',
        '######## CIBench Template w/o NLTK ########', # category; no text_score because it is only for nltk
        'cibench_template_wo_nltk:executable',
        'cibench_template_wo_nltk:numeric_correct',
        'cibench_template_wo_nltk:vis_sim',
        '######## CIBench Template Chinese w/o NLTK ########', # category
        'cibench_template_cn_wo_nltk:executable',
        'cibench_template_cn_wo_nltk:numeric_correct',
        'cibench_template_cn_wo_nltk:vis_sim',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
configs/summarizers/code_passk.py (new file, mode 100644)

code_passk_summary_groups = [
    # rename
    {'name': 'humaneval_pass@1(greedy)', 'subsets': [['openai_humaneval', 'humaneval_pass@1']]},
    {'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_passk', 'humaneval_pass@10']]},
    {'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_repeat10', 'humaneval_pass@10']]},
    {'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]},
    {'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_passk', 'humaneval_pass@10']]},
    {'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_repeat10', 'humaneval_pass@10']]},
    {'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]},
    {'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_passk', 'humaneval_plus_pass@10']]},
    {'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_repeat10', 'humaneval_plus_pass@10']]},
    {'name': 'mbpp_pass@1(greedy)', 'subsets': [['mbpp', 'score']]},
    {'name': 'mbpp_pass@10', 'subsets': [['mbpp_passk', 'pass@10']]},
    {'name': 'mbpp_pass@10', 'subsets': [['mbpp_repeat10', 'pass@10']]},
    {'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]},
    {'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_passk', 'pass@10']]},
    {'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_repeat10', 'pass@10']]},
    {'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]},
    {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_passk', 'pass@10']]},
    {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
    # real add
    {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
    {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']},
]

summarizer = dict(
    dataset_abbrs=[
        'code',
        'humaneval_pass@1(greedy)',
        'humaneval_pass@10',
        'humaneval_cn_pass@1(greedy)',
        'humaneval_cn_pass@10',
        'humaneval_plus_pass@1(greedy)',
        'humaneval_plus_pass@10',
        'mbpp_pass@1(greedy)',
        'mbpp_pass@10',
        'mbpp_cn_pass@1(greedy)',
        'mbpp_cn_pass@10',
        'sanitized_mbpp_pass@1(greedy)',
        'sanitized_mbpp_pass@10',
        'humanevalx',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-go',
        'humanevalx-java',
        'humanevalx-js',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
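Context for the paired entries: each `pass@1(greedy)` group reads a single greedy run, while the `_passk`/`_repeat10` variants presumably sample multiple completions per problem and report pass@10, the chance that at least one of 10 samples passes. The standard unbiased estimator from the HumanEval paper, as a reference sketch (not necessarily the exact code OpenCompass runs):

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k = 1 - C(n-c, k) / C(n, k) for n samples, c of them correct."""
    if n - c < k:
        return 1.0  # fewer than k failures: every size-k draw contains a pass
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

pass_at_k(10, 3, 1)   # -> 0.3 (reduces to c/n when k=1)
pass_at_k(10, 3, 10)  # -> 1.0 (drawing all samples always hits a correct one)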
configs/summarizers/compass_knowledge.py (new file, mode 100644)

# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
compassbench_v1_knowledge_names = [
    'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
    'compassbench_v1_knowledge-engineering-single_choice_cn_circular',
    'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
    'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
    'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]

compassbench_v1_knowledge_groups = [
    {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
    {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
    {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]

summarizer = dict(
    dataset_abbrs=[
        'knowledge_acc_1_and_cloze',
        ['knowledge_cn', 'acc_1'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        'knowledge_perf_4_and_cloze',
        ['knowledge_cn', 'perf_4'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
    ],
    summary_groups=compassbench_v1_knowledge_groups,
)
configs/summarizers/compass_math.py (new file, mode 100644)

# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen`
compassbench_v1_math_groups = [
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
]

summarizer = dict(
    dataset_abbrs=[
        'math_acc_1_and_fill_in_blank',
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
        'math_perf_4_and_fill_in_blank',
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ],
    summary_groups=compassbench_v1_math_groups,
)
configs/summarizers/compassbench_v1_language.py (new file, mode 100644)

compassbench_v1_language_names = [
    # ['information_retrieval_en', 'score'],
    # ['information_retrieval_zh', 'score'],
    ['intention_recognition_en_circular', 'acc_origin'],
    ['intention_recognition_en_circular', 'perf_circular'],
    ['intention_recognition_zh_circular', 'acc_origin'],
    ['intention_recognition_zh_circular', 'perf_circular'],
    ['sentiment_analysis_en_circular', 'acc_origin'],
    ['sentiment_analysis_en_circular', 'perf_circular'],
    ['sentiment_analysis_zh_circular', 'acc_origin'],
    ['sentiment_analysis_zh_circular', 'perf_circular'],
    ['translation', 'score'],
    ['content_critic_en_circular', 'acc_origin'],
    ['content_critic_en_circular', 'perf_circular'],
    ['content_critic_zh_circular', 'acc_origin'],
    ['content_critic_zh_circular', 'perf_circular'],
    ['content_summarization_en', 'rouge1'],
    ['content_summarization_zh', 'rouge1'],
    ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
    ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
    ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
    ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]

compassbench_v1_language_groups = [
    {'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']},
    {'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']},
    {'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
    {'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']},
    {'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']},
    {'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]

summarizer = dict(
    dataset_abbrs=[
        'language_acc_1_and_non_mcq',
        'language_en_acc_1_and_non_mcq',
        'language_zh_acc_1_and_non_mcq',
        ['information_retrieval_en', 'score'],
        ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'acc_origin'],
        ['intention_recognition_zh_circular', 'acc_origin'],
        ['sentiment_analysis_en_circular', 'acc_origin'],
        ['sentiment_analysis_zh_circular', 'acc_origin'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'acc_origin'],
        ['content_critic_zh_circular', 'acc_origin'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
        ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
        'language_perf_4_and_non_mcq',
        'language_en_perf_4_and_non_mcq',
        'language_zh_perf_4_and_non_mcq',
        ['information_retrieval_en', 'score'],
        ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'perf_circular'],
        ['intention_recognition_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['sentiment_analysis_zh_circular', 'perf_circular'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_critic_zh_circular', 'perf_circular'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
        ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
    ],
    summary_groups=compassbench_v1_language_groups,
)
configs/summarizers/compassbench_v1_reason.py (new file, mode 100644)

compassbench_v1_reason_groups = [
    {'name': 'reasonbench_cn_logic_circular', 'subsets': [
        'reasonbench_cn_abductive_alphanlg_translated_circular',
        'reasonbench_cn_deductive_bbh3obj_translated_circular',
        'reasonbench_cn_deductive_logiqa_zh_circular',
        'reasonbench_cn_inductive_deer_translated_circular',
        'reasonbench_cn_inductive_selfgenerated_circular',
    ]},
    {'name': 'reasonbench_en_logic_circular', 'subsets': [
        'reasonbench_en_abductive_alphanlg_circular',
        'reasonbench_en_deductive_bbh7obj_circular',
        'reasonbench_en_deductive_logiqa_zh_translated_circular',
        'reasonbench_en_deductive_ocnli_translated_circular',
        'reasonbench_en_inductive_deer_circular',
        'reasonbench_en_inductive_selfgenerated_circular',
    ]},
    {'name': 'reasonbench', 'subsets': [
        'reasonbench_cn_commonsense_circular',
        'reasonbench_cn_logic_circular',
        'reasonbench_en_commonsense_circular',
        'reasonbench_en_logic_circular',
    ]},
]

summarizer = dict(
    dataset_abbrs=[
        ['reasonbench', 'acc_origin'],
        ['reasonbench_cn_commonsense_circular', 'acc_origin'],
        ['reasonbench_en_commonsense_circular', 'acc_origin'],
        ['reasonbench_cn_logic_circular', 'acc_origin'],
        ['reasonbench_en_logic_circular', 'acc_origin'],
        ['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
        ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
        ['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
        ['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
        ['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
        ['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
        ['reasonbench_en_deductive_bbh7obj_circular', 'acc_origin'],
        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
        ['reasonbench_en_deductive_ocnli_translated_circular', 'acc_origin'],
        ['reasonbench_en_inductive_deer_circular', 'acc_origin'],
        ['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],
        ['reasonbench', 'perf_circular'],
        ['reasonbench_cn_commonsense_circular', 'perf_circular'],
        ['reasonbench_en_commonsense_circular', 'perf_circular'],
        ['reasonbench_cn_logic_circular', 'perf_circular'],
        ['reasonbench_en_logic_circular', 'perf_circular'],
        ['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
        ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
        ['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
        ['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
        ['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
        ['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
        ['reasonbench_en_deductive_bbh7obj_circular', 'perf_circular'],
        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
        ['reasonbench_en_deductive_ocnli_translated_circular', 'perf_circular'],
        ['reasonbench_en_inductive_deer_circular', 'perf_circular'],
        ['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
    ],
    summary_groups=compassbench_v1_reason_groups,
)
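A note on the two metrics that recur throughout these summarizers: `_circular` datasets re-ask each multiple-choice question with the answer options rotated (CircularEval). `acc_origin` scores only the original ordering, while `perf_circular` (and `perf_4` for four rotations) credits a question only if every rotation is answered correctly, which penalizes position-biased guessing. A rough sketch of the stricter metric, as my paraphrase of the idea rather than OpenCompass's implementation:

def perf_circular(per_question: dict) -> float:
    """per_question maps question id -> list of per-rotation correctness flags."""
    solid = [all(flags) for flags in per_question.values()]
    return sum(solid) / len(solid)

perf_circular({'q1': [True, True, True, True],
               'q2': [True, False, True, True]})  # -> 0.5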
configs/summarizers/groups/cibench.py

 _cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
-_cibench = ['cibench_generation_' + i for i in _cibench]
-cibench_summary_groups = [{'name': 'cibench_generation', 'subsets': _cibench}]
+_cibench = ['cibench_' + i for i in _cibench]
+cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]

The remainder of the file is newly added:

_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas',
                     'pytorch', 'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template = ['cibench_template/' + i for i in _cibench_template]
# number of total exec questions in this module
_cibench_template_weight = {
    'lightgbm': [30, 15, 0, 0],
    'matplotlib': [42, 0, 0, 36],
    'nltk': [70, 30, 20, 10],
    'opencv': [60, 10, 0, 40],
    'pandas': [60, 40, 0, 10],
    'pytorch': [28, 0, 0, 0],
    'scipy': [60, 40, 0, 0],
    'seaborn': [42, 0, 0, 35],
    'sklearn': [42, 6, 0, 18],
    'tensorflow': [36, 6, 0, 12],
}
cibench_summary_groups.extend([
    {
        'name': 'cibench_template:executable',
        'subsets': [[i, 'executable'] for i in _cibench_template],
        'weights': {'cibench_template/' + k: v[0] for k, v in _cibench_template_weight.items()},
    },
    {
        'name': 'cibench_template:numeric_correct',
        'subsets': [[i, 'numeric_correct'] for i in _cibench_template],
        'weights': {'cibench_template/' + k: v[1] for k, v in _cibench_template_weight.items()},
    },
    {
        'name': 'cibench_template:text_score',
        'subsets': [[i, 'text_score'] for i in _cibench_template],
        'weights': {'cibench_template/' + k: v[2] for k, v in _cibench_template_weight.items()},
    },
    {
        'name': 'cibench_template:vis_sim',
        'subsets': [[i, 'vis_sim'] for i in _cibench_template],
        'weights': {'cibench_template/' + k: v[3] for k, v in _cibench_template_weight.items()},
    },
])

## chinese
_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas',
                        'pytorch', 'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn]
cibench_summary_groups.extend([
    {
        'name': 'cibench_template_cn:executable',
        'subsets': [[i, 'executable'] for i in _cibench_template_cn],
        'weights': {'cibench_template_chinese/' + k: v[0] for k, v in _cibench_template_weight.items()},
    },
    {
        'name': 'cibench_template_cn:numeric_correct',
        'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn],
        'weights': {'cibench_template_chinese/' + k: v[1] for k, v in _cibench_template_weight.items()},
    },
    {
        'name': 'cibench_template_cn:text_score',
        'subsets': [[i, 'text_score'] for i in _cibench_template_cn],
        'weights': {'cibench_template_chinese/' + k: v[2] for k, v in _cibench_template_weight.items()},
    },
    {
        'name': 'cibench_template_cn:vis_sim',
        'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn],
        'weights': {'cibench_template_chinese/' + k: v[3] for k, v in _cibench_template_weight.items()},
    },
])

## add more without nltk
cibench_summary_groups.extend([
    {
        'name': 'cibench_template_wo_nltk:executable',
        'subsets': [[i, 'executable'] for i in _cibench_template if 'nltk' not in i],
        'weights': {'cibench_template/' + k: v[0] for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    },
    {
        'name': 'cibench_template_wo_nltk:numeric_correct',
        'subsets': [[i, 'numeric_correct'] for i in _cibench_template if 'nltk' not in i],
        'weights': {'cibench_template/' + k: v[1] for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    },
    {
        'name': 'cibench_template_wo_nltk:vis_sim',
        'subsets': [[i, 'vis_sim'] for i in _cibench_template if 'nltk' not in i],
        'weights': {'cibench_template/' + k: v[3] for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    },
])
cibench_summary_groups.extend([
    {
        'name': 'cibench_template_cn_wo_nltk:executable',
        'subsets': [[i, 'executable'] for i in _cibench_template_cn if 'nltk' not in i],
        'weights': {'cibench_template_chinese/' + k: v[0] for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    },
    {
        'name': 'cibench_template_cn_wo_nltk:numeric_correct',
        'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn if 'nltk' not in i],
        'weights': {'cibench_template_chinese/' + k: v[1] for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    },
    {
        'name': 'cibench_template_cn_wo_nltk:vis_sim',
        'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn if 'nltk' not in i],
        'weights': {'cibench_template_chinese/' + k: v[3] for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    },
])
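The `weights` dicts turn each group score into a weighted rather than plain average. Per the "# number of total exec questions in this module" comment, each library's weight vector appears to hold its question counts for [executable, numeric_correct, text_score, vis_sim], so subsets contribute in proportion to size, and a zero weight (e.g. text_score outside nltk) drops the subset entirely. A sketch of that aggregation under this reading (an assumption about the summarizer's behavior, not its actual code):

def weighted_group_score(scores: dict, weights: dict) -> float:
    """Weighted mean of subset scores; zero-weight subsets contribute nothing."""
    total = sum(weights.values())
    return sum(scores[k] * w for k, w in weights.items() if w) / total

weighted_group_score({'cibench_template/pandas': 80.0, 'cibench_template/scipy': 60.0},
                     {'cibench_template/pandas': 60, 'cibench_template/scipy': 60})  # -> 70.0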
configs/summarizers/groups/plugineval.py

@@ -31,4 +31,38 @@ plugineval_summary_groups = [
         ['plugin_eval-review_str_v6', 'review_quality'],
     ]},
+    # special treatment for first 10% data points
+    {
+        'name': 'plugin_eval-p10-instruct_v1',
+        'metric': 'format_metric',
+        'subsets': [
+            ['plugin_eval-p10-instruct_v1', 'string_format_metric'],
+            ['plugin_eval-p10-instruct_v1', 'json_format_metric'],
+        ]
+    },
+    {
+        'name': 'plugin_eval-p10-instruct_v1',
+        'metric': 'args_em_metric',
+        'subsets': [
+            ['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
+            ['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
+        ]
+    },
+    {
+        'name': 'plugin_eval-p10',
+        'subsets': [
+            ['plugin_eval-p10-instruct_v1', 'format_metric'],
+            ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
+            ['plugin_eval-p10-plan_str_v1', 'f1_score'],
+            ['plugin_eval-p10-plan_json_v1', 'f1_score'],
+            ['plugin_eval-p10-reason_str_v2', 'thought'],
+            ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
+            ['plugin_eval-p10-retrieve_str_v2', 'name'],
+            ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
+            ['plugin_eval-p10-understand_str_v2', 'args'],
+            ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
+            ['plugin_eval-p10-review_str_v6', 'review_quality'],
+        ]
+    },
 ]