Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
b4afe3e7
Unverified
Commit
b4afe3e7
authored
Jan 17, 2024
by
Fengzhe Zhou
Committed by
GitHub
Jan 17, 2024
Browse files
[Sync] Add InternLM2 Keyset Evaluation Demo (#807)
Co-authored-by:
zhangyifan1
<
zhangyifan1@pjlab.org.cn
>
parent
acae5609
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
444 additions
and
57 deletions
+444
-57
configs/models/others/hf_abel_7b_002.py
configs/models/others/hf_abel_7b_002.py
+31
-0
configs/models/others/hf_arithmo_mistral_7b.py
configs/models/others/hf_arithmo_mistral_7b.py
+33
-0
configs/models/others/hf_gsm8k_rft_llama7b2_u13b.py
configs/models/others/hf_gsm8k_rft_llama7b2_u13b.py
+33
-0
configs/models/others/hf_metamath_7b_v1_0.py
configs/models/others/hf_metamath_7b_v1_0.py
+33
-0
configs/models/others/hf_metamath_llemma_7b.py
configs/models/others/hf_metamath_llemma_7b.py
+33
-0
configs/models/others/hf_metamath_mistral_7b.py
configs/models/others/hf_metamath_mistral_7b.py
+33
-0
configs/models/others/hf_phi_2.py
configs/models/others/hf_phi_2.py
+24
-0
configs/models/others/hf_telechat_7b_chat.py
configs/models/others/hf_telechat_7b_chat.py
+34
-0
configs/models/others/hf_yayi2_30b_base.py
configs/models/others/hf_yayi2_30b_base.py
+25
-0
configs/models/wizardlm/hf_wizardmath_7b_v1_0.py
configs/models/wizardlm/hf_wizardmath_7b_v1_0.py
+33
-0
configs/models/wizardlm/hf_wizardmath_7b_v1_1.py
configs/models/wizardlm/hf_wizardmath_7b_v1_1.py
+33
-0
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
+1
-1
configs/summarizers/agent_bench.py
configs/summarizers/agent_bench.py
+22
-9
configs/summarizers/groups/leval.py
configs/summarizers/groups/leval.py
+3
-0
configs/summarizers/groups/longbench.py
configs/summarizers/groups/longbench.py
+10
-0
configs/summarizers/groups/mathbench.py
configs/summarizers/groups/mathbench.py
+1
-1
configs/summarizers/groups/mathbench_agent.py
configs/summarizers/groups/mathbench_agent.py
+1
-1
configs/summarizers/groups/plugineval.py
configs/summarizers/groups/plugineval.py
+38
-42
configs/summarizers/internlm2_keyset.py
configs/summarizers/internlm2_keyset.py
+20
-0
configs/summarizers/leval.py
configs/summarizers/leval.py
+3
-3
No files found.
configs/models/others/hf_abel_7b_002.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Plain "Question:" / "Answer:" dialogue wrapper for GAIR/Abel-7B-002.
# generate=True on the BOT entry makes decoding start right after "Answer:\n".
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'begin': 'Question:\n', 'end': '\n'},
        {'role': 'BOT', 'begin': 'Answer:\n', 'end': '\n', 'generate': True},
    ],
}

# Single-entry model list: Abel-7B-002 through the HF causal-LM wrapper.
models = [
    {
        'abbr': 'abel-7b-002',
        'type': HuggingFaceCausalLM,
        'path': 'GAIR/Abel-7B-002',
        'tokenizer_path': 'GAIR/Abel-7B-002',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',       # left-pad so generation continues the prompt
            'truncation_side': 'left',    # keep the most recent context when truncating
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_arithmo_mistral_7b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Arithmo prompt format: single-line "Question: " / "Answer: " turns,
# each turn closed by a blank line. The empty 'begin' keeps no preamble.
_meta_template = {
    'begin': '',
    'round': [
        {'role': 'HUMAN', 'begin': 'Question: ', 'end': '\n\n'},
        {'role': 'BOT', 'begin': 'Answer: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'arithmo-mistral-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'akjindal53244/Arithmo-Mistral-7B',
        'tokenizer_path': 'akjindal53244/Arithmo-Mistral-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_gsm8k_rft_llama7b2_u13b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style instruction template used by the GSM8K-RFT checkpoint:
# fixed preamble, then "### Instruction:" / "### Response:" turns.
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response:', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'gsm8k-rft-llama7b2-u13b',
        'type': HuggingFaceCausalLM,
        'path': 'OFA-Sys/gsm8k-rft-llama7b2-u13b',
        'tokenizer_path': 'OFA-Sys/gsm8k-rft-llama7b2-u13b',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_metamath_7b_v1_0.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style instruction template for MetaMath; note the trailing space
# after "### Response:" which this checkpoint's prompt format uses.
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'metamath-7b-v1.0-hf',
        'type': HuggingFaceCausalLM,
        'path': 'meta-math/MetaMath-7B-V1.0',
        'tokenizer_path': 'meta-math/MetaMath-7B-V1.0',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_metamath_llemma_7b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Same Alpaca-style MetaMath template as the other MetaMath configs
# (fixed preamble + "### Instruction:" / "### Response: " turns).
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'metamath-llemma-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'meta-math/MetaMath-Llemma-7B',
        'tokenizer_path': 'meta-math/MetaMath-Llemma-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_metamath_mistral_7b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style MetaMath template (identical to the other MetaMath configs).
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'metamath-mistral-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'meta-math/MetaMath-Mistral-7B',
        'tokenizer_path': 'meta-math/MetaMath-Mistral-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_phi_2.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# microsoft/phi-2 evaluated as a plain base model: no meta_template,
# but min_out_len=3 forces at least a few generated tokens.
models = [
    {
        'type': HuggingFaceCausalLM,
        'abbr': 'phi-2-hf',
        'path': 'microsoft/phi-2',
        'tokenizer_path': 'microsoft/phi-2',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'max_out_len': 100,
        'min_out_len': 3,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_telechat_7b_chat.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# TeleChat chat markup: user turns open with <_user>, bot turns with <_bot>
# and close with <_end>. eos_token_id=160133 stops generation at that token
# (presumably the id of <_end> in the telechat tokenizer — TODO confirm).
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'begin': '<_user>'},
        {'role': 'BOT', 'begin': '<_bot>', 'end': '<_end>', 'generate': True},
    ],
    'eos_token_id': 160133,
}

models = [
    {
        'abbr': 'telechat-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'Tele-AI/telechat-7B',
        'tokenizer_path': 'Tele-AI/telechat-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
        # Strip everything after the closing tag from the decoded output.
        'end_str': '<_end>',
    }
]
configs/models/others/hf_yayi2_30b_base.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# yayi2-30b base model: no chat template; 30B weights are sharded across
# 4 GPUs via device_map='auto' (hence num_gpus=4 below).
models = [
    {
        'abbr': 'yayi2-30b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'wenge-research/yayi2-30b',
        'tokenizer_path': 'wenge-research/yayi2-30b',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'max_out_len': 100,
        'min_out_len': 3,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 4, 'num_procs': 1},
    }
]
configs/models/wizardlm/hf_wizardmath_7b_v1_0.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# WizardMath prompt: the user turn has no prefix (only a blank-line
# separator); the model turn starts at "### Response:" and ends at </s>.
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response:', 'end': '</s>', 'generate': True},
    ],
}

models = [
    {
        'type': HuggingFaceCausalLM,
        'abbr': 'wizardmath-7b-v1.0-hf',
        'path': 'WizardLM/WizardMath-7B-V1.0',
        'tokenizer_path': 'WizardLM/WizardMath-7B-V1.0',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
        # Truncate decoded output at the EOS marker.
        'end_str': '</s>',
    }
]
configs/models/wizardlm/hf_wizardmath_7b_v1_1.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Same WizardMath prompt format as v1.0: bare user turn, "### Response:"
# bot prefix, </s> terminator.
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response:', 'end': '</s>', 'generate': True},
    ],
}

models = [
    {
        'type': HuggingFaceCausalLM,
        'abbr': 'wizardmath-7b-v1.1-hf',
        'path': 'WizardLM/WizardMath-7B-V1.1',
        'tokenizer_path': 'WizardLM/WizardMath-7B-V1.1',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
        # Truncate decoded output at the EOS marker.
        'end_str': '</s>',
    }
]
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
View file @
b4afe3e7
...
@@ -16,7 +16,7 @@ models = [
...
@@ -16,7 +16,7 @@ models = [
meta_template
=
_meta_template
,
meta_template
=
_meta_template
,
max_out_len
=
100
,
max_out_len
=
100
,
max_seq_len
=
2048
,
max_seq_len
=
2048
,
batch_size
=
32
,
batch_size
=
1
,
generation_kwargs
=
dict
(
temperature
=
0
),
generation_kwargs
=
dict
(
temperature
=
0
),
end_str
=
'</s>'
,
end_str
=
'</s>'
,
run_cfg
=
dict
(
num_gpus
=
1
,
num_procs
=
1
),
run_cfg
=
dict
(
num_gpus
=
1
,
num_procs
=
1
),
...
...
configs/summarizers/agent_bench.py
View file @
b4afe3e7
...
@@ -11,8 +11,8 @@ agent_summary_groups = [
...
@@ -11,8 +11,8 @@ agent_summary_groups = [
dict
(
name
=
'math_perf_4_and_fill_in_blank-agent'
,
subsets
=
[[
'compassbench_v1_math-high-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-high-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-primary-cloze_cn-agent'
,
'accuracy'
],
[
'compassbench_v1_math-primary-cloze_en-agent'
,
'accuracy'
]]),
dict
(
name
=
'math_perf_4_and_fill_in_blank-agent'
,
subsets
=
[[
'compassbench_v1_math-high-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-high-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-primary-cloze_cn-agent'
,
'accuracy'
],
[
'compassbench_v1_math-primary-cloze_en-agent'
,
'accuracy'
]]),
dict
(
dict
(
name
=
'agent'
,
name
=
'agent'
,
subsets
=
[
'math_perf_4_and_fill_in_blank-agent'
,
'cibench_template_wo_nltk:executable'
,
'cibench_template_wo_nltk:numeric_correct'
,
'cibench_template_wo_nltk:vis_sim'
,
'cibench_template_cn_wo_nltk:executable'
,
'cibench_template_cn_wo_nltk:numeric_correct'
,
'cibench_template_cn_wo_nltk:vis_sim'
,
'plugin_eval-p10'
],
subsets
=
[
'math_perf_4_and_fill_in_blank-agent'
,
'cibench_template_wo_nltk:executable'
,
'cibench_template_wo_nltk:numeric_correct'
,
'cibench_template_wo_nltk:vis_sim'
,
'cibench_template_cn_wo_nltk:executable'
,
'cibench_template_cn_wo_nltk:numeric_correct'
,
'cibench_template_cn_wo_nltk:vis_sim'
,
'plugin_eval-p10'
,
'plugin_eval-p10_zh'
],
weights
=
{
'math_perf_4_and_fill_in_blank-agent'
:
1
,
'cibench_template_wo_nltk:executable'
:
0.5
,
'cibench_template_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_wo_nltk:vis_sim'
:
0.25
,
'cibench_template_cn_wo_nltk:executable'
:
0.5
,
'cibench_template_cn_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_cn_wo_nltk:vis_sim'
:
0.25
,
'plugin_eval-p10'
:
1
}
weights
=
{
'math_perf_4_and_fill_in_blank-agent'
:
1
,
'cibench_template_wo_nltk:executable'
:
0.5
,
'cibench_template_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_wo_nltk:vis_sim'
:
0.25
,
'cibench_template_cn_wo_nltk:executable'
:
0.5
,
'cibench_template_cn_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_cn_wo_nltk:vis_sim'
:
0.25
,
'plugin_eval-p10'
:
1
,
'plugin_eval-p10_zh'
:
1
}
)
)
]
]
...
@@ -48,13 +48,26 @@ summarizer = dict(
...
@@ -48,13 +48,26 @@ summarizer = dict(
[
'plugin_eval-p10-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-p10-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-p10-reason_str_v2'
,
'thought'
],
[
'plugin_eval-p10-reason_str_v1'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1'
,
'thought'
],
[
'plugin_eval-p10-retrieve_str_v2'
,
'name'
],
[
'plugin_eval-p10-retrieve_str_v1'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1'
,
'name'
],
[
'plugin_eval-p10-understand_str_v2'
,
'args'
],
[
'plugin_eval-p10-understand_str_v1'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1'
,
'args'
],
[
'plugin_eval-p10-review_str_v6'
,
'review_quality'
],
[
'plugin_eval-p10-review_str_v1'
,
'review_quality'
],
[
'plugin_eval-p10_zh'
,
'naive_average'
],
[
'plugin_eval-p10-instruct_v1_zh'
,
'format_metric'
],
[
'plugin_eval-p10-instruct_v1_zh'
,
'args_em_metric'
],
[
'plugin_eval-p10-plan_str_v1_zh'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1_zh'
,
'f1_score'
],
[
'plugin_eval-p10-reason_str_v1_zh'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1_zh'
,
'thought'
],
[
'plugin_eval-p10-retrieve_str_v1_zh'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1_zh'
,
'name'
],
[
'plugin_eval-p10-understand_str_v1_zh'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1_zh'
,
'args'
],
[
'plugin_eval-p10-review_str_v1_zh'
,
'review_quality'
],
],
],
summary_groups
=
sum
(
summary_groups
=
sum
(
[
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
"_summary_groups"
)],
[])
[
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
"_summary_groups"
)],
[])
...
...
configs/summarizers/groups/leval.py
0 → 100644
View file @
b4afe3e7
# One aggregate group averaging all 18 LEval subsets. The first five are the
# exact-match (accuracy) tasks, the remaining thirteen the generation/ROUGE
# tasks — the split mirrors the section headers in the LEval summarizer.
_LEVAL_EXACT_MATCH = [
    'LEval_coursera',
    'LEval_gsm100',
    'LEval_quality',
    'LEval_tpo',
    'LEval_topic_retrieval',
]

_LEVAL_GEN = [
    'LEval_financialqa',
    'LEval_gov_report_summ',
    'LEval_legal_contract_qa',
    'LEval_meeting_summ',
    'LEval_multidocqa',
    'LEval_narrativeqa',
    'LEval_nq',
    'LEval_news_summ',
    'LEval_paper_assistant',
    'LEval_patent_summ',
    'LEval_review_summ',
    'LEval_scientificqa',
    'LEval_tvshow_summ',
]

leval_summary_groups = [
    {'name': 'leval', 'subsets': _LEVAL_EXACT_MATCH + _LEVAL_GEN},
]
configs/summarizers/groups/longbench.py
0 → 100644
View file @
b4afe3e7
# Summary groups for LongBench: six task-family averages plus one overall
# 'longbench' average over the family scores.
#
# NOTE(review): the original file declared 'longbench_code-completion' twice
# (two identical entries) and also listed it twice in the overall 'longbench'
# subsets, which double-weights code completion in the naive average. Both
# duplicates are removed here.
longbench_summary_groups = [
    {'name': 'longbench_single-document-qa',
     'subsets': ['LongBench_narrativeqa', 'LongBench_qasper',
                 'LongBench_multifieldqa_en', 'LongBench_multifieldqa_zh']},
    {'name': 'longbench_multi-document-qa',
     'subsets': ['LongBench_hotpotqa', 'LongBench_2wikimqa',
                 'LongBench_musique', 'LongBench_dureader']},
    {'name': 'longbench_summarization',
     'subsets': ['LongBench_gov_report', 'LongBench_qmsum',
                 'LongBench_multi_news', 'LongBench_vcsum']},
    {'name': 'longbench_few-shot-learning',
     'subsets': ['LongBench_trec', 'LongBench_triviaqa',
                 'LongBench_samsum', 'LongBench_lsht']},
    {'name': 'longbench_synthetic-tasks',
     'subsets': ['LongBench_passage_count', 'LongBench_passage_retrieval_en',
                 'LongBench_passage_retrieval_zh']},
    {'name': 'longbench_code-completion',
     'subsets': ['LongBench_lcc', 'LongBench_repobench-p']},
    # Overall score: average of the six family averages above.
    {'name': 'longbench',
     'subsets': ['longbench_single-document-qa', 'longbench_multi-document-qa',
                 'longbench_summarization', 'longbench_few-shot-learning',
                 'longbench_synthetic-tasks', 'longbench_code-completion']},
]
configs/summarizers/groups/mathbench.py
View file @
b4afe3e7
...
@@ -66,9 +66,9 @@ naive_mathbench_summary_groups = [
...
@@ -66,9 +66,9 @@ naive_mathbench_summary_groups = [
{
{
'name'
:
'mathbench-circular-and-cloze'
,
'name'
:
'mathbench-circular-and-cloze'
,
'subsets'
:
[
'subsets'
:
[
'mathbench-college-circular'
,
'mathbench-high-circular'
,
'mathbench-high-circular'
,
'mathbench-middle-circular'
,
'mathbench-middle-circular'
,
'mathbench-circular'
,
'mathbench-college-cloze_en'
,
'mathbench-college-cloze_en'
,
'mathbench-primary-cloze_cn'
,
'mathbench-primary-cloze_cn'
,
],
],
...
...
configs/summarizers/groups/mathbench_agent.py
View file @
b4afe3e7
...
@@ -65,9 +65,9 @@ mathbench_agent_summary_groups = [
...
@@ -65,9 +65,9 @@ mathbench_agent_summary_groups = [
{
{
'name'
:
'mathbench-circular-and-cloze-agent'
,
'name'
:
'mathbench-circular-and-cloze-agent'
,
'subsets'
:
[
'subsets'
:
[
'mathbench-college-circular-agent'
,
'mathbench-high-circular-agent'
,
'mathbench-high-circular-agent'
,
'mathbench-middle-circular-agent'
,
'mathbench-middle-circular-agent'
,
'mathbench-circular-agent'
,
'mathbench-college-cloze_en-agent'
,
'mathbench-college-cloze_en-agent'
,
'mathbench-primary-cloze_cn-agent'
,
'mathbench-primary-cloze_cn-agent'
,
],
],
...
...
configs/summarizers/groups/plugineval.py
View file @
b4afe3e7
plugineval_summary_groups
=
[
from
copy
import
deepcopy
_base_summary_groups
=
[
{
{
'name'
:
'plugin_eval-instruct_v1'
,
'name'
:
'plugin_eval-instruct_v1'
,
'metric'
:
'format_metric'
,
'metric'
:
'format_metric'
,
...
@@ -22,47 +24,41 @@ plugineval_summary_groups = [
...
@@ -22,47 +24,41 @@ plugineval_summary_groups = [
[
'plugin_eval-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-reason_str_v2'
,
'thought'
],
[
'plugin_eval-reason_str_v1'
,
'thought'
],
[
'plugin_eval-reason_retrieve_understand_json_v2'
,
'thought'
],
[
'plugin_eval-reason_retrieve_understand_json_v1'
,
'thought'
],
[
'plugin_eval-retrieve_str_v2'
,
'name'
],
[
'plugin_eval-retrieve_str_v1'
,
'name'
],
[
'plugin_eval-reason_retrieve_understand_json_v2'
,
'name'
],
[
'plugin_eval-reason_retrieve_understand_json_v1'
,
'name'
],
[
'plugin_eval-understand_str_v2'
,
'args'
],
[
'plugin_eval-understand_str_v1'
,
'args'
],
[
'plugin_eval-reason_retrieve_understand_json_v2'
,
'args'
],
[
'plugin_eval-reason_retrieve_understand_json_v1'
,
'args'
],
[
'plugin_eval-review_str_v6'
,
'review_quality'
],
[
'plugin_eval-review_str_v1'
,
'review_quality'
],
]
},
# special treatment for first 10% data points
{
'name'
:
'plugin_eval-p10-instruct_v1'
,
'metric'
:
'format_metric'
,
'subsets'
:
[
[
'plugin_eval-p10-instruct_v1'
,
'string_format_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'json_format_metric'
],
]
},
{
'name'
:
'plugin_eval-p10-instruct_v1'
,
'metric'
:
'args_em_metric'
,
'subsets'
:
[
[
'plugin_eval-p10-instruct_v1'
,
'string_args_em_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'json_args_em_metric'
],
]
},
{
'name'
:
'plugin_eval-p10'
,
'subsets'
:
[
[
'plugin_eval-p10-instruct_v1'
,
'format_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-p10-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-p10-reason_str_v2'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'thought'
],
[
'plugin_eval-p10-retrieve_str_v2'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'name'
],
[
'plugin_eval-p10-understand_str_v2'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'args'
],
[
'plugin_eval-p10-review_str_v6'
,
'review_quality'
],
]
]
},
},
]
]
plugineval_summary_groups
=
[]
# base
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
plugineval_summary_groups
.
append
(
group
)
# base _zh
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
group
[
'name'
]
=
group
[
'name'
]
+
'_zh'
group
[
'subsets'
]
=
[[
subset
[
0
]
+
'_zh'
,
subset
[
1
]]
for
subset
in
group
[
'subsets'
]]
plugineval_summary_groups
.
append
(
group
)
# base -p10-
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
group
[
'name'
]
=
group
[
'name'
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
)
group
[
'subsets'
]
=
[[
subset
[
0
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
),
subset
[
1
]]
for
subset
in
group
[
'subsets'
]]
plugineval_summary_groups
.
append
(
group
)
# base -p10- _zh
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
group
[
'name'
]
=
group
[
'name'
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
)
+
'_zh'
group
[
'subsets'
]
=
[[
subset
[
0
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
)
+
'_zh'
,
subset
[
1
]]
for
subset
in
group
[
'subsets'
]]
plugineval_summary_groups
.
append
(
group
)
configs/summarizers/internlm2_keyset.py
0 → 100644
View file @
b4afe3e7
from mmengine.config import read_base

# Pull in the per-benchmark summary groups so they appear in locals()
# and can be swept up into summary_groups below. Import order is kept
# as-is so the concatenation order of the groups is unchanged.
with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.bbh import bbh_summary_groups

# [dataset abbreviation, metric] pairs shown in the keyset summary table.
_keyset_dataset_abbrs = [
    ['mmlu', 'naive_average'],
    ['agieval', 'naive_average'],
    ['bbh', 'naive_average'],
    ['gsm8k', 'accuracy'],
    ['math', 'accuracy'],
    ['openai_humaneval', 'humaneval_pass@1'],
    ['sanitized_mbpp', 'score'],
]

summarizer = dict(
    dataset_abbrs=_keyset_dataset_abbrs,
    # Concatenate every *_summary_groups list brought in via read_base().
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')],
        [],
    ),
)
configs/summarizers/leval.py
View file @
b4afe3e7
summarizer
=
dict
(
summarizer
=
dict
(
dataset_abbrs
=
[
dataset_abbrs
=
[
'--------- LEval Exact Match (Acc) ---------'
,
# category
'--------- LEval Exact Match (Acc) ---------'
,
# category
"
LEval_coursera
"
,
'
LEval_coursera
'
,
'LEval_gsm100'
,
'LEval_gsm100'
,
'LEval_quality'
,
'LEval_quality'
,
"
LEval_tpo
"
,
'
LEval_tpo
'
,
'LEval_topic_retrieval'
,
'LEval_topic_retrieval'
,
'--------- LEval Gen (ROUGE) ---------'
,
# category
'--------- LEval Gen (ROUGE) ---------'
,
# category
'LEval_financialqa'
,
'LEval_financialqa'
,
...
@@ -21,5 +21,5 @@ summarizer = dict(
...
@@ -21,5 +21,5 @@ summarizer = dict(
'LEval_scientificqa'
,
'LEval_scientificqa'
,
'LEval_tvshow_summ'
'LEval_tvshow_summ'
],
],
summary_groups
=
sum
([
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
"
_summary_groups
"
)],
[]),
summary_groups
=
sum
([
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
'
_summary_groups
'
)],
[]),
)
)
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment