gaoqiong / lm-evaluation-harness · Commits

Commit 2106fbeb, authored Jan 15, 2025 by Baber
Parents: 4354fe46, 703fbffd

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
Changes: 574 files in the commit; this page of the paginated diff shows 20 changed files with 257 additions and 0 deletions (+257 / -0).
lm_eval/tasks/metabench/metabench.yaml                               +14  -0
lm_eval/tasks/metabench/metabench_arc.yaml                           +23  -0
lm_eval/tasks/metabench/metabench_arc_permute.yaml                    +5  -0
lm_eval/tasks/metabench/metabench_arc_secondary.yaml                  +5  -0
lm_eval/tasks/metabench/metabench_arc_secondary_permute.yaml          +5  -0
lm_eval/tasks/metabench/metabench_gsm8k.yaml                         +46  -0
lm_eval/tasks/metabench/metabench_gsm8k_secondary.yaml                +5  -0
lm_eval/tasks/metabench/metabench_hellaswag.yaml                     +23  -0
lm_eval/tasks/metabench/metabench_hellaswag_permute.yaml              +5  -0
lm_eval/tasks/metabench/metabench_hellaswag_secondary.yaml            +5  -0
lm_eval/tasks/metabench/metabench_hellaswag_secondary_permute.yaml    +5  -0
lm_eval/tasks/metabench/metabench_mmlu.yaml                          +20  -0
lm_eval/tasks/metabench/metabench_mmlu_permute.yaml                   +5  -0
lm_eval/tasks/metabench/metabench_mmlu_secondary.yaml                 +5  -0
lm_eval/tasks/metabench/metabench_mmlu_secondary_permute.yaml         +5  -0
lm_eval/tasks/metabench/metabench_permute.yaml                       +13  -0
lm_eval/tasks/metabench/metabench_secondary.yaml                     +14  -0
lm_eval/tasks/metabench/metabench_secondary_permute.yaml             +13  -0
lm_eval/tasks/metabench/metabench_truthfulqa.yaml                    +35  -0
lm_eval/tasks/metabench/metabench_truthfulqa_permute.yaml             +6  -0
lm_eval/tasks/metabench/metabench.yaml (new file, mode 100644)

group: metabench
task:
  - metabench_arc
  - metabench_gsm8k
  - metabench_hellaswag
  - metabench_mmlu
  - metabench_truthfulqa
  - metabench_winogrande
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
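This group file simply lists the six per-benchmark tasks and reports an unweighted mean accuracy over them; note that metabench_winogrande is referenced here even though its YAML is not among the 20 files shown on this page of the paginated diff. As a rough sketch of how the group would be invoked through the harness's Python API (the model and batch size are placeholders, not anything this commit prescribes):

    # Sketch only: evaluate the new "metabench" group end to end.
    # Model choice and batch size are illustrative placeholders.
    import lm_eval

    results = lm_eval.simple_evaluate(
        model="hf",                                      # HuggingFace backend
        model_args="pretrained=EleutherAI/pythia-1.4b",
        tasks=["metabench"],                             # the group defined above
        batch_size=8,
    )
    print(results["results"])  # per-task metrics plus the group-level mean acc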
lm_eval/tasks/metabench/metabench_arc.yaml (new file, mode 100644)

task: metabench_arc
tag:
  - metabench_arc_subset
dataset_path: HCAI/metabench
dataset_name: ARC
process_docs: !function process_docs.process_arc
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "{{twentyfive_shot_preprompt}}Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
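The `!function process_docs.process_arc` value tells the harness to import process_arc from a process_docs.py module that should sit next to this YAML and to run it over the raw HuggingFace split before any templating. That module is not part of this page of the diff, so the following is only a hypothetical sketch of the expected shape (a callable from datasets.Dataset to datasets.Dataset), with field names taken from the Jinja templates above:

    # Hypothetical sketch of the callable `!function process_docs.process_arc`
    # resolves to; the real body lives in process_docs.py (not shown here) and
    # may differ. Contract: accept and return a datasets.Dataset.
    import datasets

    def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
        def _format(doc):
            # The templates reference twentyfive_shot_preprompt, question,
            # choices (.label / .text), and answerKey, so those fields must
            # survive (or be created) here.
            doc["question"] = doc["question"].strip()
            return doc

        return dataset.map(_format)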
lm_eval/tasks/metabench/metabench_arc_permute.yaml (new file, mode 100644)

include: metabench_arc.yaml
task: metabench_arc_permute
process_docs: !function process_docs_permute.process_arc
metadata:
  version: 0.0
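An include pulls in every key of the named base file, with keys set in the including file taking precedence, so these five lines inherit the full ARC config and swap in only the permuting preprocessor. A conceptual sketch of that merge order (the !function values appear as plain strings here; the real loader resolves them to callables):

    # Conceptual sketch of `include` semantics: base keys first, then the
    # including file's keys win. Dicts mirror the two YAML files above.
    base = {
        "task": "metabench_arc",
        "test_split": "primary",
        "process_docs": "process_docs.process_arc",
    }
    override = {
        "task": "metabench_arc_permute",
        "process_docs": "process_docs_permute.process_arc",
    }
    merged = {**base, **override}
    assert merged["test_split"] == "primary"  # inherited from the base file
    assert merged["process_docs"] == "process_docs_permute.process_arc"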
lm_eval/tasks/metabench/metabench_arc_secondary.yaml (new file, mode 100644)

include: metabench_arc.yaml
task: metabench_arc_secondary
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_arc_secondary_permute.yaml (new file, mode 100644)

include: metabench_arc_permute.yaml
task: metabench_arc_secondary_permute
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_gsm8k.yaml (new file, mode 100644)

task: metabench_gsm8k
tag:
  - metabench_gsm8k_subset
dataset_path: HCAI/metabench
dataset_name: GSM8K
process_docs: !function process_docs.process_gsm8k
output_type: generate_until
training_split: null
validation_split: null
test_split: primary
doc_to_text: "{{five_shot_preprompt}}Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    regexes_to_ignore:
      - ","
      - "\\$"
      - "(?s).*#### "
      - "\\.$"
generation_kwargs:
  until:
    - "Question:"
    - "</s>"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
repeats: 1
num_fewshot: 0
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
      - function: "take_first"
metadata:
  version: 0.0
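Because output_type is generate_until, the model produces free-form text, and each filter pipeline reduces that text to a single candidate answer before exact_match is computed: strict-match only accepts the canonical "#### <number>" marker, while flexible-extract falls back to the last number-like token in the completion (group_select: -1). A standalone replay of the two regexes on a sample completion (illustrative only; the harness applies them through its filter pipeline):

    # Illustrative replay of the two answer-extraction filters defined above.
    import re

    completion = "Each tree is $8, so 3 trees cost 3 * 8 = 24 dollars.\n#### 24"

    # strict-match: require the canonical "#### <number>" marker.
    strict = re.search(r"#### (\-?[0-9\.\,]+)", completion)
    print(strict.group(1) if strict else "[invalid]")  # -> 24

    # flexible-extract: keep the last number-like match anywhere in the text.
    matches = re.findall(r"(-?[$0-9.,]{2,})|(-?[0-9]+)", completion)
    last = next(g for g in matches[-1] if g) if matches else "[invalid]"
    print(last)                                        # -> 24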
lm_eval/tasks/metabench/metabench_gsm8k_secondary.yaml (new file, mode 100644)

include: metabench_gsm8k.yaml
task: metabench_gsm8k_secondary
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_hellaswag.yaml (new file, mode 100644)

task: metabench_hellaswag
tag:
  - metabench_hellaswag_subset
dataset_path: HCAI/metabench
dataset_name: HellaSwag
process_docs: !function process_docs.process_hellaswag
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "{{ten_shot_preprompt}}{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "choices"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_hellaswag_permute.yaml (new file, mode 100644)

include: metabench_hellaswag.yaml
task: metabench_hellaswag_permute
process_docs: !function process_docs_permute.process_hellaswag
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_hellaswag_secondary.yaml (new file, mode 100644)

include: metabench_hellaswag.yaml
task: metabench_hellaswag_secondary
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_hellaswag_secondary_permute.yaml (new file, mode 100644)

include: metabench_hellaswag_permute.yaml
task: metabench_hellaswag_secondary_permute
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_mmlu.yaml (new file, mode 100644)

task: metabench_mmlu
tag:
  - metabench_mmlu_subset
dataset_path: HCAI/metabench
dataset_name: MMLU
process_docs: !function process_docs.process_mmlu
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "{{five_shot_preprompt}}{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
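doc_to_target: answer is an index into the doc_to_choice list, so an answer of 1 scores choice "B"; the prompt itself comes from the Jinja template in doc_to_text. A quick way to preview what a doc renders to, using plain jinja2 with made-up field values (the harness's own templating behaves the same for a simple case like this):

    # Preview the MMLU prompt template with a fabricated doc (sketch only).
    import jinja2

    template = jinja2.Template(
        "{{five_shot_preprompt}}{{question.strip()}}\n"
        "A. {{choices[0]}}\nB. {{choices[1]}}\n"
        "C. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
    )
    doc = {
        "five_shot_preprompt": "",
        "question": "What is the capital of France? ",
        "choices": ["London", "Paris", "Rome", "Madrid"],
        "answer": 1,  # doc_to_target: index 1 -> choice "B"
    }
    print(template.render(**doc))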
lm_eval/tasks/metabench/metabench_mmlu_permute.yaml (new file, mode 100644)

include: metabench_mmlu.yaml
task: metabench_mmlu_permute
process_docs: !function process_docs_permute.process_mmlu
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_mmlu_secondary.yaml (new file, mode 100644)

include: metabench_mmlu.yaml
task: metabench_mmlu_secondary
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_mmlu_secondary_permute.yaml (new file, mode 100644)

include: metabench_mmlu_permute.yaml
task: metabench_mmlu_secondary_permute
test_split: secondary
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_permute.yaml (new file, mode 100644)

group: metabench_permute
task:
  - metabench_arc_permute
  - metabench_hellaswag_permute
  - metabench_mmlu_permute
  - metabench_truthfulqa_permute
  - metabench_winogrande_permute
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_secondary.yaml (new file, mode 100644)

group: metabench_secondary
task:
  - metabench_arc_secondary
  - metabench_gsm8k_secondary
  - metabench_hellaswag_secondary
  - metabench_mmlu_secondary
  - metabench_truthfulqa_secondary
  - metabench_winogrande_secondary
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_secondary_permute.yaml (new file, mode 100644)

group: metabench_secondary_permute
task:
  - metabench_arc_secondary_permute
  - metabench_hellaswag_secondary_permute
  - metabench_mmlu_secondary_permute
  - metabench_truthfulqa_secondary_permute
  - metabench_winogrande_secondary_permute
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_truthfulqa.yaml (new file, mode 100644)

task: metabench_truthfulqa
tag:
  - metabench_truthfulqa_subset
dataset_path: HCAI/metabench
dataset_name: TruthfulQA
process_docs: !function process_docs.process_truthfulqa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: What is human life expectancy in the United States?\n\
  A: Human life expectancy in the United States is 78 years.\n\n\
  Q: Who was president of the United States in 1955?\n\
  A: Dwight D. Eisenhower was president of the United States in 1955.\n\n\
  Q: Which party did he belong to?\n\
  A: He belonged to the Republican Party.\n\n\
  Q: What is the square root of banana?\n\
  A: I have no comment.\n\n\
  Q: How does a telescope work?\n\
  A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n\
  Q: Where were the 1992 Olympics held?\n\
  A: The 1992 Olympics were held in Barcelona, Spain.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
lm_eval/tasks/metabench/metabench_truthfulqa_permute.yaml (new file, mode 100644)

include: metabench_truthfulqa.yaml
task: metabench_truthfulqa_permute
process_docs: !function process_docs_permute.process_truthfulqa
doc_to_target: answer
metadata:
  version: 0.0
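The base task's doc_to_target: 0 leans on the mc1 convention that the correct answer is always listed first in mc1_targets.choices; permuting the choices breaks that invariant, which is why this override points doc_to_target at an answer field the permuting preprocessor has to supply. A hypothetical sketch of that idea (the real process_docs_permute.process_truthfulqa is not shown in this diff and may differ):

    # Hypothetical sketch: shuffle mc1 choices so the gold answer is no longer
    # always index 0, and record its new index for doc_to_target: answer.
    import random

    def permute_truthfulqa_doc(doc, seed=42):
        choices = list(doc["mc1_targets"]["choices"])
        gold = choices[0]                    # mc1 lists the correct answer first
        random.Random(seed).shuffle(choices)
        doc["mc1_targets"]["choices"] = choices
        doc["answer"] = choices.index(gold)  # new position of the gold answer
        return doc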