Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
cda25fef
Unverified
Commit
cda25fef
authored
Jan 02, 2024
by
Lintang Sutawika
Committed by
GitHub
Jan 02, 2024
Browse files
Merge branch 'main' into standardize_metrics
parents
dfb41835
4d10ad56
Changes
249
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
30 additions
and
41 deletions
+30
-41
lm_eval/tasks/hellaswag/hellaswag.yaml
lm_eval/tasks/hellaswag/hellaswag.yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/commonsense.yaml
lm_eval/tasks/hendrycks_ethics/commonsense.yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/deontology.yaml
lm_eval/tasks/hendrycks_ethics/deontology.yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/justice.yaml
lm_eval/tasks/hendrycks_ethics/justice.yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml
lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml
+1
-1
lm_eval/tasks/hendrycks_ethics/virtue.yaml
lm_eval/tasks/hendrycks_ethics/virtue.yaml
+1
-1
lm_eval/tasks/ifeval/ifeval.yaml
lm_eval/tasks/ifeval/ifeval.yaml
+1
-1
lm_eval/tasks/ifeval/instructions_registry.py
lm_eval/tasks/ifeval/instructions_registry.py
+9
-18
lm_eval/tasks/ifeval/instructions_util.py
lm_eval/tasks/ifeval/instructions_util.py
+0
-1
lm_eval/tasks/kmmlu/_default_kmmlu_yaml
lm_eval/tasks/kmmlu/_default_kmmlu_yaml
+4
-5
lm_eval/tasks/kmmlu/kmmlu_agricultural_sciences.yaml
lm_eval/tasks/kmmlu/kmmlu_agricultural_sciences.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_aviation_engineering_and_maintenance.yaml
...sks/kmmlu/kmmlu_aviation_engineering_and_maintenance.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_chemical_engineering.yaml
lm_eval/tasks/kmmlu/kmmlu_chemical_engineering.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_civil_engineering.yaml
lm_eval/tasks/kmmlu/kmmlu_civil_engineering.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_computer_science.yaml
lm_eval/tasks/kmmlu/kmmlu_computer_science.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_criminal_law.yaml
lm_eval/tasks/kmmlu/kmmlu_criminal_law.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_electrical_engineering.yaml
lm_eval/tasks/kmmlu/kmmlu_electrical_engineering.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_electronics_engineering.yaml
lm_eval/tasks/kmmlu/kmmlu_electronics_engineering.yaml
+1
-1
lm_eval/tasks/kmmlu/kmmlu_energy_management.yaml
lm_eval/tasks/kmmlu/kmmlu_energy_management.yaml
+1
-1
No files found.
lm_eval/tasks/hellaswag/hellaswag.yaml
View file @
cda25fef
...
...
@@ -19,4 +19,4 @@ metric_list:
aggregation
:
mean
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/hendrycks_ethics/commonsense.yaml
View file @
cda25fef
...
...
@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
metric_list
:
-
metric
:
acc
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/hendrycks_ethics/deontology.yaml
View file @
cda25fef
...
...
@@ -5,5 +5,5 @@ doc_to_text: "Question: Would most people believe this reasonable or unreasonabl
doc_to_target
:
label
doc_to_choice
:
[
'
unreasonable'
,
'
reasonable'
]
metadata
:
-
version
:
1.0
version
:
1.0
# TODO: implement exact-match metric for this subset
lm_eval/tasks/hendrycks_ethics/justice.yaml
View file @
cda25fef
...
...
@@ -6,4 +6,4 @@ dataset_name: justice
doc_to_text
:
"
Question:
Would
most
people
believe
this
reasonable
or
unreasonable
to
say?
\"
{{scenario}}
\"\n
Answer:"
# TODO: impl. exact match for this and deontology
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
View file @
cda25fef
...
...
@@ -9,4 +9,4 @@ doc_to_choice: ['no', 'yes']
metric_list
:
-
metric
:
acc
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml
View file @
cda25fef
...
...
@@ -13,4 +13,4 @@
# - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually
# metadata:
#
-
version: 1.0
# version: 1.0
lm_eval/tasks/hendrycks_ethics/virtue.yaml
View file @
cda25fef
...
...
@@ -7,4 +7,4 @@ doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sente
doc_to_target
:
label
doc_to_choice
:
[
'
no'
,
'
yes'
]
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/ifeval/ifeval.yaml
View file @
cda25fef
...
...
@@ -26,4 +26,4 @@ metric_list:
aggregation
:
!function
utils.agg_inst_level_acc
higher_is_better
:
true
metadata
:
-
version
:
1.0
version
:
1.0
lm_eval/tasks/ifeval/instructions_registry.py
View file @
cda25fef
...
...
@@ -78,8 +78,7 @@ INSTRUCTION_CONFLICTS = {
# _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
_KEYWORD
+
"forbidden_words"
:
{
_KEYWORD
+
"forbidden_words"
},
_KEYWORD
+
"letter_frequency"
:
{
_KEYWORD
+
"letter_frequency"
},
_LANGUAGE
+
"response_language"
:
{
_LANGUAGE
+
"response_language"
:
{
_LANGUAGE
+
"response_language"
,
_FORMAT
+
"multiple_sections"
,
_KEYWORD
+
"existence"
,
...
...
@@ -90,16 +89,14 @@ INSTRUCTION_CONFLICTS = {
_CHANGE_CASES
+
"english_lowercase"
,
},
_LENGTH
+
"number_sentences"
:
{
_LENGTH
+
"number_sentences"
},
_LENGTH
+
"number_paragraphs"
:
{
_LENGTH
+
"number_paragraphs"
:
{
_LENGTH
+
"number_paragraphs"
,
_LENGTH
+
"nth_paragraph_first_word"
,
_LENGTH
+
"number_sentences"
,
_LENGTH
+
"nth_paragraph_first_word"
,
},
_LENGTH
+
"number_words"
:
{
_LENGTH
+
"number_words"
},
_LENGTH
+
"nth_paragraph_first_word"
:
{
_LENGTH
+
"nth_paragraph_first_word"
:
{
_LENGTH
+
"nth_paragraph_first_word"
,
_LENGTH
+
"number_paragraphs"
,
},
...
...
@@ -110,23 +107,20 @@ INSTRUCTION_CONFLICTS = {
# _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
_FORMAT
+
"constrained_response"
:
set
(
INSTRUCTION_DICT
.
keys
()),
_FORMAT
+
"number_highlighted_sections"
:
{
_FORMAT
+
"number_highlighted_sections"
},
_FORMAT
+
"multiple_sections"
:
{
_FORMAT
+
"multiple_sections"
:
{
_FORMAT
+
"multiple_sections"
,
_LANGUAGE
+
"response_language"
,
_FORMAT
+
"number_highlighted_sections"
,
},
# TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
# _FORMAT + "rephrase": instructions.RephraseChecker,
_FORMAT
+
"json_format"
:
set
(
INSTRUCTION_DICT
.
keys
()).
difference
(
_FORMAT
+
"json_format"
:
set
(
INSTRUCTION_DICT
.
keys
()).
difference
(
{
_KEYWORD
+
"forbidden_words"
,
_KEYWORD
+
"existence"
}
),
_FORMAT
+
"title"
:
{
_FORMAT
+
"title"
},
# TODO(tianjianlu): Re-enable with specific prompts.
# _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
_COMBINATION
+
"two_responses"
:
set
(
INSTRUCTION_DICT
.
keys
()).
difference
(
_COMBINATION
+
"two_responses"
:
set
(
INSTRUCTION_DICT
.
keys
()).
difference
(
{
_KEYWORD
+
"forbidden_words"
,
_KEYWORD
+
"existence"
,
...
...
@@ -135,20 +129,17 @@ INSTRUCTION_CONFLICTS = {
_PUNCTUATION
+
"no_comma"
,
}
),
_COMBINATION
+
"repeat_prompt"
:
set
(
INSTRUCTION_DICT
.
keys
()).
difference
(
_COMBINATION
+
"repeat_prompt"
:
set
(
INSTRUCTION_DICT
.
keys
()).
difference
(
{
_KEYWORD
+
"existence"
,
_FORMAT
+
"title"
,
_PUNCTUATION
+
"no_comma"
}
),
_STARTEND
+
"end_checker"
:
{
_STARTEND
+
"end_checker"
},
_CHANGE_CASES
+
"capital_word_frequency"
:
{
_CHANGE_CASES
+
"capital_word_frequency"
:
{
_CHANGE_CASES
+
"capital_word_frequency"
,
_CHANGE_CASES
+
"english_lowercase"
,
_CHANGE_CASES
+
"english_capital"
,
},
_CHANGE_CASES
+
"english_capital"
:
{
_CHANGE_CASES
+
"english_capital"
},
_CHANGE_CASES
+
"english_lowercase"
:
{
_CHANGE_CASES
+
"english_lowercase"
:
{
_CHANGE_CASES
+
"english_lowercase"
,
_CHANGE_CASES
+
"english_capital"
,
},
...
...
lm_eval/tasks/ifeval/instructions_util.py
View file @
cda25fef
...
...
@@ -17,7 +17,6 @@
import
functools
import
random
import
re
from
typing
import
List
import
immutabledict
import
nltk
...
...
lm_eval/tasks/kmmlu/_default_kmmlu_yaml
View file @
cda25fef
...
...
@@ -6,10 +6,9 @@ validation_split: dev
test_split: test
fewshot_split: dev
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question}}"
doc_to_choice: "{{choices}}"
doc_to_target: "{{gold}}"
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}"
metric_list:
- metric: acc
aggregation: mean
...
...
@@ -18,4 +17,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-
version:
0.0
version:
1.1
lm_eval/tasks/kmmlu/kmmlu_agricultural_sciences.yaml
View file @
cda25fef
"
dataset_name"
:
"
Agricultural
Sciences"
"
dataset_name"
:
"
Agricultural
-
Sciences"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_agricultural_sciences"
lm_eval/tasks/kmmlu/kmmlu_aviation_engineering_and_maintenance.yaml
View file @
cda25fef
"
dataset_name"
:
"
Aviation
Engineering
and
Maintenance"
"
dataset_name"
:
"
Aviation
-
Engineering
-
and
-
Maintenance"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_aviation_engineering_and_maintenance"
lm_eval/tasks/kmmlu/kmmlu_chemical_engineering.yaml
View file @
cda25fef
"
dataset_name"
:
"
Chemical
Engineering"
"
dataset_name"
:
"
Chemical
-
Engineering"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_chemical_engineering"
lm_eval/tasks/kmmlu/kmmlu_civil_engineering.yaml
View file @
cda25fef
"
dataset_name"
:
"
Civil
Engineering"
"
dataset_name"
:
"
Civil
-
Engineering"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_civil_engineering"
lm_eval/tasks/kmmlu/kmmlu_computer_science.yaml
View file @
cda25fef
"
dataset_name"
:
"
Computer
Science"
"
dataset_name"
:
"
Computer
-
Science"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_computer_science"
lm_eval/tasks/kmmlu/kmmlu_criminal_law.yaml
View file @
cda25fef
"
dataset_name"
:
"
Criminal
Law"
"
dataset_name"
:
"
Criminal
-
Law"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_criminal_law"
lm_eval/tasks/kmmlu/kmmlu_electrical_engineering.yaml
View file @
cda25fef
"
dataset_name"
:
"
Electrical
Engineering"
"
dataset_name"
:
"
Electrical
-
Engineering"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_electrical_engineering"
lm_eval/tasks/kmmlu/kmmlu_electronics_engineering.yaml
View file @
cda25fef
"
dataset_name"
:
"
Electronics
Engineering"
"
dataset_name"
:
"
Electronics
-
Engineering"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_electronics_engineering"
lm_eval/tasks/kmmlu/kmmlu_energy_management.yaml
View file @
cda25fef
"
dataset_name"
:
"
Energy
Management"
"
dataset_name"
:
"
Energy
-
Management"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_energy_management"
Prev
1
2
3
4
5
6
7
8
9
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment