Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
e4db76cb
"app/vscode:/vscode.git/clone" did not exist on "873a35be0e9f77a4d40a096f43734e12cb4c8139"
Commit
e4db76cb
authored
Jul 09, 2024
by
haileyschoelkopf
Browse files
Merge branch 'main' into multimodal-prototyping
parents
6cc6e9cd
ad80f555
Changes
871
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
30 additions
and
171 deletions
+30
-171
lm_eval/tasks/translation/wmt14_en-fr.yaml
lm_eval/tasks/translation/wmt14_en-fr.yaml
+1
-2
lm_eval/tasks/translation/wmt14_fr-en.yaml
lm_eval/tasks/translation/wmt14_fr-en.yaml
+1
-2
lm_eval/tasks/translation/wmt16_de-en.yaml
lm_eval/tasks/translation/wmt16_de-en.yaml
+1
-2
lm_eval/tasks/translation/wmt16_en-de.yaml
lm_eval/tasks/translation/wmt16_en-de.yaml
+1
-2
lm_eval/tasks/translation/wmt16_en-ro.yaml
lm_eval/tasks/translation/wmt16_en-ro.yaml
+1
-2
lm_eval/tasks/translation/wmt16_ro-en.yaml
lm_eval/tasks/translation/wmt16_ro-en.yaml
+1
-2
lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
+1
-1
lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
+1
-1
lm_eval/tasks/unitxt/20_newsgroups.yaml
lm_eval/tasks/unitxt/20_newsgroups.yaml
+2
-2
lm_eval/tasks/unitxt/ag_news.yaml
lm_eval/tasks/unitxt/ag_news.yaml
+2
-2
lm_eval/tasks/unitxt/argument_topic.yaml
lm_eval/tasks/unitxt/argument_topic.yaml
+2
-2
lm_eval/tasks/unitxt/atis.yaml
lm_eval/tasks/unitxt/atis.yaml
+2
-2
lm_eval/tasks/unitxt/banking77.yaml
lm_eval/tasks/unitxt/banking77.yaml
+2
-2
lm_eval/tasks/unitxt/claim_stance_topic.yaml
lm_eval/tasks/unitxt/claim_stance_topic.yaml
+2
-2
lm_eval/tasks/unitxt/cnn_dailymail.yaml
lm_eval/tasks/unitxt/cnn_dailymail.yaml
+2
-2
lm_eval/tasks/unitxt/coedit_gec.yaml
lm_eval/tasks/unitxt/coedit_gec.yaml
+2
-2
lm_eval/tasks/unitxt/dbpedia_14.yaml
lm_eval/tasks/unitxt/dbpedia_14.yaml
+2
-2
lm_eval/tasks/unitxt/ethos_binary.yaml
lm_eval/tasks/unitxt/ethos_binary.yaml
+2
-2
lm_eval/tasks/unitxt/financial_tweets.yaml
lm_eval/tasks/unitxt/financial_tweets.yaml
+2
-2
lm_eval/tasks/unitxt/generate_yamls.py
lm_eval/tasks/unitxt/generate_yamls.py
+0
-135
No files found.
lm_eval/tasks/translation/wmt14_en-fr.yaml
View file @
e4db76cb
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["fr"]}}'
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["fr"]}}'
doc_to_text
:
'
English
phrase:
{{translation["en"]}}
doc_to_text
:
'
English
phrase:
{{translation["en"]}}
French
phrase:'
French
phrase:'
group
:
tag
:
-
generate_until
-
translation
-
translation
-
wmt14
-
wmt14
-
gpt3_translation_benchmarks
-
gpt3_translation_benchmarks
...
...
lm_eval/tasks/translation/wmt14_fr-en.yaml
View file @
e4db76cb
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
doc_to_text
:
'
French
phrase:
{{translation["fr"]}}
doc_to_text
:
'
French
phrase:
{{translation["fr"]}}
English
phrase:'
English
phrase:'
group
:
tag
:
-
generate_until
-
translation
-
translation
-
wmt14
-
wmt14
-
gpt3_translation_benchmarks
-
gpt3_translation_benchmarks
...
...
lm_eval/tasks/translation/wmt16_de-en.yaml
View file @
e4db76cb
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
doc_to_text
:
'
German
phrase:
{{translation["de"]}}
doc_to_text
:
'
German
phrase:
{{translation["de"]}}
English
phrase:'
English
phrase:'
group
:
tag
:
-
generate_until
-
translation
-
translation
-
wmt16
-
wmt16
-
gpt3_translation_benchmarks
-
gpt3_translation_benchmarks
...
...
lm_eval/tasks/translation/wmt16_en-de.yaml
View file @
e4db76cb
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["de"]}}'
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["de"]}}'
doc_to_text
:
'
English
phrase:
{{translation["en"]}}
doc_to_text
:
'
English
phrase:
{{translation["en"]}}
German
phrase:'
German
phrase:'
group
:
tag
:
-
generate_until
-
translation
-
translation
-
wmt16
-
wmt16
-
gpt3_translation_benchmarks
-
gpt3_translation_benchmarks
...
...
lm_eval/tasks/translation/wmt16_en-ro.yaml
View file @
e4db76cb
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["ro"]}}'
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["ro"]}}'
doc_to_text
:
'
English
phrase:
{{translation["en"]}}
doc_to_text
:
'
English
phrase:
{{translation["en"]}}
Romanian
phrase:'
Romanian
phrase:'
group
:
tag
:
-
generate_until
-
translation
-
translation
-
wmt16
-
wmt16
-
gpt3_translation_benchmarks
-
gpt3_translation_benchmarks
...
...
lm_eval/tasks/translation/wmt16_ro-en.yaml
View file @
e4db76cb
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
...
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
doc_to_text
:
'
Romanian
phrase:
{{translation["ro"]}}
doc_to_text
:
'
Romanian
phrase:
{{translation["ro"]}}
English
phrase:'
English
phrase:'
group
:
tag
:
-
generate_until
-
translation
-
translation
-
wmt16
-
wmt16
-
gpt3_translation_benchmarks
-
gpt3_translation_benchmarks
...
...
lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
View file @
e4db76cb
group
:
tag
:
-
truthfulqa
-
truthfulqa
task
:
truthfulqa_gen
task
:
truthfulqa_gen
dataset_path
:
truthful_qa
dataset_path
:
truthful_qa
...
...
lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
View file @
e4db76cb
group
:
tag
:
-
truthfulqa
-
truthfulqa
task
:
truthfulqa_mc1
task
:
truthfulqa_mc1
dataset_path
:
truthful_qa
dataset_path
:
truthful_qa
...
...
lm_eval/tasks/unitxt/20_newsgroups.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
20_newsgroups
task
:
20_newsgroups
dataset_name
:
card=cards.20_newsgroups,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.20_newsgroups,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/ag_news.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
ag_news
task
:
ag_news
dataset_name
:
card=cards.ag_news,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.ag_news,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/argument_topic.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
argument_topic
task
:
argument_topic
dataset_name
:
card=cards.argument_topic,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.argument_topic,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/atis.yaml
View file @
e4db76cb
include
:
unitxt_tasks.span_labeling.extraction
task
:
atis
task
:
atis
dataset_name
:
card=cards.atis,template=templates.span_labeling.extraction.title
include
:
unitxt
recipe
:
card=cards.atis,template=templates.span_labeling.extraction.title
lm_eval/tasks/unitxt/banking77.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
banking77
task
:
banking77
dataset_name
:
card=cards.banking77,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.banking77,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/claim_stance_topic.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
claim_stance_topic
task
:
claim_stance_topic
dataset_name
:
card=cards.claim_stance_topic,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.claim_stance_topic,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/cnn_dailymail.yaml
View file @
e4db76cb
include
:
unitxt_tasks.summarization.abstractive
task
:
cnn_dailymail
task
:
cnn_dailymail
dataset_name
:
card=cards.cnn_dailymail,template=templates.summarization.abstractive.full
include
:
unitxt
recipe
:
card=cards.cnn_dailymail,template=templates.summarization.abstractive.full
lm_eval/tasks/unitxt/coedit_gec.yaml
View file @
e4db76cb
include
:
unitxt_tasks.grammatical_error_correction
task
:
coedit_gec
task
:
coedit_gec
dataset_name
:
card=cards.coedit_gec,template=templates.grammatical_error_correction.simple
include
:
unitxt
recipe
:
card=cards.coedit_gec,template=templates.grammatical_error_correction.simple
lm_eval/tasks/unitxt/dbpedia_14.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
dbpedia_14
task
:
dbpedia_14
dataset_name
:
card=cards.dbpedia_14,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.dbpedia_14,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/ethos_binary.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
ethos_binary
task
:
ethos_binary
dataset_name
:
card=cards.ethos_binary,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.ethos_binary,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/financial_tweets.yaml
View file @
e4db76cb
include
:
unitxt_tasks.classification.multi_class
task
:
financial_tweets
task
:
financial_tweets
dataset_name
:
card=cards.financial_tweets,template=templates.classification.multi_class.title
include
:
unitxt
recipe
:
card=cards.financial_tweets,template=templates.classification.multi_class.title
lm_eval/tasks/unitxt/generate_yamls.py
deleted
100644 → 0
View file @
6cc6e9cd
#
# This file generates a set of LM eval harness YAML files
# that load unitxt datasets (https://github.com/IBM/unitxt)
#
import
unitxt_wrapper
import
yaml
from
unitxt.artifact
import
fetch_artifact
from
unitxt.standard
import
StandardRecipe
# This code is required to properly dump LM harness YAML that contains references to functions
def function_representer(dumper: "yaml.SafeDumper", func) -> "yaml.nodes.ScalarNode":
    """Represent a Python callable as a ``!function``-tagged YAML scalar.

    The scalar value is the dotted path ``<module>.<name>`` of ``func``.

    Args:
        dumper: The YAML dumper performing the serialization; only its
            ``represent_scalar`` method is used.
        func: The callable to reference. Its ``__module__`` and ``__name__``
            attributes are read.

    Returns:
        The node produced by ``dumper.represent_scalar`` (a scalar node, not
        a mapping node).
    """
    # Annotations are quoted so they are not evaluated when the function is
    # defined; they are purely informational.
    qualified_name = f"{func.__module__}.{func.__name__}"
    return dumper.represent_scalar("!function", qualified_name, style=None)
def write_task_yaml(filename, data):
    """Serialize a task configuration mapping to ``filename`` as YAML.

    The type of ``data["process_results"]`` is registered with
    ``function_representer`` before dumping, so that entry is emitted through
    that representer. Keys are written in their existing order.
    """
    callable_type = type(data["process_results"])
    yaml.add_representer(callable_type, function_representer)

    with open(filename, "w") as yaml_file:
        yaml.dump(data, stream=yaml_file, sort_keys=False)
def write_card_yaml(filename, data):
    """Serialize a card configuration mapping to ``filename`` as YAML.

    Keys are written in their existing order (no sorting).
    """
    with open(filename, "w") as yaml_file:
        yaml.dump(data, stream=yaml_file, sort_keys=False)
# Maps each supported Unitxt task id to the template used for it by default.
# Every key carries the "tasks." prefix and every value the "templates."
# prefix, so only the distinguishing suffixes are listed here.
default_template_per_task = {
    f"tasks.{task_suffix}": f"templates.{template_suffix}"
    for task_suffix, template_suffix in (
        ("classification.multi_label", "classification.multi_label.title"),
        ("classification.multi_class", "classification.multi_class.title"),
        ("summarization.abstractive", "summarization.abstractive.full"),
        ("regression.two_texts", "regression.two_texts.simple"),
        ("qa.with_context.extractive", "qa.with_context.simple"),
        ("grammatical_error_correction", "grammatical_error_correction.simple"),
        ("span_labeling.extraction", "span_labeling.extraction.title"),
    )
}
def generate_task_yaml(task: str):
    """
    Generate an LM Eval Harness YAML file based on a Unitxt task definition.

    The task definition is fetched with ``fetch_artifact``; a common base
    configuration (defined inline below) is filled in with the specific
    metrics of the task and written to a file named ``unitxt_<task>`` in the
    current working directory.
    It still leaves the 'dataset_name' and 'task' name unspecified.

    Args:
        task: Unitxt task id, e.g. an entry of ``default_template_per_task``.
    """
    print("*" * 80)
    print("*")
    print(f"* Generating YAML base file for task {task}")
    print("*")

    task_definition, _ = fetch_artifact(task)

    # One harness metric entry per Unitxt metric; the "metrics." prefix is
    # rewritten to "unitxt_" to form the harness-side metric name.
    metric_list = [
        {
            "metric": metric_name.replace("metrics.", "unitxt_"),
            "aggregation": "unitxt",
            "higher_is_better": True,
        }
        for metric_name in task_definition.metrics
    ]

    # Key order matters: the file is dumped with sort_keys=False.
    data = {
        "group": ["unitxt"],
        "dataset_path": "unitxt/data",
        "output_type": "generate_until",
        "training_split": "train",
        "validation_split": "test",
        "doc_to_text": "{{source}}",
        "doc_to_target": "target",
        "process_results": unitxt_wrapper.process_results,
        "generation_kwargs": {"until": ["</s>"]},
        "metric_list": metric_list,
        # Was misspelled "verison", which left the task without a
        # recognizable version entry.
        "metadata": {"version": 1.0},
    }

    write_task_yaml(f"unitxt_{task}", data)
def generate_card_yaml(card: str):
    """
    Generate an LM Eval Harness YAML file based on the Unitxt dataset card.

    The generated file includes the task YAML for the dataset and overrides
    'dataset_name' and 'task' with values derived from the card. A small
    sample of the dataset is loaded and printed before the file is written
    to '<card>.yaml'.

    Raises:
        ValueError: if the card's task has no entry in
            'default_template_per_task'.
    """
    banner_edge = "*"
    print(banner_edge * 80)
    print(banner_edge)
    print(f"* Generating YAML file for unitxt dataset {card}")
    print(banner_edge)

    card_id = f"cards.{card}"
    card_definition, _ = fetch_artifact(card_id)
    task = card_definition.task.__id__

    if task not in default_template_per_task:
        raise ValueError(
            f"Default template was not defined for task {task} in 'default_template_per_task' dict in generate_yamls.py"
        )
    template = default_template_per_task[task]

    data = {
        "include": f"unitxt_{task}",
        "task": card,
        "dataset_name": f"card=cards.{card},template={template}",
    }

    # Build the recipe directly with a small loader limit; the original
    # author notes this is faster than going through load_dataset on
    # 'unitxt/data' with the same dataset name.
    recipe = StandardRecipe(card=card_id, template=template, loader_limit=100)
    dataset = recipe().to_dataset()
    print(dataset)

    first_test_example = dataset["test"][0]
    print("Sample input:")
    print(first_test_example["source"])
    print("Sample output:")
    print(first_test_example["target"])

    write_card_yaml(f"{card}.yaml", data)
def main():
    """Generate all task YAML files, then one YAML file per listed dataset.

    First, a base YAML is generated for every task in
    ``default_template_per_task``. Then the file ``unitxt_datasets`` in the
    current working directory is read line by line:

    * a line starting with ``### END ###`` terminates the program with exit
      status 0;
    * blank lines and lines starting with ``#`` are skipped;
    * any other line is treated as a card name and passed to
      ``generate_card_yaml``.

    Any exception raised while generating a file is reported on stdout and
    then re-raised.
    """
    for task in default_template_per_task:
        try:
            generate_task_yaml(task)
        except Exception as e:
            print(f"Unable to generate YAML for {task} due to:")
            print(e)
            # Bare raise keeps the original traceback intact.
            raise

    with open("unitxt_datasets") as f:
        for line in f:
            unitxt_dataset = line.strip()
            if unitxt_dataset.startswith("### END ###"):
                # Same effect as exit(0), without relying on the helper that
                # the `site` module injects for interactive sessions.
                raise SystemExit(0)
            # Blank lines carry no card name; previously they were passed on
            # as an empty card and failed during artifact lookup.
            if not unitxt_dataset or unitxt_dataset.startswith("#"):
                continue
            try:
                generate_card_yaml(unitxt_dataset)
            except Exception as e:
                print(f"Unable to generate YAML for {unitxt_dataset} due to:")
                print(e)
                raise
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Prev
1
…
37
38
39
40
41
42
43
44
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment