Commit 17172a26 authored by lintangsutawika
Browse files

temp save

parent 81b8e670
......@@ -38,6 +38,7 @@ def simple_evaluate(
write_out: bool = False,
log_samples: bool = True,
gen_kwargs: str = None,
weight_by_size: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -155,6 +156,7 @@ def simple_evaluate(
decontamination_ngrams_path=decontamination_ngrams_path,
write_out=write_out,
log_samples=log_samples,
weight_by_size=weight_by_size,
)
if lm.rank == 0:
......@@ -192,6 +194,7 @@ def evaluate(
decontamination_ngrams_path=None,
write_out: bool = False,
log_samples: bool = True,
weight_by_size: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -474,17 +477,22 @@ def evaluate(
total_size = 0
for task in task_list:
print("###")
print(task)
print(metrics)
print("###")
metrics = results[task].copy()
if "alias" in metrics:
metrics.pop("alias")
current_size = metrics.pop("samples")
# TODO: There should be a way for users
# to toggle between weighted and
# unweighted averaging
# For unweighted averaging, use:
# current_size = 1
if weight_by_size:
current_size = metrics.pop("samples")
else:
current_size = 1
all_stderr = []
for metric in [
......
......@@ -4,9 +4,10 @@ task:
- group: arc_stuff
task:
- arc_challenge
- glue
- task: arc_easy
metric_list:
- metric: acc
num_fewshot: 3
# - task: mmlu_stem
# num_fewshot: 2
\ No newline at end of file
# - task: mmlu
# num_fewshot: 2
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment