Commit e37698df authored by lintangsutawika

update on metrics and delete files

parent 1bc408ff
@@ -555,12 +555,17 @@ class ConfigurableTask(Task):
                     if key not in ["metric", "aggregation", "higher_is_better"]
                 }
-                if self._config.process_results is None:
-                    self._metric_fn_list[metric_name] = get_metric(metric_name)
-                    self._metric_fn_kwargs[metric_name] = kwargs
-                else:
+                if self._config.process_results is not None:
                     self._metric_fn_list[metric_name] = None
                     self._metric_fn_kwargs[metric_name] = {}
+                elif callable(metric_name):
+                    metric_fn = metric_name.__call__
+                    metric_name = metric_name.__name__
+                    self._metric_fn_list[metric_name] = metric_fn
+                    self._metric_fn_kwargs[metric_name] = kwargs
+                else:
+                    self._metric_fn_list[metric_name] = get_metric(metric_name)
+                    self._metric_fn_kwargs[metric_name] = kwargs
                 if "aggregation" in metric_config:
                     agg_name = metric_config["aggregation"]
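The new branching reads: an explicit `process_results` hook in the task config disables per-metric functions entirely; a callable passed as the metric is registered under its `__name__`; anything else is treated as a registry name. A standalone sketch of that resolution order, with a hypothetical `resolve_metric` helper and `get_metric` standing in for the harness's metric-registry lookup:

```python
# Sketch of the new resolution order; `resolve_metric` is a hypothetical
# helper, not code from the harness itself.
def resolve_metric(metric_name, kwargs, process_results, get_metric):
    if process_results is not None:
        # The task supplies its own process_results hook, so no
        # per-metric function is needed; only the name is recorded.
        return metric_name, None, {}
    elif callable(metric_name):
        # A function passed directly in the config is registered
        # under its __name__, so results stay keyed by strings.
        return metric_name.__name__, metric_name.__call__, kwargs
    else:
        # A plain string is looked up in the metric registry.
        return metric_name, get_metric(metric_name), kwargs
```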
@@ -987,6 +992,7 @@ class ConfigurableTask(Task):
             choices = self.doc_to_choice(doc)
             gold = choices[gold]
+        print(self._metric_fn_list)
         for key, result in zip(self._metric_fn_list.keys(), results):
             if self.multiple_target:
                 # in the case where we have multiple targets,
......
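The added `print(self._metric_fn_list)` is a debug trace over the metric table consumed by the loop below it. Note that `zip(self._metric_fn_list.keys(), results)` pairs positionally, so it assumes `results` arrives in the same order the metrics were registered; a toy illustration of that assumption:

```python
# zip pairs positionally, so results must arrive in the same order
# the metrics were registered (dicts preserve insertion order).
metric_fn_list = {"exact_match": None, "f1": None}
results = [1.0, 0.5]
paired = dict(zip(metric_fn_list.keys(), results))
assert paired == {"exact_match": 1.0, "f1": 0.5}
```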
@@ -419,10 +419,14 @@ def evaluate(
             versions[group] = "N/A"
     results_dict = {
-        "results": dict(results),
-        **({"aggregate": dict(aggregate)} if bool(aggregate) else {}),
-        "configs": dict(configs),
-        "versions": dict(versions),
+        "results": dict(sorted(results.items())),
+        **(
+            {"aggregate": dict(sorted(aggregate.items()))}
+            if bool(aggregate)
+            else {}
+        ),
+        "configs": dict(sorted(configs.items())),
+        "versions": dict(sorted(versions.items())),
     }
     if log_samples:
         results_dict["samples"] = dict(samples)
......
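Sorting the items before rebuilding each dict pins the key order (Python dicts preserve insertion order), so the serialized results no longer depend on task completion order. A minimal demonstration:

```python
import json

# dict(sorted(...)) fixes key order regardless of completion order,
# so serialized results diff cleanly across runs.
results = {"rte": {"acc": 0.7}, "cb": {"acc": 0.6}}
ordered = dict(sorted(results.items()))
print(json.dumps(ordered))  # {"cb": {"acc": 0.6}, "rte": {"acc": 0.7}}
```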
@@ -30,6 +30,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
+    output_type: greedy_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -37,17 +38,17 @@ task:
         ignore_case: true
         ignore_punctuation: true
   # Natural Language Inference
-  - dataset_path: super_glue
-    dataset_name: rte
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
+  # - dataset_path: super_glue
+  #   dataset_name: rte
+  #   use_prompt: promptsource:*
+  #   training_split: train
+  #   validation_split: validation
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
   # # Natural Language Inference
   # # - dataset_path: anli
   # # use_prompt: promptsource:*
......
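The added `output_type: greedy_until` switches this task to scoring generated text rather than per-choice log-likelihoods, which is why `exact_match` carries the `ignore_case`/`ignore_punctuation` flags. A plausible reading of that normalization, not the harness's exact implementation:

```python
import string

# Hedged sketch: exact string match after optional lowercasing and
# punctuation stripping, as the metric kwargs suggest.
def exact_match(prediction, reference, ignore_case=True, ignore_punctuation=True):
    if ignore_case:
        prediction, reference = prediction.lower(), reference.lower()
    if ignore_punctuation:
        table = str.maketrans("", "", string.punctuation)
        prediction, reference = prediction.translate(table), reference.translate(table)
    return float(prediction.strip() == reference.strip())

assert exact_match("True.", "true") == 1.0
```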
@@ -15,5 +15,5 @@ metric_list:
     higher_is_better: true
     ignore_case: true
     ignore_punctuation: true
-  # - metric: f1
-  #   aggregation: !function "aggregate.cb_multi_fi"
+  - metric: f1
+    aggregation: !function "aggregate.cb_multi_fi"
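Uncommenting these lines re-enables a custom F1 aggregation resolved through the `!function` YAML tag. A minimal sketch of the kind of constructor that turns `!function "aggregate.cb_multi_fi"` into a Python callable; the harness's actual resolver also handles module paths relative to the config file:

```python
import importlib

import yaml

# Register a "!function" tag that resolves "module.attr" strings
# to the named callable at load time.
def function_constructor(loader, node):
    module_name, func_name = loader.construct_scalar(node).rsplit(".", 1)
    return getattr(importlib.import_module(module_name), func_name)

yaml.SafeLoader.add_constructor("!function", function_constructor)
```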
@@ -6,7 +6,7 @@ dataset_name: rte
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: "{{sentence1}}\nQuestion: {{sentence2}} True or False?\nAnswer:"
+doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True or False?\nAnswer:"
 doc_to_target: label
 doc_to_choice: ['True', 'False']
 metric_list:
......
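The template fix matters because SuperGLUE's `rte` documents expose `premise`/`hypothesis` fields; `sentence1`/`sentence2` belong to the GLUE variant, so under Jinja's default undefined handling the old template would render empty slots. A quick check with a shortened sample doc:

```python
from jinja2 import Template

# Render the corrected template against SuperGLUE RTE's field names
# (sample doc invented for illustration).
doc = {"premise": "The cat sat on the mat.", "hypothesis": "A cat is on a mat."}
template = Template("{{premise}}\nQuestion: {{hypothesis}} True or False?\nAnswer:")
print(template.render(**doc))
```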