Commit 00859825 authored by lintangsutawika

added stderr reprocessing for groups

parent 1dc8f96f
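For context on the change below: the group-level stderr "reprocessing" is a running two-sample combination. Each task contributes a score, a stderr, and a sample count, and the group statistics are folded in one task at a time using the combined-sample-variance identity quoted in the diff. The sketch below is illustrative only (it is not part of the commit, and the names GroupStat and pool are made up); it works with variances for clarity, whereas the diff applies the same update directly to the reported stderr values.

# Minimal sketch: fold per-task (mean, variance, size) into running group stats.
from dataclasses import dataclass

@dataclass
class GroupStat:
    mean: float = 0.0
    var: float = 0.0   # combined sample variance
    n: int = 0         # samples pooled so far

def pool(group: GroupStat, task_mean: float, task_var: float, m: int) -> GroupStat:
    """Combine the running group statistics with one task's statistics."""
    if group.n == 0:
        return GroupStat(task_mean, task_var, m)
    n = group.n
    # weighted mean of the two samples
    mean = (n * group.mean + m * task_mean) / (n + m)
    # combined sample variance:
    #   s_z^2 = ((n-1)s_x^2 + (m-1)s_y^2)/(n+m-1)
    #           + n*m*(xbar - ybar)^2 / ((n+m)*(n+m-1))
    var = ((n - 1) * group.var + (m - 1) * task_var) / (n + m - 1) \
        + n * m * (group.mean - task_mean) ** 2 / ((n + m) * (n + m - 1))
    return GroupStat(mean, var, n + m)

# illustrative numbers only
g = GroupStat()
for task_mean, task_var, size in [(0.62, 0.04, 400), (0.55, 0.05, 250)]:
    g = pool(g, task_mean, task_var, size)
print(g.mean, g.var ** 0.5, g.n)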
@@ -219,7 +219,6 @@ def evaluate(
padding_requests = collections.defaultdict(int)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
group_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
# store the aggregation for aggregating across tasks in the same group
@@ -450,27 +449,8 @@ def evaluate(
group_name = None
agg_fn = task.aggregation()[metric]
task_score = agg_fn(items)
task_size = len(items)
# if group_name is not None:
# sample_metric_key = metric + "(sample agg)," + key
# for grouping in task_to_group[task_name]:
# if metric_key in results[grouping]:
# results[grouping][metric_key].append(task_score)
# results[grouping]["size"].append(task_size)
# else:
# results[grouping][metric_key] = [task_score]
# results[grouping]["size"] = [task_size]
# if sample_metric_key in results[grouping]:
# results[grouping][sample_metric_key] += items
# else:
# results[grouping][sample_metric_key] = items.copy()
# sample_agg_fn[grouping][sample_metric_key] = agg_fn
results[task_name][metric_key] = task_score
results[task_name]["size"] = task_size
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
@@ -494,28 +474,29 @@ def evaluate(
for task in task_list:
metrics = results[task]
if "size" in metrics:
current_size = metrics.pop("size")
else:
current_size = 1
for metric in [key for key in metrics.keys()]:
current_size = metrics.pop("samples")
# if "size" in metrics:
# current_size = metrics.pop("size")
# else:
# current_size = 1
if "_stderr" in metric:
print(metric)
for metric in [key for key in metrics.keys() if "_stderr" not in key]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
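# mapping to the identity above: n = total_size (group samples pooled so far),
# m = current_size (this task), s_x/s_y = the group/task stderr values,
# xbar/ybar = the group/task metric scores; the code plugs the reported
# stderr values into this combined-variance identity, and the squared-difference
# term below uses the group metric that was just updated on the previous line.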
results[group][stderr] = ((total_size-1)*results[group][stderr]+(current_size-1)*stderr_score)/(total_size + current_size - 1) \
+ total_size*current_size/((total_size+current_size)*(total_size+current_size-1))*(results[group][metric] - metric_score)**2
else:
results[group][metric] = metric_score
# Different formula for agg stderr
results[group][stderr] = stderr_score
total_size += current_size
results[group]["samples"] = total_size
for task_name, task in task_dict.items():
if type(task) == tuple:
......
@@ -11,6 +11,3 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true