Commit 00859825 authored by lintangsutawika
Browse files

added stderr reprocessing for groups

parent 1dc8f96f
...@@ -219,7 +219,6 @@ def evaluate( ...@@ -219,7 +219,6 @@ def evaluate(
padding_requests = collections.defaultdict(int) padding_requests = collections.defaultdict(int)
# store the hierarchy to do proper ordering # store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list) task_hierarchy = collections.defaultdict(list)
group_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups # store the ordering of tasks and groups
task_order = collections.defaultdict(int) task_order = collections.defaultdict(int)
# store the aggregation for aggregating across tasks in the same group # store the aggregation for aggregating across tasks in the same group
...@@ -450,27 +449,8 @@ def evaluate( ...@@ -450,27 +449,8 @@ def evaluate(
group_name = None group_name = None
agg_fn = task.aggregation()[metric] agg_fn = task.aggregation()[metric]
task_score = agg_fn(items) results[task_name][metric_key] = agg_fn(items)
task_size = len(items) results[task_name]["samples"] = len(items)
# if group_name is not None:
# sample_metric_key = metric + "(sample agg)," + key
# for grouping in task_to_group[task_name]:
# if metric_key in results[grouping]:
# results[grouping][metric_key].append(task_score)
# results[grouping]["size"].append(task_size)
# else:
# results[grouping][metric_key] = [task_score]
# results[grouping]["size"] = [task_size]
# if sample_metric_key in results[grouping]:
# results[grouping][sample_metric_key] += items
# else:
# results[grouping][sample_metric_key] = items.copy()
# sample_agg_fn[grouping][sample_metric_key] = agg_fn
results[task_name][metric_key] = task_score
results[task_name]["size"] = task_size
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this # so we run them less iterations. still looking for a cleaner way to do this
...@@ -494,28 +474,29 @@ def evaluate( ...@@ -494,28 +474,29 @@ def evaluate(
for task in task_list: for task in task_list:
metrics = results[task] metrics = results[task]
if "size" in metrics: current_size = metrics.pop("samples")
current_size = metrics.pop("size") # if "size" in metrics:
else: # current_size = metrics.pop("size")
current_size = 1 # else:
# current_size = 1
for metric in [key for key in metrics.keys()]: for metric in [key for key in metrics.keys() if "_stderr" not in key]:
if "_stderr" in metric:
print(metric)
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
metric_score = results[task][metric] metric_score = results[task][metric]
if metric in results[group]: if metric in results[group]:
results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size) results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
results[group][stderr] = ((total_size-1)*results[group][stderr]+(current_size-1)*stderr_score)/(total_size + current_size - 1) \
+ total_size*current_size/((total_size+current_size)*(total_size+current_size-1))*(results[group][metric] - metric_score)**2
else: else:
results[group][metric] = metric_score results[group][metric] = metric_score
results[group][stderr] = stderr_score
# Different formula for agg stderr
total_size += current_size total_size += current_size
results[group]["samples"] = total_size
for task_name, task in task_dict.items(): for task_name, task in task_dict.items():
if type(task) == tuple: if type(task) == tuple:
......
...@@ -11,6 +11,3 @@ metric_list: ...@@ -11,6 +11,3 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment