Commit 4bff76d5 authored by lintangsutawika
Browse files

fixed stderr for metrics like brier_score

parent 5da401b0
...@@ -487,23 +487,34 @@ def evaluate( ...@@ -487,23 +487,34 @@ def evaluate(
# For unweighted averaging, use: # For unweighted averaging, use:
# current_size = 1 # current_size = 1
# TODO: Metrics like the Brier score have no stderr for
# an individual task, since the score is itself an
# aggregation. It is still possible to calculate the
# stderr over groups.
all_stderr = [] all_stderr = []
for metric in [ for metric in [
key for key in metrics.keys() if "_stderr" not in key key for key in metrics.keys() if "_stderr" not in key
]: ]:
stderr = "_stderr,".join(metric.split(",")) stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr] stderr_score = results[task][stderr]
if stderr_score == "N/A":
var_score = "N/A"
else:
var_score = stderr_score**2 var_score = stderr_score**2
metric_score = results[task][metric]
all_stderr.append(stderr) all_stderr.append(stderr)
metric_score = results[task][metric]
if metric in results[group]: if metric in results[group]:
results[group][metric] = ( results[group][metric] = (
results[group][metric] * total_size results[group][metric] * total_size
+ metric_score * current_size + metric_score * current_size
) / (total_size + current_size) ) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
if var_score == "N/A":
results[group][stderr] = "N/A"
else:
results[group][stderr] = ( results[group][stderr] = (
(total_size - 1) * results[group][stderr] (total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score + (current_size - 1) * var_score
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment