Commit 4bff76d5 authored by lintangsutawika
Browse files

fixed stderr for metrics like brier_score

parent 5da401b0
...@@ -487,16 +487,24 @@ def evaluate( ...@@ -487,16 +487,24 @@ def evaluate(
# For unweighted averaging, use: # For unweighted averaging, use:
# current_size = 1 # current_size = 1
# TODO: Metrics like the Brier score have no stderr
# for individual tasks, since the score is itself
# an aggregation. It is still possible, however, to
# calculate the stderr over groups.
all_stderr = [] all_stderr = []
for metric in [ for metric in [
key for key in metrics.keys() if "_stderr" not in key key for key in metrics.keys() if "_stderr" not in key
]: ]:
stderr = "_stderr,".join(metric.split(",")) stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr] stderr_score = results[task][stderr]
var_score = stderr_score**2 if stderr_score == "N/A":
metric_score = results[task][metric] var_score = "N/A"
else:
var_score = stderr_score**2
all_stderr.append(stderr)
all_stderr.append(stderr) metric_score = results[task][metric]
if metric in results[group]: if metric in results[group]:
results[group][metric] = ( results[group][metric] = (
...@@ -504,17 +512,20 @@ def evaluate( ...@@ -504,17 +512,20 @@ def evaluate(
+ metric_score * current_size + metric_score * current_size
) / (total_size + current_size) ) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
results[group][stderr] = ( if var_score == "N/A":
(total_size - 1) * results[group][stderr] results[group][stderr] = "N/A"
+ (current_size - 1) * var_score else:
) / ( results[group][stderr] = (
total_size + current_size - 1 (total_size - 1) * results[group][stderr]
) + total_size * current_size / ( + (current_size - 1) * var_score
(total_size + current_size) ) / (
* (total_size + current_size - 1) total_size + current_size - 1
) * ( ) + total_size * current_size / (
results[group][metric] - metric_score (total_size + current_size)
) ** 2 * (total_size + current_size - 1)
) * (
results[group][metric] - metric_score
) ** 2
else: else:
results[group][metric] = metric_score results[group][metric] = metric_score
results[group][stderr] = var_score results[group][stderr] = var_score
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment