gaoqiong / lm-evaluation-harness

Commit 32a70d89, authored Jul 17, 2023 by lintangsutawika

    aggregate is shown in the table

Parent: 1d995b6d
Showing 1 changed file with 11 additions and 1 deletion.
lm_eval/evaluator.py (+11, -1) @ 32a70d89
@@ -364,6 +364,15 @@ def evaluate(
                task_score = task.aggregation()[metric](items)
                results[task_name][metric + "," + key] = task_score
                # if task_name not in benchmark_agg:
                #     benchmark[] = [task_score]
                # Need to put back in results
                # pythia | acc
                #        | perplexity
                #        | word_perplexity
                #        | byte_perplexity
                #        | bits_per_byte
                if metric not in aggregate:
                    aggregate[metric] = [task_score]
                else:
                    ...
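The effect of this hunk: each task's per-metric score is written both to the task's own row in results and into a shared aggregate dict keyed by metric name, so scores can later be averaged across tasks. A minimal sketch of that accumulation pattern, with hypothetical task names and scores (the values below are illustrative, not from the commit):

from typing import Dict, List, Tuple

# Hypothetical (task, metric) -> score pairs; in evaluator.py these come
# from task.aggregation()[metric](items).
task_scores: Dict[Tuple[str, str], float] = {
    ("lambada", "acc"): 0.61,
    ("piqa", "acc"): 0.74,
    ("piqa", "acc_norm"): 0.76,
}

aggregate: Dict[str, List[float]] = {}  # metric -> one score per task
for (task_name, metric), score in task_scores.items():
    # Same branching as the commit: start a new list on first sight,
    # then append.
    if metric not in aggregate:
        aggregate[metric] = [score]
    else:
        aggregate[metric].append(score)

print(aggregate)  # {'acc': [0.61, 0.74], 'acc_norm': [0.76]}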
@@ -383,7 +392,8 @@ def evaluate(
                results[task_name][metric + "_stderr" + "," + key] = stderr(items)

    for metric in aggregate.keys():
        aggregate[metric] = np.average(aggregate[metric])
        results["Aggregate"][metric] = np.average(aggregate[metric])
    versions["Aggregate"] = "N/A"

    results_dict = {
        "results": dict(results),
        ...
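The second hunk then collapses each metric's score list into an unweighted mean via np.average and surfaces it as a synthetic "Aggregate" row in the results table, with version "N/A" since the row belongs to no single task. Continuing the sketch above, under the same hypothetical numbers:

import numpy as np

aggregate = {"acc": [0.61, 0.74], "acc_norm": [0.76]}
results = {"Aggregate": {}}
versions = {}

for metric in aggregate.keys():
    # Unweighted mean across tasks; np.average without weights == np.mean.
    results["Aggregate"][metric] = np.average(aggregate[metric])
versions["Aggregate"] = "N/A"

print(results["Aggregate"])  # acc -> 0.675, acc_norm -> 0.76

Note that the average is taken per metric name, so accuracy scores are never mixed with, say, perplexities; the commented "pythia" table in the first hunk sketches exactly that per-metric layout.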