add weight_by_size config

e2bfdf3b · lintangsutawika · 45a8f709 · e2bfdf3b · e2bfdf3b
Commit e2bfdf3b authored Jan 23, 2024 by lintangsutawika
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 7 deletions

lm_eval/api/task.py lm_eval/api/task.py +1 -0

lm_eval/evaluator.py lm_eval/evaluator.py +6 -7

No files found.
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -80,6 +80,7 @@ class TaskConfig(dict):
    filter_list: Union[str, list] = None
    should_decontaminate: bool = False
    doc_to_decontamination_query: str = None
+    weight_by_size: bool = False
    metadata: Union[
        str, list

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -123,7 +123,7 @@ def simple_evaluate(
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
-            group, task_obj = task_obj
+            _, task_obj = task_obj
            if task_obj is None:
                continue
@@ -484,12 +484,11 @@ def evaluate(
                        if "alias" in metrics:
                            metrics.pop("alias")
-                        current_size = metrics.pop("samples")
-                        # TODO: There should be a way for users
-                        #       to toggle between weighted and
-                        #       unweighted averaging
-                        # For unweighted averaging, use:
-                        #     current_size = 1
+                        if configs[task]["weight_by_size"]:
+                            current_size = metrics.pop("samples")
+                        else:
+                            metrics.pop("samples")
+                            current_size = 1
                        all_stderr = []
                        for metric in [