pre-commit

b8122d98 · lintangsutawika · 01b129bb · b8122d98 · b8122d98 · b8122d98
Commit b8122d98 authored Aug 05, 2024 by lintangsutawika
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 31 additions and 20 deletions

lm_eval/api/samplers.py +22 -11

lm_eval/evaluator.py +6 -6

lm_eval/evaluator_utils.py +3 -3

No files found.
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
-import datasets
 from functools import partial
+import datasets
 class ContextSampler:
    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
        self.rnd = rnd
@@ -15,27 +17,36 @@ class ContextSampler:
        self.target_delimiter = self.config.target_delimiter
        self.fewshot_delimiter = self.config.fewshot_delimiter
-        if self.config.fewshot_config is not None and self.config.fewshot_config.get("doc_to_text", None) is not None:
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_text", None) is not None
+        ):
            self.doc_to_text = partial(
                self.task.doc_to_text,
-                doc_to_text=self.config.fewshot_config.get("doc_to_text", None)
+                doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
-                )
+            )
        else:
            self.doc_to_text = self.task.doc_to_text
-        if self.config.fewshot_config is not None and self.config.fewshot_config.get("doc_to_target", None) is not None:
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_target", None) is not None
+        ):
            self.doc_to_target = partial(
                self.task.doc_to_target,
-                doc_to_target=self.config.fewshot_config.get("doc_to_target", None)
+                doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
-                )
+            )
        else:
            self.doc_to_target = self.task.doc_to_target
-        if self.config.fewshot_config is not None and self.config.fewshot_config.get("doc_to_choice", None) is not None:
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_choice", None) is not None
+        ):
            self.doc_to_choice = partial(
                self.task.doc_to_choice,
-                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None)
+                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
-                )
+            )
        else:
            self.doc_to_choice = self.task.doc_to_choice
@@ -72,7 +83,7 @@ class ContextSampler:
                else self.doc_to_choice(doc)[doc_content]
            )
            labeled_examples += self.target_delimiter
-            if doc_target is not "":
+            if doc_target != "":
                labeled_examples += (
                    str(doc_target[0])
                    if isinstance(doc_target, list)

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -608,16 +608,16 @@ def evaluate(
                    ]
                    # compute group's pooled metric and stderr
-                    results[group][
+                    results[group][metric] = (
-                        metric
+                        lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
-                    ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
+                    )
                    # TODO: calculate grouped metric using aggregation fn
                    if "N/A" in stderrs:
                        results[group][stderr] = "N/A"
                    else:
-                        results[group][
+                        results[group][stderr] = (
-                            stderr
+                            lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
-                        ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
+                        )
                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -275,9 +275,9 @@ def consolidate_results(
                metric_key
            ]
            results[task_output.task_name]["samples"] = task_output.sample_len
-            results[task_output.task_name][
+            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
-                f"{metric}_stderr,{filter_key}"
+                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
-            ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
+            )
    return results, samples, configs, versions, num_fewshot, higher_is_better