"git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "0c006b8809cd99e1f95926401a2823dd952641c8"
Commit 7d16a7cd authored by lintangsutawika

merged updated big-refactor

parents 691b4ca9 a005aeba
@@ -20,7 +20,7 @@ This project provides a unified framework to test generative language models on

Features:

-- Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md).
+- Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
 - Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
 - Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
 - Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
...
@@ -659,14 +659,14 @@ class ConfigurableTask(Task):
                self.multiple_target = len(test_target)
            else:
                if (type(test_target) is int) and (test_choice is not None):
-                    test_target = [test_choice[test_target]]
+                    test_target = test_choice[test_target]
                else:
-                    test_target = [test_target]
+                    test_target = str(test_target)

            if test_choice is not None:
                check_choices = test_choice
            else:
-                check_choices = test_target
+                check_choices = [test_target]

            for choice in check_choices:
                choice_has_whitespace = True if " " in choice else False
...
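The hunk above changes how a gold target is normalized before the whitespace check: an integer label is resolved to its bare choice string (no longer wrapped in a one-element list), any other target is coerced to `str`, and the check now iterates over the choices when they exist, falling back to a singleton list otherwise. A minimal standalone sketch of that patched logic (the `normalize_target` helper name is illustrative, not part of the codebase):

```python
def normalize_target(test_target, test_choice):
    """Resolve an int label to its choice string; coerce anything else to str."""
    if isinstance(test_target, int) and test_choice is not None:
        # after the patch: a bare string, not a one-element list
        test_target = test_choice[test_target]
    else:
        test_target = str(test_target)
    # the whitespace check runs over the choices when available,
    # otherwise over the single normalized target
    check_choices = test_choice if test_choice is not None else [test_target]
    return test_target, check_choices

target, choices = normalize_target(1, ["yes", "no"])
# target == "no", choices == ["yes", "no"]
```

Coercing to `str` matters because the subsequent `" " in choice` test would raise `TypeError` on an integer target.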
@@ -219,7 +219,6 @@ def evaluate(
    padding_requests = collections.defaultdict(int)

    # Stores group related keys and values for group-aggregation
-    aggregate = collections.defaultdict(dict)
    task_groups = collections.defaultdict(dict)

    # get lists of each type of request
@@ -228,6 +227,7 @@ def evaluate(
        if type(task) == tuple:
            group, task = task
            task_groups[task_name] = group
+            aggregate[task_name] = {}

        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())
@@ -407,12 +407,12 @@ def evaluate(
            # | word_perplexity
            # | byte_perplexity
            # | bits_per_byte
-            if bool(task_groups):
+            if task_name in task_groups:
                group_name = task_groups[task_name]
-                if metric not in aggregate[group_name]:
-                    aggregate[group_name][metric] = [task_score]
-                else:
+                if metric in list(aggregate[group_name].keys()):
                    aggregate[group_name][metric].append(task_score)
+                else:
+                    aggregate[group_name][metric] = [task_score]

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this
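The guard change in this hunk is the substantive fix: `bool(task_groups)` is true whenever *any* task belongs to a group, so ungrouped tasks would hit `task_groups[task_name]` spuriously; `task_name in task_groups` aggregates only tasks that actually have a group. A standalone sketch of the resulting pattern, with illustrative task and group names:

```python
import collections

# illustrative mapping: task name -> group name
task_groups = {"arc_easy": "arc", "arc_challenge": "arc"}
aggregate = collections.defaultdict(dict)  # group -> metric -> list of scores

def record(task_name, metric, task_score):
    """Append a task's score to its group's list, as in the patched hunk."""
    if task_name in task_groups:  # skip tasks that belong to no group
        group_name = task_groups[task_name]
        if metric in aggregate[group_name]:
            aggregate[group_name][metric].append(task_score)
        else:
            aggregate[group_name][metric] = [task_score]

record("arc_easy", "acc", 0.81)
record("arc_challenge", "acc", 0.45)
record("hellaswag", "acc", 0.60)  # ungrouped: not aggregated
# aggregate["arc"]["acc"] == [0.81, 0.45]
```

Note that `list(aggregate[group_name].keys())` in the diff is equivalent to the plain `metric in aggregate[group_name]` membership test used here.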
...