"vscode:/vscode.git/clone" did not exist on "51fc02ae735c6297906814090340034bb1d574da"
Commit 400c0199 authored by lintangsutawika

minor fixes to satisfy pre-commit

parent 2ee7121b
@@ -19,7 +19,7 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields
 - **reference** (`str`, *optional*) —
 - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
 - **dataset_name** (`str`, *optional*, defaults to None) — The name of, what HF calls, a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
-- **dataset_kwargs** (`dict`, *optional*) — Auxillary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv.
+- **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv.
 - **training_split** (`str`, *optional*) — Split in the dataset to use as the training split.
 - **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split.
 - **test_split** (`str`, *optional*) — Split in the dataset to use as the test split.
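Taken together, the dataset fields above map directly onto a `datasets.load_dataset(dataset_path, dataset_name, **dataset_kwargs)` call. As a rough sketch of how they might appear in a task YAML, assuming a hypothetical task name and local JSON files (none of these values come from this commit):

```
task: my_local_task              # hypothetical task name
dataset_path: json               # a Hub dataset id, or a local builder such as json/csv
dataset_name: null               # no sub-task / data instance needed here
dataset_kwargs:
  data_files:
    train: data/train.json       # hypothetical local files
    validation: data/valid.json
training_split: train
validation_split: validation
test_split: null
```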
@@ -169,7 +169,7 @@ You can find an example of how to use this feature at [gsm8k-cot-self-consistenc
 ## Passing Arguments to Metrics
-Metrics can be defined in the `metric_list` argument when building the YAML config. Multiple metrics can be listed along with any auxillary arguments. For example, setting the [`exact_match` metric](https://github.com/huggingface/evaluate/tree/main/metrics/exact_match), auxiliary arguments such as `ignore_case`, `ignore_punctuation`, `regexes_to_ignore` can be listed as well. They will be added to the metric function as `kwargs`. Some metrics have predefined values for `aggregation` and `higher_is_better` so listing the metric name only can be sufficient.
+Metrics can be defined in the `metric_list` argument when building the YAML config. Multiple metrics can be listed along with any auxiliary arguments. For example, setting the [`exact_match` metric](https://github.com/huggingface/evaluate/tree/main/metrics/exact_match), auxiliary arguments such as `ignore_case`, `ignore_punctuation`, `regexes_to_ignore` can be listed as well. They will be added to the metric function as `kwargs`. Some metrics have predefined values for `aggregation` and `higher_is_better` so listing the metric name only can be sufficient.
 ```
 metric_list:
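  # Illustrative sketch only (not part of this commit's hunk): a metric entry
  # with auxiliary kwargs as described in the prose above; the values are hypothetical.
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false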
@@ -225,4 +225,3 @@ Generative tasks:
 Tasks using complex filtering:
 - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)
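The GSM8k CoT configs referenced above are the in-tree examples of complex filtering. As a hedged sketch of the general shape such a filter configuration can take (the `filter_list` key, pipeline name, `regex` function, and pattern here are assumptions for illustration, not copied from those files; only `take_first` appears elsewhere in this commit):

```
filter_list:
  - name: strict-match             # hypothetical pipeline name
    filter:
      - function: regex            # assumed: extract the final numeric answer
        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
      - function: take_first       # keep the first extracted answer per document
```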
@@ -250,4 +250,3 @@ It is recommended to include a filled-out copy of this checklist in the README.m
 ## Submitting your task
 You're all set! Now push your work and make a pull request to the `big-refactor` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
@@ -29,7 +29,9 @@ def get_model(model_name):
     try:
         return MODEL_REGISTRY[model_name]
     except KeyError:
-        raise ValueError(f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}")
+        raise ValueError(
+            f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}"
+        )


 TASK_REGISTRY = {}
@@ -75,10 +77,7 @@ DEFAULT_METRIC_REGISTRY = {
         "acc",
     ],
     "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
-    "multiple_choice": [
-        "acc",
-        "acc_norm"
-    ],
+    "multiple_choice": ["acc", "acc_norm"],
     "greedy_until": ["exact_match"],
 }
@@ -136,7 +135,6 @@ searching in HF Evaluate library..."
 def register_aggregation(name):
     def decorate(fn):
         assert (
             name not in AGGREGATION_REGISTRY
...
@@ -98,7 +98,9 @@ class TaskConfig(dict):
             self.gold_alias = self.template_aliases + self.doc_to_target

         if self.generation_kwargs or self.output_type == "greedy_until":
-            assert self.output_type == "greedy_until", "passed `generation_kwargs`, but not using a generation request type!"
+            assert (
+                self.output_type == "greedy_until"
+            ), "passed `generation_kwargs`, but not using a generation request type!"
             # ensure that we greedily generate in absence of explicit arguments otherwise
             self.generation_kwargs = {"do_sample": False, "temperature": 0.0}
@@ -546,7 +548,7 @@ class ConfigurableTask(Task):
             }
             try:
                 self._metric_fn_list[metric_name] = METRIC_REGISTRY[metric_name]
-            except:
+            except Exception:
                 eval_logger.warning(
                     f"Metric {metric_name} not found, "
                     "Searching from https://huggingface.co/evaluate-metric"
@@ -606,9 +608,7 @@ class ConfigurableTask(Task):
                 filter_pipeline = build_filter_ensemble(filter_name, components)
                 self._filters.append(filter_pipeline)
         else:
-            self._filters = [
-                build_filter_ensemble("none", [["take_first", None]])
-            ]
+            self._filters = [build_filter_ensemble("none", [["take_first", None]])]

         if self._config.use_prompt is not None:
             eval_logger.info(f"loading prompt {self._config.use_prompt}")
...
@@ -150,7 +150,9 @@ def evaluate(
     # get lists of each type of request
     for task_name, task in task_dict.items():
         versions[task_name] = task.VERSION
-        configs[task_name] = dict(task.dump_config())  # TODO: don't access a private attribute here ; for non-YAML tasks handle this case
+        configs[task_name] = dict(
+            task.dump_config()
+        )  # TODO: don't access a private attribute here ; for non-YAML tasks handle this case

         # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
         # task_docs = list(task_doc_func())
@@ -290,7 +292,11 @@ def evaluate(
                 if stderr is not None:
                     results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        return {"results": dict(results), "configs": dict(configs), "versions": dict(versions)}
+        return {
+            "results": dict(results),
+            "configs": dict(configs),
+            "versions": dict(versions),
+        }
     else:
         return None
@@ -63,7 +63,7 @@ def get_task(task_name, config):
         return TASK_REGISTRY[task_name](config=config)
     except KeyError:
         eval_logger.info("Available tasks:")
-        eval_logger.info(ALL_TASKS)
+        eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
         raise KeyError(f"Missing task {task_name}")
...
 include: pile_arxiv.yaml
 task: pile_pubmed-abstracts
 dataset_name: pile_pubmed-abstracts

 include: pile_arxiv.yaml
 task: pile_pubmed-central
 dataset_name: pile_pubmed-central

 include: pile_arxiv.yaml
 task: pile_stackexchange
 dataset_name: pile_stackexchange

 include: pile_arxiv.yaml
 task: pile_ubuntu-irc
 dataset_name: pile_ubuntu-irc

 include: pile_arxiv.yaml
 task: pile_uspto
 dataset_name: pile_uspto

 include: pile_arxiv.yaml
 task: pile_wikipedia
 dataset_name: pile_wikipedia