pre-commit stuff

c4c20ff5 · lintangsutawika · e56b950a · c4c20ff5 · e56b950a · c4c20ff5
Commit c4c20ff5 authored May 19, 2023 by lintangsutawika
20 changed files
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ python main.py \
    --device cuda:0
 ```
-Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partialy trained checkpoints:
+Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints:
 ```bash
 python main.py \
@@ -64,8 +64,8 @@ To use with [PEFT](https://github.com/huggingface/peft), take the call you would
 python main.py \
    --model hf-causal \
    --model_args pretrained=EleutherAI/gpt-j-6b,peft=nomic-ai/gpt4all-j-lora \
-    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \ 
+    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
-    --device cuda:0 
+    --device cuda:0
 ```
 Our library also supports the OpenAI API:
@@ -78,7 +78,7 @@ python main.py \
    --tasks lambada_openai,hellaswag
 ```
-While this functionality is only officially mantained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
+While this functionality is only officially maintained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](https://goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
 To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
@@ -116,7 +116,7 @@ When reporting eval harness results, please also report the version of each task
 ## Test Set Decontamination
-To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points nto found in the model trainign set. Unfortunately, outside of models trained on the Pile ans C4, its very rare that people who train models disclose the contents of the training data. However this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
+To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points not found in the model training set. Unfortunately, outside of models trained on the Pile and C4, it's very rare that people who train models disclose the contents of the training data. However, this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
 For details on text decontamination, see the [decontamination guide](./docs/decontamination.md).

--- a/examples/configurable_task/sglue_cb.yaml
+++ b/examples/configurable_task/sglue_cb.yaml
-dataset_path: super_glue
-dataset_name: cb
-training_split: train
-validation_split: validation
-template_aliases: "{% set hypo = hypothesis %}"
-doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypo}}\"? Yes, no, or maybe?"
-doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
-metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
\ No newline at end of file
--- a/lm_eval/api/__init__.py
+++ b/lm_eval/api/__init__.py
 from . import metrics
\ No newline at end of file
--- a/lm_eval/api/filter.py
+++ b/lm_eval/api/filter.py
@@ -3,9 +3,10 @@ from typing import List
 from lm_eval.api.instance import Instance
 class Filter:
    """
-    Filter classes operate on a per-task level. 
+    Filter classes operate on a per-task level.
    They take all model outputs (`instance.resps` for all `task.instances`)
    across all instances of a task, and perform operations.
    In a single run, one can configure any number of separate filters or lists of filters.
@@ -25,30 +26,33 @@ class Filter:
        [<filtered resps for instance 0>, <filtered resps for instance 1>]
        """
        return resps
 @dataclass
 class FilterEnsemble:
    """
    FilterEnsemble creates a pipeline applying multiple filters.
-    Its intended usage is to stack multiple post-processing steps in order. 
+    Its intended usage is to stack multiple post-processing steps in order.
-    `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each 
+    `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
    pipeline separately.
    """
-    name: str 
+    name: str
    filters: List[Filter]
    def apply(self, instances: List[Instance]):
-        resps = [inst.resps for inst in instances] # operate just on the model responses
+        resps = [
+            inst.resps for inst in instances
+        ]  # operate just on the model responses
        for f in self.filters:
            # apply filters in sequence
            out = f.apply(resps)
-            resps = out # TODO: handle the case where a filter returns multiple "buckets"
+            resps = (
+                out  # TODO: handle the case where a filter returns multiple "buckets"
+            )
        # add the end results after filtering to filtered_requests of their respective source instances.
        # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
        for inst, resp in zip(instances, resps):
            inst.filtered_resps[self.name] = resp
--- a/lm_eval/api/instance.py
+++ b/lm_eval/api/instance.py
 from dataclasses import dataclass, field
 from typing import Literal, Tuple
 @dataclass
 class Instance:
-    request_type: str = Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    request_type: str = Literal[
+        "loglikelihood", "loglikelihood_rolling", "greedy_until"
+    ]
    doc: dict = None
    arguments: tuple = None
    idx: int = None
-    metadata: tuple = Tuple[str, int, int] # TODO: better typehints here
+    metadata: tuple = Tuple[str, int, int]  # TODO: better typehints here
    resps: list = field(default_factory=list)
    filtered_resps: dict = field(default_factory=dict)
@@ -19,10 +22,12 @@ class Instance:
    def __post_init__(self):
        # unpack metadata field
        self.task_name, self.doc_id, self.repeats = self.metadata
    @property
    def args(self):
        """
        Returns (string,) where `string` is the string to calculate loglikelihood over
        """
-        return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+        return (
+            self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+        )
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -26,7 +26,6 @@ HIGHER_IS_BETTER_REGISTRY = {
    "bleu": True,
    "chrf": True,
    "ter": False,
    "acc": True,
    "acc_norm": True,
    "acc_mutual_info": True,
@@ -35,6 +34,7 @@ HIGHER_IS_BETTER_REGISTRY = {
    "bits_per_byte": False,
 }
 def register_metric(name):
    # TODO: do we want to enforce a certain interface to registered metrics?
    def decorate(fn):
@@ -44,7 +44,7 @@ def register_metric(name):
        METRIC_REGISTRY[name] = fn
        return fn
    return decorate
@@ -54,12 +54,14 @@ def get_metric(name):
        return METRIC_REGISTRY[name]
    except KeyError:
        # TODO: change this print to logging?
-        print(f"Could not find registered metric '{name}' in lm-eval, \
+        print(
-searching in HF Evaluate library...")
+            f"Could not find registered metric '{name}' in lm-eval, \
+searching in HF Evaluate library..."
+        )
        try:
            metric_object = evaluate.load(name)
            return metric_object.compute
-        except:
+        except Exception:
            raise Warning(
                "{} not found in the evaluate library!".format(name),
                "Please check https://huggingface.co/evaluate-metric",
@@ -75,7 +77,7 @@ def register_aggregation(name):
        AGGREGATION_REGISTRY[name] = fn
        return fn
    return decorate

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -6,14 +6,15 @@ from lm_eval import utils
 MODEL_REGISTRY = {}
 def register_model(*names):
    # either pass a list or a single alias.
    # function receives them as a tuple of strings
    def decorate(cls):
-        for name in names: 
+        for name in names:
-            assert (
+            assert issubclass(
-                issubclass(cls, LM)
+                cls, LM
            ), f"Model '{name}' ({cls.__name__}) must extend LM class"
            assert (
@@ -22,7 +23,7 @@ def register_model(*names):
            MODEL_REGISTRY[name] = cls
        return cls
    return decorate

--- a/lm_eval/api/register.py
+++ b/lm_eval/api/register.py
@@ -5,6 +5,7 @@ group_registry = {}
 task2func_index = {}
 func2task_index = {}
 def register_task(name):
    def wrapper(func):
@@ -15,16 +16,16 @@ def register_task(name):
    return wrapper
 def register_group(name):
    def wrapper(func):
        func_name = func2task_index[func.__name__]
        if name in group_registry:
-            group_registry[name].append(
+            group_registry[name].append(func_name)
-                func_name
-                )
        else:
            group_registry[name] = [func_name]
        return func
    return wrapper
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
 class Sampler:
    def __init__(self, docs, task, fewshot_indices=None, rnd=None):
        self.rnd = rnd
@@ -12,15 +9,18 @@ class Sampler:
        self.delimiter = self.config.delimiter
-        self.docs = docs # HF dataset split, provided by task._fewshot_docs()
+        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
-        if fewshot_indices: # subset few-shot docs from 
+        if fewshot_indices:  # subset few-shot docs from
            self.docs = self.docs.select(fewshot_indices)
    def get_context(self, doc, num_fewshot):
-        # draw an extra fewshot sample if using same split as evaluting on
+        # draw an extra fewshot sample if using same split as evaluating on
-        n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot 
+        n_samples = (
+            num_fewshot + 1
+            if self.config.fewshot_split == self.config.test_split
+            else num_fewshot
+        )
        # draw `n_samples` docs from fewshot_docs
        fewshotex = self.sample(n_samples)
@@ -28,16 +28,16 @@ class Sampler:
        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
        labeled_examples = (
-                self.delimiter.join(
+            self.delimiter.join(
-                    [
+                [
-                        self.task.doc_to_text(doc) + self.task.doc_to_target(doc)
+                    self.task.doc_to_text(doc) + self.task.doc_to_target(doc)
-                        for doc in selected_docs
+                    for doc in selected_docs
-                    ]
+                ]
-                )
-                + self.delimiter
            )
+            + self.delimiter
+        )
        # only returns the fewshot context! Does not append the document, do this outside the object
        return labeled_examples
@@ -51,25 +51,22 @@ class Sampler:
 class BalancedSampler(Sampler):
    def sample(self, n):
        """
-        TODO: this should return approximately class-balanced samples from our fewshot examples. 
+        TODO: this should return approximately class-balanced samples from our fewshot examples.
        TODO: what order should they be in? maybe random?
        """
        pass
-class ManualSampler(Sampler):
+class ManualSampler(Sampler):
    def sample(self, n):
-        """
+        """ """
+        pass
-        """
-        pass 
-# TODO: how should we do design here? might be better to have a single sampler and pass more kwargs at init. 
+# TODO: how should we design this? It might be better to have a single sampler and pass more kwargs at init.
 # Depends what's easier for new user to add own functionality on top of
 # types of sampler:

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -10,7 +10,12 @@ import lm_eval.api.metrics
 import lm_eval.tasks
 import lm_eval.models
-from lm_eval.utils import positional_deprecated, run_task_tests, make_table, get_git_commit_hash
+from lm_eval.utils import (
+    positional_deprecated,
+    run_task_tests,
+    make_table,
+    get_git_commit_hash,
+)
 from lm_eval.logger import eval_logger
@@ -127,20 +132,20 @@ def evaluate(
        Dictionary of results
    """
-    decontaminate = decontamination_ngrams_path is not None
+    # decontaminate = decontamination_ngrams_path is not None
    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)
    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
+    # requests_origin = collections.defaultdict(list)
-    docs = {}
+    # docs = {}
    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION
        # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
        # task_docs = list(task_doc_func())
        # rnd = random.Random()
@@ -150,9 +155,13 @@ def evaluate(
        # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
        task.build_all_requests(limit=limit)
        # aggregate Instances by LM method requested to get output.
-        reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE #TODO: this is hacky, fix in task.py
+        reqtype = (
-        requests[reqtype].extend(task.instances) 
+            "loglikelihood"
+            if task.OUTPUT_TYPE == "multiple_choice"
+            else task.OUTPUT_TYPE
+        )  # TODO: this is hacky, fix in task.py
+        requests[reqtype].extend(task.instances)
    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
@@ -161,7 +170,7 @@ def evaluate(
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)
        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)
@@ -175,7 +184,7 @@ def evaluate(
        task.apply_filters()
    ### Collect values of metrics on all datapoints ###
-    # TODO: make metric configurable, add metric registry 
+    # TODO: make metric configurable, add metric registry
    vals = collections.defaultdict(list)
    # unpack results and sort back in order and return control to Task
@@ -183,11 +192,17 @@ def evaluate(
        # calculate values for each filter setup (TODO: make getting list of keys cleaner)
        # TODO: make it possible to use a different metric per key
        for key in task.instances[0].filtered_resps.keys():
-            for doc_id, doc in enumerate(itertools.islice(task.test_docs(), 0, limit) if task.has_test_docs() else task.validation_docs()):
+            for doc_id, doc in enumerate(
+                itertools.islice(task.test_docs(), 0, limit)
+                if task.has_test_docs()
+                else task.validation_docs()
+            ):
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
-                metrics = task.process_results(doc, [req.filtered_resps[key] for req in requests])
+                metrics = task.process_results(
+                    doc, [req.filtered_resps[key] for req in requests]
+                )
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
@@ -195,7 +210,9 @@ def evaluate(
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
-        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)
+        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](
+            items
+        )
        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this

--- a/lm_eval/filters/__init__.py
+++ b/lm_eval/filters/__init__.py
@@ -6,10 +6,10 @@ from . import extraction
 FILTER_REGISTRY = {
    "take_first": selection.TakeFirstFilter,
    "regex": extraction.RegexFilter,
-    # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function 
+    # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
-    # that takes an input and returns a scalar and then should select the max reward, 
+    # that takes an input and returns a scalar and then should select the max reward,
    # or should implement different filters for different ways of handling a reward model's inference.
-    #"arg_max": selection.ArgMaxFilter, 
+    # "arg_max": selection.ArgMaxFilter,
 }
@@ -24,11 +24,11 @@ def build_filter_ensemble(filter_name, components):
    filters = []
    for (function, kwargs) in components:
-        if kwargs == None:
+        if kwargs is None:
            f = get_filter(function)()
        else:
            # create a filter given its name in the registry
-            f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly
+            f = get_filter(function)(**kwargs)  # TODO: pass kwargs to filters properly
        # add the filter as a pipeline step
        filters.append(f)

--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
@@ -4,7 +4,7 @@ from lm_eval.api.filter import Filter
 class DecontaminationFilter(Filter):
    """
-    A filter which evaluates 
+    A filter which evaluates
    """
    name = "track_decontamination"
@@ -12,7 +12,7 @@ class DecontaminationFilter(Filter):
    def __init__(self, path):
        """
-        TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 
+        TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
        should further cache result on a given (task_name, doc_id)
        """
        self._decontam_results = None
@@ -21,4 +21,4 @@ class DecontaminationFilter(Filter):
        """
        Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
        """
        pass
\ No newline at end of file
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -4,10 +4,7 @@ from lm_eval.api.filter import Filter
 class RegexFilter(Filter):
-    """
+    """ """
-    """
    def __init__(self, regex_pattern=r"#### (\-?[0-9\.\,]+)", fallback="[invalid]"):
        """
@@ -20,7 +17,7 @@ class RegexFilter(Filter):
    def apply(self, resps):
        # here, we assume we have a list, in which each element is
-        # a list of model responses for some particular input/target pair. 
+        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        def filter_set(inst):

--- a/lm_eval/filters/selection.py
+++ b/lm_eval/filters/selection.py
 from lm_eval.api.filter import Filter
-class TakeFirstFilter:
+class TakeFirstFilter:
    def __init__(self):
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
@@ -11,4 +11,4 @@ class TakeFirstFilter:
        """
        Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
        """
        return map(lambda r: r[0], resps)
\ No newline at end of file
--- a/lm_eval/logger.py
+++ b/lm_eval/logger.py
 import logging
 logging.basicConfig(
-    format='%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
+    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt='%Y-%m-%d:%H:%M:%S',
+    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO
+    level=logging.INFO,
-    )
+)
 eval_logger = logging.getLogger("lm-eval")
\ No newline at end of file
--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -111,7 +111,11 @@ class HFLM(LM):
    def _model_generate(self, context, max_length, eos_token_id):
        return self.gpt2.generate(
-            context, max_length=max_length, pad_token_id=eos_token_id, eos_token_id=eos_token_id, do_sample=False
+            context,
+            max_length=max_length,
+            pad_token_id=eos_token_id,
+            eos_token_id=eos_token_id,
+            do_sample=False,
        )
    def loglikelihood(self, requests):

--- a/lm_eval/prompts/__init__.py
+++ b/lm_eval/prompts/__init__.py
@@ -2,46 +2,44 @@ from lm_eval.logger import eval_logger
 from promptsource.templates import DatasetTemplates
 # TODO: decide whether we want jinja2 or f-string prompts. would it be cursed to support both?
-# Prompt library. 
+# Prompt library.
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.
 # This allows us to access prompts
 PROMPT_REGISTRY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
-        "q-newline-a": "Q: {{question}}\nA:"
+        "q-newline-a": "Q: {{question}}\nA:",
    },
 }
 def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
-    # unpack prompt name 
+    # unpack prompt name
    category_name, prompt_name = prompt_id.split(":")
-    eval_logger.info(
+    eval_logger.info(f"Loading prompt from {category_name}")
-        f"Loading prompt from {category_name}"
-        )
    if category_name == "promptsource":
        try:
            # prompts = DatasetTemplates(dataset_name, dataset_path)
-            if subset_name == None:
+            if subset_name is None:
                prompts = DatasetTemplates(dataset_name=dataset_name)
            else:
-                prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)
+                prompts = DatasetTemplates(
-        except:
+                    dataset_name=dataset_name, subset_name=subset_name
-            raise ValueError(
-                f"{dataset_name} and {subset_name} not found"
                )
+        except Exception:
+            raise ValueError(f"{dataset_name} and {subset_name} not found")
        if prompt_name in prompts.all_template_names:
            return prompts[prompt_name]
        else:
            raise ValueError(
                f"{prompt_name} not in prompt list {prompts.all_template_names}"
-                )
+            )
    else:
        try:
            return PROMPT_REGISTRY[category_name][prompt_name]
-        except:
+        except Exception:
            raise ValueError(
                f"expected only a single `:` as separator between \
                prompt category and name, but got `{prompt_id}` instead"
-                )
+            )
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -10,8 +10,8 @@ from lm_eval.api.register import (
    register_task,
    register_group,
    task_registry,
-    group_registry
+    group_registry,
-    )
+)
 def get_task_name_from_config(task_config):
@@ -28,20 +28,19 @@ for root, subdirs, file_list in os.walk(task_dir):
                    config = utils.load_yaml_config(yaml_path)
                    SubClass = type(
-                        config['task']+'ConfigurableTask',
+                        config["task"] + "ConfigurableTask",
                        (ConfigurableTask,),
-                        {'CONFIG': TaskConfig(**config)}
+                        {"CONFIG": TaskConfig(**config)},
                    )
-                    if 'task' in config:
+                    if "task" in config:
                        task_name = "{}:{}".format(
-                            get_task_name_from_config(config),
+                            get_task_name_from_config(config), config["task"]
-                            config['task']
+                        )
-                            )
                        register_task(task_name)(SubClass)
-                    if 'group' in config:
+                    if "group" in config:
-                        for group in config['group']:
+                        for group in config["group"]:
                            register_group(group)(SubClass)
                except Exception as err:
                    print(f"Unexpected {err=}, {type(err)=}")
@@ -50,6 +49,7 @@ TASK_REGISTRY = task_registry
 GROUP_REGISTRY = group_registry
 ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
 def get_task(task_name, config):
    try:
        return TASK_REGISTRY[task_name](config=config)
@@ -90,19 +90,15 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
                    if task_name not in task_name_from_registry_dict:
                        task_name_from_registry_dict = {
                            **task_name_from_registry_dict,
-                            task_name: get_task(
+                            task_name: get_task(task_name=task_name, config=config),
-                                task_name=task_name, config=config
+                        }
-                                )
-                            }
            else:
                task_name = task_element
                if task_name not in task_name_from_registry_dict:
                    task_name_from_registry_dict = {
                        **task_name_from_registry_dict,
-                        task_name: get_task(
+                        task_name: get_task(task_name=task_element, config=config),
-                            task_name=task_element, config=config
+                    }
-                            )
-                        }
        elif isinstance(task_element, dict):
            task_element.update(config)
@@ -110,22 +106,22 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
                **task_name_from_config_dict,
                get_task_name_from_config(task_element): ConfigurableTask(
                    config=task_element
-                )
+                ),
            }
        elif isinstance(task_element, Task):
            task_name_from_object_dict = {
                **task_name_from_object_dict,
-                get_task_name_from_object(task_element): task_element
+                get_task_name_from_object(task_element): task_element,
            }
    # task_name_from_registry_dict = {
    #     task_name: get_task(
    #         task_name=task_name,
    #         task_config=config
    #     )
-    #     for group_name in task_name_list for task_name in GROUP_REGISTRY[group_name] 
+    #     for group_name in task_name_list for task_name in GROUP_REGISTRY[group_name]
    #     if (isinstance(group_name, str)) and (group_name in GROUP_REGISTRY)
    # }
    # task_name_from_config_dict = {
@@ -142,11 +138,11 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
    #     if isinstance(task_object, Task)
    # }
-    assert set(task_name_from_registry_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
+    assert set(task_name_from_registry_dict.keys()).isdisjoint(
+        set(task_name_from_object_dict.keys())
+    )
    return {
        **task_name_from_registry_dict,
        **task_name_from_config_dict,
        **task_name_from_object_dict,
    }
--- a/lm_eval/tasks/arc.py
+++ b/lm_eval/tasks/arc.py
@@ -12,6 +12,7 @@ a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questi
 Homepage: https://allenai.org/data/arc
 """
+from lm_eval import utils
 from lm_eval.prompts import get_prompt
 from lm_eval.api.task import MultipleChoiceTask
@@ -27,6 +28,7 @@ _CITATION = """
 }
 """
 @register_group("arc")
 @register_task("arc_easy")
 class ARCEasy(MultipleChoiceTask):