gaoqiong / lm-evaluation-harness

Commit 1f351067, authored Sep 18, 2023 by lintangsutawika
Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into qasper
Parents: 50f4428b, 33d52483
Showing 19 changed files with 322 additions and 161 deletions (+322 −161)
.github/workflows/new_tasks.yml      +2   −2
lm_eval/api/model.py                 +23  −3
lm_eval/api/task.py                  +21  −14
lm_eval/benchmarks/__init__.py       +1   −1
lm_eval/benchmarks/pythia.yaml       +4   −4
lm_eval/evaluator.py                 +106 −37
lm_eval/models/huggingface.py        +6   −3
lm_eval/prompts/__init__.py          +11  −2
lm_eval/tasks/__init__.py            +20  −9
lm_eval/tasks/nq_open/README.md      +0   −0   (new file)
lm_eval/tasks/nq_open/nq_open.yaml   +30  −0   (new file)
lm_eval/tasks/translation/utils.py   +1   −1
lm_eval/utils.py                     +5   −6
main.py                              +2   −2
pyproject.toml                       +81  −0
scripts/write_out.py                 +3   −1
setup.py                             +2   −74
tests/test_evaluator.py              +2   −1
tests/utils.py                       +2   −1
.github/workflows/new_tasks.yml

@@ -63,10 +63,10 @@ jobs:
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
         # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
lm_eval/api/model.py

 import abc
 import os
-from typing import Union, List, Tuple
+import torch
+from typing import Union, List, Tuple, Optional, Type, TypeVar
 from sqlitedict import SqliteDict
 import json
 import hashlib
@@ -11,6 +12,8 @@ from tqdm import tqdm
 from lm_eval import utils
 from lm_eval.logger import eval_logger

+T = TypeVar("T", bound="LM")
+

 class LM(abc.ABC):
     def __init__(self) -> None:
@@ -111,11 +114,28 @@ class LM(abc.ABC):
         pass

     @classmethod
-    def create_from_arg_string(cls, arg_string, additional_config=None):
+    def create_from_arg_string(
+        cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
+    ) -> T:
+        """
+        Creates an instance of the LM class using the given argument string and additional config.
+
+        Parameters:
+        - arg_string: A string containing arguments in the format key1=value1,key2=value2.
+        - additional_config: Optional dictionary containing additional configuration parameters.
+
+        Returns:
+        - Instance of the LM class.
+        """
         additional_config = {} if additional_config is None else additional_config
         args = utils.simple_parse_args_string(arg_string)
         args2 = {k: v for k, v in additional_config.items() if v is not None}
-        if args2.get("device") == "mps" or args.get("device") == "mps":
+        # TODO: delete once float16 MPS is fixed in torch stable
+        if (
+            args2.get("device") in ("mps", "mps:0")
+            or args.get("device") in ("mps", "mps:0")
+            and "dev" not in torch.__version__
+        ):
             args["dtype"] = "float32"
         return cls(**args, **args2)
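Note on the reworked classmethod: it parses a comma-separated key=value string, layers the extra config on top, and forces float32 when an MPS device is requested on a stable (non-dev) PyTorch build. Below is a minimal standalone sketch of that flow; parse_args and build_kwargs are illustrative stand-ins rather than the harness's own helpers, and torch_version is a plain parameter so the snippet runs without torch installed.

```python
from typing import Optional


def parse_args(arg_string: str) -> dict:
    # stand-in for lm_eval.utils.simple_parse_args_string:
    # "pretrained=gpt2,device=mps" -> {"pretrained": "gpt2", "device": "mps"}
    arg_string = arg_string.strip()
    if not arg_string:
        return {}
    return dict(kv.split("=", 1) for kv in arg_string.split(",") if kv)


def build_kwargs(arg_string: str, additional_config: Optional[dict] = None,
                 torch_version: str = "2.0.1") -> dict:
    # mirrors the merge-and-override logic sketched in the diff above
    additional_config = {} if additional_config is None else additional_config
    args = parse_args(arg_string)
    args2 = {k: v for k, v in additional_config.items() if v is not None}
    # stable (non-"dev") torch builds only support float32 on MPS
    if (
        args2.get("device") in ("mps", "mps:0")
        or args.get("device") in ("mps", "mps:0")
        and "dev" not in torch_version
    ):
        args["dtype"] = "float32"
    return {**args, **args2}


if __name__ == "__main__":
    print(build_kwargs("pretrained=gpt2,device=mps", {"batch_size": 8}))
    # -> {'pretrained': 'gpt2', 'device': 'mps', 'dtype': 'float32', 'batch_size': 8}
```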
lm_eval/api/task.py

@@ -674,11 +674,11 @@ class ConfigurableTask(Task):
                 check_choices = test_choice
             else:
                 check_choices = [test_target]
+            if self.config.doc_to_choice is not None:
                 for choice in check_choices:
-                    choice_has_whitespace = True if " " in choice else False
+                    choice_has_whitespace = True if choice[0].isspace() else False
                     delimiter_has_whitespace = (
-                        True if " " in self.config.target_delimiter else False
+                        True if self.config.target_delimiter[-1].isspace() else False
                     )

                     if delimiter_has_whitespace and choice_has_whitespace:
@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task):
                 # it assumes that doc_to_target returns a number.
                 choices = self.doc_to_choice(doc)
                 gold = choices[gold]
+            # we expect multiple_targets to be a list.
+            elif self.multiple_target:
+                gold = list(gold)
             else:
                 gold = str(gold)
@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task):
             # return true if any are true
             # TODO: this may break for multipLe_target, non zero-or-1 metrics
             scores = []
+            if not isinstance(gold, list):
+                # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
+                # print(gold)
+                gold = [gold]
             for gold_option in gold:
                 try:
                     result_score = self._metric_fn_list[metric](
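The revised whitespace check looks at boundary characters rather than substring membership: the case worth warning about is a target delimiter that already ends in whitespace combined with an answer choice that starts with whitespace, which would render a double space in the prompt. A small illustration of that boundary test (the function name and signature here are illustrative only):

```python
def double_space_risk(target_delimiter: str, choices: list) -> list:
    """Return the choices that would create a 'delimiter + leading space' artifact."""
    delimiter_has_whitespace = target_delimiter[-1].isspace() if target_delimiter else False
    flagged = []
    for choice in choices:
        choice_has_whitespace = choice[0].isspace() if choice else False
        if delimiter_has_whitespace and choice_has_whitespace:
            flagged.append(choice)
    return flagged


print(double_space_risk(" ", [" yes", "no"]))  # [' yes']  -> would render as a double space
print(double_space_risk(":", [" yes", "no"]))  # []        -> delimiter adds no whitespace
```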
lm_eval/benchmarks/__init__.py

@@ -44,7 +44,7 @@ def include_benchmarks(task_dir: str) -> None:
             task_names = utils.pattern_match(task_list, ALL_TASKS)
             for task in task_names:
-                if task in TASK_REGISTRY:
+                if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
                     if group in GROUP_REGISTRY:
                         GROUP_REGISTRY[group].append(task)
                     else:
lm_eval/benchmarks/pythia.yaml
View file @
1f351067
group
:
pythia
group
:
pythia
task
:
task
:
-
lambada_openai
-
lambada_openai
-
wikitext
-
logiqa
-
piqa
-
piqa
-
sciq
-
sciq
-
w
sc
-
w
ikitext
-
winogrande
-
winogrande
-
ar
c
-
ws
c
-
logiqa
-
ai2_arc
-
blimp
-
blimp
-
hendrycksTest*
-
hendrycksTest*
lm_eval/evaluator.py

@@ -120,6 +120,8 @@ def simple_evaluate(
         task_obj = task_dict[task_name]
         if type(task_obj) == tuple:
             group, task_obj = task_obj
+            if task_obj is None:
+                continue

         config = task_obj._config
         if num_fewshot is not None:
@@ -209,23 +211,30 @@ def evaluate(
     samples = collections.defaultdict(list)
     # tracks all Instances/requests a model must generate output on.
     requests = collections.defaultdict(list)
-    # Stores task scores based on task grouping.
-    aggregate = collections.defaultdict(dict)
-    # tracks if a task was chosen via user selecting a group containing it
-    task_groups = collections.defaultdict(dict)
+    # Aggregated task scores presented with groups
+    results_agg = collections.defaultdict(dict)
+    # Aggregated groups scores only
+    groups_agg = collections.defaultdict(dict)
     # stores the amount to pad out reqs per req. type so that
     # number of fwd passes per distributed rank is equal
     padding_requests = collections.defaultdict(int)
+    # store the hierarchy to do proper ordering
+    task_hierarchy = collections.defaultdict(list)
+    # Stores group related keys and values for group-aggregation
+    task_groups = collections.defaultdict(dict)
+    # store the ordering of tasks and groups
+    task_order = collections.defaultdict(int)
+    # store the aggregation for aggregating across tasks in the same group
+    sample_agg_fn = collections.defaultdict(dict)

     # get lists of each type of request
     for task_name, task in task_dict.items():
         if type(task) == tuple:
-            group, task = task
-            task_groups[task_name] = group
-            aggregate[task_name] = {}
+            group_name, task = task
+            task_hierarchy[group_name].append(task_name)
+        else:
+            task_hierarchy[task_name] = []
+
+        if task is None:
+            continue

         versions[task_name] = task.VERSION
         configs[task_name] = dict(task.dump_config())
@@ -301,6 +310,8 @@ def evaluate(
     for task_name, task in task_dict.items():
         if type(task) == tuple:
             group, task = task
+            if task is None:
+                continue

         task.apply_filters()

     ### Collect values of metrics on all datapoints ###
@@ -310,6 +321,8 @@ def evaluate(
     for task_name, task in task_dict.items():
         if type(task) == tuple:
             group, task = task
+            if task is None:
+                continue

         # TODO: make it possible to use a different metric per filter
         # iterate over different filters used
         for key in task.instances[0].filtered_resps.keys():
@@ -396,27 +409,64 @@ def evaluate(
         vals = vals_torch

     if lm.rank == 0:
+        ### Get task ordering for correct sample-wide aggregation
+        group_to_task = {}
+        for group in task_hierarchy.keys():
+            if group not in task_order:
+                task_order[group] = 0
+
+            if len(task_hierarchy[group]) > 0:
+                group_to_task[group] = task_hierarchy[group].copy()
+
+            for task in task_hierarchy[group]:
+                if task in task_order:
+                    task_order[task] += 1
+                else:
+                    task_order[task] = 1 + task_order[group]
+
+                if task in task_hierarchy:
+                    group_to_task[group].remove(task)
+                    group_to_task[group].extend(task_hierarchy[task])
+
+        task_to_group = {}
+        for group in group_to_task:
+            for task in group_to_task[group]:
+                if task in task_to_group:
+                    task_to_group[task].append(group)
+                else:
+                    task_to_group[task] = [group]
+
         ### Aggregate results over all datapoints ###
         # aggregate results ; run bootstrap CIs
         for (task_name, key, metric), items in vals.items():
             task = task_dict[task_name]
+            metric_key = metric + "," + key
             if type(task) == tuple:
-                group, task = task
+                group_name, task = task
+            else:
+                group_name = None

-            task_score = task.aggregation()[metric](items)
-            results[task_name][metric + "," + key] = task_score
-            # Need to put back in results
-            # pythia | acc
-            #        | perplexity
-            #        | word_perplexity
-            #        | byte_perplexity
-            #        | bits_per_byte
-            if task_name in task_groups:
-                group_name = task_groups[task_name]
-                if metric in list(aggregate[group_name].keys()):
-                    aggregate[group_name][metric].append(task_score)
-                else:
-                    aggregate[group_name][metric] = [task_score]
+            agg_fn = task.aggregation()[metric]
+            task_score = agg_fn(items)
+
+            if group_name is not None:
+                sample_metric_key = metric + "(sample agg)," + key
+                for grouping in task_to_group[task_name]:
+                    if metric_key in results[grouping]:
+                        results[grouping][metric_key].append(task_score)
+                    else:
+                        results[grouping][metric_key] = [task_score]
+
+                    if sample_metric_key in results[grouping]:
+                        results[grouping][sample_metric_key] += items
+                    else:
+                        results[grouping][sample_metric_key] = items.copy()
+                        sample_agg_fn[grouping][sample_metric_key] = agg_fn
+
+            results[task_name][metric_key] = task_score

             # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
             # so we run them less iterations. still looking for a cleaner way to do this
@@ -431,19 +481,38 @@ def evaluate(
             if stderr is not None:
                 results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        if bool(aggregate):
-            for group in aggregate.keys():
-                for metric in aggregate[group].keys():
-                    aggregate[group][metric] = np.average(aggregate[group][metric])
-                versions[group] = "N/A"
+        if bool(results):
+            for task_or_group in results.keys():
+                for metric in results[task_or_group].keys():
+                    if type(results[task_or_group][metric]) == list:
+                        if "(sample agg)" in metric:
+                            results[task_or_group][metric] = sample_agg_fn[task_or_group][metric](results[task_or_group][metric])
+                        else:
+                            results[task_or_group][metric] = np.average(
+                                results[task_or_group][metric]
+                            )
+                versions[task_or_group] = "N/A"
+
+        for task_name, task in task_dict.items():
+            if type(task) == tuple:
+                group_name, task = task
+                order = task_order[group_name]
+                tabbed_name = "-" * order + group_name
+                results_agg[tabbed_name] = results[group_name]
+                versions[tabbed_name] = versions[group_name]
+                if order == 0:
+                    groups_agg[group_name] = results[group_name]
+
+            order = task_order[task_name]
+            tabbed_name = "-" * order + task_name
+            results_agg[tabbed_name] = results[task_name]
+            versions[tabbed_name] = versions[task_name]

     results_dict = {
-        "results": dict(sorted(results.items())),
-        **(
-            {"aggregate": dict(sorted(aggregate.items()))}
-            if bool(aggregate)
-            else {}
-        ),
+        "results": dict(results_agg.items()),
+        **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
         "configs": dict(sorted(configs.items())),
         "versions": dict(sorted(versions.items())),
     }
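Taken together, the evaluator changes make per-task scores roll up into every group that directly or transitively contains the task, with list-valued group entries averaged at the end. Below is a toy sketch of that two-level roll-up in plain Python, using a hypothetical hierarchy and scores rather than real harness objects.

```python
import collections

# hypothetical hierarchy: a "pythia" group containing two leaf tasks
task_hierarchy = {"pythia": ["sciq", "piqa"], "sciq": [], "piqa": []}
task_scores = {"sciq": {"acc": 0.75}, "piqa": {"acc": 0.25}}

# invert the hierarchy: task -> groups it should roll up into
task_to_group = collections.defaultdict(list)
for group, members in task_hierarchy.items():
    for task in members:
        task_to_group[task].append(group)

results = collections.defaultdict(dict)
for task, metrics in task_scores.items():
    for metric, score in metrics.items():
        results[task][metric] = score
        for group in task_to_group[task]:
            results[group].setdefault(metric, []).append(score)

# any value still stored as a list is a group entry waiting to be aggregated
for name, metrics in results.items():
    for metric, value in metrics.items():
        if isinstance(value, list):
            results[name][metric] = sum(value) / len(value)

print(dict(results))
# {'sciq': {'acc': 0.75}, 'pythia': {'acc': 0.5}, 'piqa': {'acc': 0.25}}
```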
lm_eval/models/huggingface.py

@@ -107,17 +107,20 @@ class HFLM(LM):
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
             device_list = set(
-                ["cuda", "cpu", "mps"]
+                ["cuda", "cpu"]
                 + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                + ["mps", "mps:0"]
             )
             if device:
                 if device not in device_list:
                     device = int(device)
                 self._device = torch.device(device)
                 eval_logger.info(f"Using device '{device}'")
-                if device == "mps":
+                if device in ("mps", "mps:0") and "dev" not in torch.__version__:
                     eval_logger.info(
-                        "MPS is still in beta and only supports float32; setting dtype to float32."
+                        "MPS: Setting dtype to float32. To use float16 with MPS, please install a nightly build of "
+                        "PyTorch: pip3 install --pre torch torchvision torchaudio --index-url "
+                        "https://download.pytorch.org/whl/nightly/cpu"
                     )
             else:
                 eval_logger.info("Device not specified")
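The device handling now accepts mps and mps:0 explicitly and only logs the float32 fallback on stable (non-dev) PyTorch builds. A small sketch of that validation in isolation follows; resolve_device is illustrative, and cuda_count/torch_version are plain parameters so the snippet runs without torch installed.

```python
def resolve_device(device: str, cuda_count: int = 0, torch_version: str = "2.0.1"):
    """Return (device, note) roughly following the validation sketched in the diff."""
    device_list = set(
        ["cuda", "cpu"]
        + [f"cuda:{i}" for i in range(cuda_count)]
        + ["mps", "mps:0"]
    )
    note = ""
    if device not in device_list:
        # anything unrecognised is treated as a bare device index, e.g. "0"
        device = int(device)
    elif device in ("mps", "mps:0") and "dev" not in torch_version:
        note = "MPS on stable PyTorch: dtype will be forced to float32"
    return device, note


print(resolve_device("mps"))        # ('mps', 'MPS on stable PyTorch: ...')
print(resolve_device("cuda:1", 2))  # ('cuda:1', '')
print(resolve_device("0"))          # (0, '')
```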
lm_eval/prompts/__init__.py

+import ast
+from typing import Dict
+
 from lm_eval import utils
 from lm_eval.logger import eval_logger
@@ -5,7 +8,7 @@ from lm_eval.logger import eval_logger
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.
 # This allows us to access prompts
-PROMPT_REGISTRY: dict[str, dict[str, str]] = {
+PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
     "qa-basic": {
         "question-newline-answer": "Question: {{question}}\nAnswer:",
         "q-newline-a": "Q: {{question}}\nA:",
@@ -63,6 +66,12 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa
     else:
         prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)

-    category_name, prompt_name = use_prompt.split(":")
+    category_name, *prompt_name = use_prompt.split(":")
+
+    # TODO allow to multiple prompt naming
+    # if len(prompt_name) > 1:
+    #     prompt_list = []
+    #     for prompt in prompt_name:
+    #         prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
+    # else:
     prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)

     return [":".join([category_name, prompt]) for prompt in prompt_list]
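The star-unpacking (category_name, *prompt_name = use_prompt.split(":")) keeps the split from raising when the prompt part is missing or contains extra colons, and the result is then wildcard-matched against the available template names. A rough sketch of that behaviour, using fnmatch as a stand-in for utils.pattern_match and a made-up "*" default when no prompt part is given:

```python
import fnmatch


def load_prompt_names(use_prompt: str, all_template_names: list) -> list:
    # "qa-basic:q-*" -> ("qa-basic", ["q-*"]); "qa-basic" -> ("qa-basic", [])
    category_name, *prompt_patterns = use_prompt.split(":")
    matched = [
        name
        for pattern in (prompt_patterns or ["*"])
        for name in all_template_names
        if fnmatch.fnmatch(name, pattern)
    ]
    return [":".join([category_name, prompt]) for prompt in matched]


templates = ["question-newline-answer", "q-newline-a"]
print(load_prompt_names("qa-basic:q-*", templates))  # ['qa-basic:q-newline-a']
```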
lm_eval/tasks/__init__.py
View file @
1f351067
import
os
import
os
import
yaml
import
yaml
from
typing
import
List
,
Union
from
typing
import
List
,
Union
,
Dict
from
lm_eval
import
utils
from
lm_eval
import
utils
from
lm_eval
import
prompts
from
lm_eval
import
prompts
...
@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
...
@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
)
)
def
register_configurable_task
(
config
:
d
ict
[
str
,
str
])
->
int
:
def
register_configurable_task
(
config
:
D
ict
[
str
,
str
])
->
int
:
SubClass
=
type
(
SubClass
=
type
(
config
[
"task"
]
+
"ConfigurableTask"
,
config
[
"task"
]
+
"ConfigurableTask"
,
(
ConfigurableTask
,),
(
ConfigurableTask
,),
...
@@ -38,7 +38,7 @@ def register_configurable_task(config: dict[str, str]) -> int:
...
@@ -38,7 +38,7 @@ def register_configurable_task(config: dict[str, str]) -> int:
return
0
return
0
def
check_prompt_config
(
config
:
d
ict
[
str
,
str
])
->
List
[
d
ict
[
str
,
str
]]:
def
check_prompt_config
(
config
:
D
ict
[
str
,
str
])
->
List
[
D
ict
[
str
,
str
]]:
all_configs
=
[]
all_configs
=
[]
if
"use_prompt"
in
config
:
if
"use_prompt"
in
config
:
prompt_list
=
prompts
.
load_prompt_list
(
prompt_list
=
prompts
.
load_prompt_list
(
...
@@ -69,7 +69,7 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
...
@@ -69,7 +69,7 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
return
all_configs
return
all_configs
def
get_task_name_from_config
(
task_config
:
d
ict
[
str
,
str
])
->
str
:
def
get_task_name_from_config
(
task_config
:
D
ict
[
str
,
str
])
->
str
:
if
"dataset_name"
in
task_config
:
if
"dataset_name"
in
task_config
:
return
"{dataset_path}_{dataset_name}"
.
format
(
**
task_config
)
return
"{dataset_path}_{dataset_name}"
.
format
(
**
task_config
)
else
:
else
:
...
@@ -128,7 +128,7 @@ def get_task_name_from_object(task_object):
...
@@ -128,7 +128,7 @@ def get_task_name_from_object(task_object):
# TODO: pass num_fewshot and other cmdline overrides in a better way
# TODO: pass num_fewshot and other cmdline overrides in a better way
def
get_task_dict
(
task_name_list
:
List
[
Union
[
str
,
d
ict
,
Task
]],
**
kwargs
):
def
get_task_dict
(
task_name_list
:
List
[
Union
[
str
,
D
ict
,
Task
]],
**
kwargs
):
config
=
{
**
kwargs
}
config
=
{
**
kwargs
}
...
@@ -136,6 +136,9 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
...
@@ -136,6 +136,9 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
task_name_from_config_dict
=
{}
task_name_from_config_dict
=
{}
task_name_from_object_dict
=
{}
task_name_from_object_dict
=
{}
if
type
(
task_name_list
)
!=
list
:
task_name_list
=
[
task_name_list
]
for
task_element
in
task_name_list
:
for
task_element
in
task_name_list
:
if
isinstance
(
task_element
,
str
):
if
isinstance
(
task_element
,
str
):
...
@@ -143,12 +146,20 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
...
@@ -143,12 +146,20 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
group_name
=
task_element
group_name
=
task_element
for
task_name
in
GROUP_REGISTRY
[
task_element
]:
for
task_name
in
GROUP_REGISTRY
[
task_element
]:
if
task_name
not
in
task_name_from_registry_dict
:
if
task_name
not
in
task_name_from_registry_dict
:
task_obj
=
get_task_dict
(
task_name
)
if
task_name
in
task_obj
.
keys
():
task_dict
=
{
task_name
:
(
group_name
,
task_obj
[
task_name
]),
}
else
:
task_dict
=
{
task_name
:
(
group_name
,
None
),
**
task_obj
,
}
task_name_from_registry_dict
=
{
task_name_from_registry_dict
=
{
**
task_name_from_registry_dict
,
**
task_name_from_registry_dict
,
task_name
:
(
**
task_dict
,
group_name
,
get_task
(
task_name
=
task_name
,
config
=
config
),
),
}
}
else
:
else
:
task_name
=
task_element
task_name
=
task_element
...
...
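The new group expansion calls get_task_dict recursively, so a group may contain other groups; members that resolve to another group are recorded with a None task object as placeholders while their children are kept alongside. A toy sketch of that expansion rule with made-up registries (not the harness's real signatures):

```python
# toy registries standing in for TASK_REGISTRY / GROUP_REGISTRY
TASKS = {"sciq": "SciqTask()", "piqa": "PiqaTask()"}
GROUPS = {"pythia": ["sciq", "nested"], "nested": ["piqa"]}


def get_task_dict(name: str) -> dict:
    """Recursively expand a task or group name into {task_name: (group, task_obj)}."""
    if name in TASKS:
        return {name: (None, TASKS[name])}
    out = {}
    for member in GROUPS[name]:
        resolved = get_task_dict(member)
        if member in resolved:
            out[member] = (name, resolved[member][1])  # direct member task
        else:
            out[member] = (name, None)                 # member is itself a group
            out.update(resolved)                       # keep its children too
    return out


print(get_task_dict("pythia"))
# {'sciq': ('pythia', 'SciqTask()'), 'nested': ('pythia', None), 'piqa': ('nested', 'PiqaTask()')}
```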
lm_eval/tasks/nq_open/README.md (new, empty file)

lm_eval/tasks/nq_open/nq_open.yaml (new file):

task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
    - "\n"
    - "."
    - ","
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - "\ban|a|the\b"
lm_eval/tasks/translation/utils.py

@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise Exception(
         "`pycountry` is required for generating translation task prompt templates. \
-please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
     )
lm_eval/utils.py

@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
@@ -267,9 +266,9 @@ def make_table(result_dict, column: str = "results"):
     from pytablewriter import MarkdownTableWriter, LatexTableWriter

     if column == "results":
-        column_name = "Task"
-    elif column == "aggregate":
-        column_name = "Benchmark"
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"

     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()
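With OmegaConf gone, simple_parse_args_string is a plain split plus dict comprehension, so every parsed value stays a string (the dotlist parsing previously coerced some value types); callers presumably cast where they need numbers or booleans. A runnable sketch of the new behaviour:

```python
def simple_parse_args_string(args_string: str) -> dict:
    # mirrors the OmegaConf-free version in the diff: no type coercion, values stay strings
    args_string = args_string.strip()
    if not args_string:
        return {}
    arg_list = [arg for arg in args_string.split(",") if arg]
    return {k: v for k, v in [arg.split("=") for arg in arg_list]}


args = simple_parse_args_string("pretrained=EleutherAI/pythia-160m,dtype=float32,trust_remote_code=True")
print(args["dtype"])                    # 'float32'
print(type(args["trust_remote_code"]))  # <class 'str'> -- no longer coerced to bool
```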
main.py

@@ -209,8 +209,8 @@ def main() -> None:
         f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
-    if "aggregate" in results:
-        print(evaluator.make_table(results, "aggregate"))
+    if "groups" in results:
+        print(evaluator.make_table(results, "groups"))


 if __name__ == "__main__":
pyproject.toml (+81 −0; the existing [build-system] block is unchanged, everything below it is new):

[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "lm_eval"
version = "1.0.0"
authors = [
    {name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
dependencies = [
    "accelerate>=0.21.0",
    "evaluate",
    "datasets>=2.0.0",
    "evaluate>=0.4.0",
    "jsonlines",
    "numexpr",
    "peft>=0.2.0",
    "pybind11>=2.6.2",
    "pytablewriter",
    "rouge-score>=0.0.4",
    "sacrebleu>=1.5.0",
    "scikit-learn>=0.24.1",
    "sqlitedict",
    "torch>=1.8",
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
]

[tool.setuptools]
packages = ["lm_eval"]

# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]

[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"

[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
    "flake8",
    "pylint",
    "mypy",
    "pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
    "lm_eval[dev]",
    "lm_eval[testing]",
    "lm_eval[linting]",
    "lm_eval[multilingual]",
    "lm_eval[sentencepiece]",
    "lm_eval[promptsource]",
    "lm_eval[gptq]",
    "lm_eval[anthropic]",
    "lm_eval[openai]",
]
scripts/write_out.py

@@ -38,12 +38,14 @@ def main():
         iters = []

         for set in args.sets.split(","):
+            docs = None
             if set == "train" and task.has_training_docs():
                 docs = task.training_docs()
             if set == "val" and task.has_validation_docs():
                 docs = task.validation_docs()
             if set == "test" and task.has_test_docs():
                 docs = task.test_docs()
-            iters.append(docs)
+            if docs is not None:
+                iters.append(docs)

         docs = join_iters(iters)
setup.py

 import setuptools
-import itertools

-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
+# This is to make sure that the package supports editable installs
+setuptools.setup()

-extras_require = {
-    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-    "linting": [
-        "flake8",
-        "pylint",
-        "mypy",
-        "pre-commit",
-    ],
-    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
-    "promptsource": [
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-    ],
-    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-    "anthropic": ["anthropic"],
-    "openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-
-setuptools.setup(
-    name="lm_eval",
-    version="1.0.0",
-    author="EleutherAI",
-    author_email="contact@eleuther.ai",
-    description="A framework for evaluating language models",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/EleutherAI/lm-evaluation-harness",
-    packages=setuptools.find_packages(),
-    # required to include yaml files in pip installation
-    package_data={
-        "lm_eval": ["**/*.yaml", "tasks/**/*"],
-        "examples": ["**/*.yaml"],
-    },
-    entry_points={
-        "console_scripts": [
-            "lm-eval = main:main",
-            "lm_eval = main:main",
-        ],
-    },
-    include_package_data=True,
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.9",
-    install_requires=[
-        "accelerate>=0.21.0",
-        "evaluate",
-        "datasets>=2.0.0",
-        "evaluate>=0.4.0",
-        "jsonlines",
-        "numexpr",
-        "omegaconf>=2.2",
-        "peft>=0.2.0",
-        "pybind11>=2.6.2",
-        "pytablewriter",
-        "rouge-score>=0.0.4",
-        "sacrebleu>=1.5.0",
-        "scikit-learn>=0.24.1",
-        "sqlitedict",
-        "torch>=1.8",
-        "tqdm-multiprocess",
-        "transformers>=4.1",
-        "zstandard",
-    ],
-    extras_require=extras_require,
-)
tests/test_evaluator.py

@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List

 import random
 import pytest
@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
tests/utils.py

@@ -9,6 +9,7 @@ import os
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
     return list(_output)

-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
     FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
     if os.path.exists(FILENAME):
         # If tasks folder has changed then we get the list of files from FILENAME