"dgl_sparse/include/vscode:/vscode.git/clone" did not exist on "0698e91a0e4b40bd4a5a4e59205d098e1bb3d3c9"
Unverified commit c2bf7f32, authored by Stella Biderman, committed by GitHub

Merge branch 'master' into cmmlu

parents 26621176 3ccea2b2
-FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04
+FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
### Install python 3.10 and set it as default python interpreter
RUN apt update && apt install software-properties-common -y && \
add-apt-repository ppa:deadsnakes/ppa -y && apt update && \
apt install curl -y && \
@@ -13,7 +13,7 @@ curl -Ss https://bootstrap.pypa.io/get-pip.py | python3.10 && \
apt-get clean && rm -rf /var/lib/apt/lists/
### Copy files
COPY . /lm-evaluation-harness/
### Set working directory
@@ -22,9 +22,6 @@ WORKDIR /lm-evaluation-harness
### Install requirements
RUN pip install --no-cache-dir -e .
### Run bash
CMD ["/bin/bash"]
@@ -8,7 +8,7 @@ We’d like your help to test it out! you can help by:
1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
- A shell command to run the task in the `master` branch, and what the score is
- A shell command to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
......
@@ -309,7 +309,9 @@ class BaseLM(LM):
if override_bs is not None
else 0,
fn=_batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
):
inps = []
@@ -375,7 +377,9 @@ class BaseLM(LM):
# Slice to original seq length
contlen = len(cont_toks)
inplen = inplen + (
logits.shape[0] - padding_length
)  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
logits = logits[inplen - contlen : inplen].unsqueeze(
0
)  # [1, seq, vocab]
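For intuition, here is a small self-contained sketch of the slice performed above: only the logit rows that belong to the continuation tokens are kept, and a batch dimension is added. The shapes and vocabulary size are toy values, not the harness's real tensors.

```python
import torch

# Toy illustration of the continuation slice above.
padding_length = 8
logits = torch.randn(padding_length, 50257)   # [padded_seq, vocab]
inplen, contlen = 5, 2                        # prompt+continuation length, continuation length
inplen = inplen + (logits.shape[0] - padding_length)  # no virtual tokens here, so unchanged
cont_logits = logits[inplen - contlen : inplen].unsqueeze(0)  # [1, contlen, vocab]
print(cont_logits.shape)  # torch.Size([1, 2, 50257])
```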
......
@@ -74,14 +74,19 @@ def simple_evaluate(
if model_args is None:
model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
elif isinstance(model, transformers.PreTrainedModel):
lm = lm_eval.models.get_model("hf-causal")(
pretrained=model,
batch_size=batch_size,
max_batch_size=max_batch_size,
)
no_cache = True
else:
assert isinstance(model, lm_eval.base.LM)
@@ -125,7 +130,9 @@ def simple_evaluate(
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values())
if hasattr(lm, "batch_sizes")
else [],
"device": device,
"no_cache": no_cache,
"limit": limit,
......
@@ -4,9 +4,7 @@ from typing import Optional, Union
from lm_eval.base import BaseLM
def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
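As a quick illustration of the conversion described in that docstring, here is a sketch assuming the usual `getattr`-based mapping; it is not necessarily the harness's exact body.

```python
import torch
from typing import Union

def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    # "float16" -> torch.float16; torch.dtype values and "auto" pass through unchanged
    if isinstance(dtype, str) and dtype != "auto":
        return getattr(torch, dtype)
    return dtype

assert _get_dtype("bfloat16") is torch.bfloat16
assert _get_dtype(torch.float32) is torch.float32
```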
@@ -33,11 +31,10 @@ class HFLM(BaseLM):
max_length=None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
dtype: Optional[Union[str, torch.dtype]] = "auto",
):
super().__init__()
# Initialize model
if isinstance(pretrained, transformers.PreTrainedModel):
self.model = pretrained
@@ -45,28 +42,25 @@ class HFLM(BaseLM):
if tokenizer:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer
model_name = self.model.name_or_path
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
)
elif isinstance(pretrained, str):
# Initialize device
assert isinstance(device, str)
device_list = set(
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device and device in device_list:
self._device = torch.device(device)
@@ -83,21 +77,23 @@ class HFLM(BaseLM):
# Initialize new model and tokenizer instances
self.model = transformers.AutoModelForCausalLM.from_pretrained(
pretrained,
load_in_8bit=load_in_8bit,
low_cpu_mem_usage=low_cpu_mem_usage,
revision=revision,
torch_dtype=_get_dtype(dtype),
trust_remote_code=trust_remote_code,
).to(self.device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer if tokenizer else pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
else:
raise TypeError(
"Parameter pretrained should be of type str or transformers.PreTrainedModel"
)
self.model.eval()
@@ -124,7 +120,7 @@ class HFLM(BaseLM):
@property
def max_length(self):
if self._max_length:  # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
@@ -136,7 +132,6 @@ class HFLM(BaseLM):
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self):
return 256
@@ -171,8 +166,10 @@ class HFLM(BaseLM):
def _model_generate(self, context, max_length, eos_token_id):
generation_kwargs = {"do_sample": False, "max_length": max_length}
if eos_token_id is not None:
generation_kwargs["eos_token_id"] = eos_token_id
generation_kwargs[
"pad_token_id"
] = eos_token_id  # setting eos_token_id as pad token
return self.model.generate(context, **generation_kwargs)
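A runnable sketch of the generation call shaped above: greedy decoding with the EOS token id reused as `pad_token_id`, which is what the reformatted assignment sets. The `gpt2` checkpoint is only an example.

```python
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

context = tok("Question: What is 2 + 2?\nAnswer:", return_tensors="pt").input_ids
generation_kwargs = {"do_sample": False, "max_length": 32}
eos_token_id = tok.eos_token_id
if eos_token_id is not None:
    generation_kwargs["eos_token_id"] = eos_token_id
    generation_kwargs["pad_token_id"] = eos_token_id  # reuse EOS as the pad token
print(tok.decode(model.generate(context, **generation_kwargs)[0]))
```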
......
@@ -198,14 +198,13 @@ class GPT3LM(BaseLM):
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.max_gen_toks,
temperature=0.0,
logprobs=10,
-stop=until,
+stop=until["until"],
)
for resp, (context, until_) in zip(response.choices, chunk):
......
@@ -19,7 +19,6 @@ _DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.dev
def _get_accelerate_args(
-low_cpu_mem_usage: Optional[bool] = True,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
@@ -39,7 +38,6 @@ def _get_accelerate_args(
args = {}
if max_memory:
args["max_memory"] = max_memory
-args["low_cpu_mem_usage"] = low_cpu_mem_usage
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@@ -94,6 +92,7 @@ class HuggingFaceAutoLM(BaseLM):
load_in_4bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
gptq_use_triton: Optional[bool] = False,
+inject_fused_attention: Optional[bool] = True,
bnb_4bit_quant_type: Optional[str] = None,
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
bnb_4bit_use_double_quant: Optional[bool] = False,
@@ -160,6 +159,8 @@ class HuggingFaceAutoLM(BaseLM):
If True, will trust the remote code when loading the model.
gptq_use_triton (bool, optional, defaults to False):
Use Triton for GPTQ inference.
+inject_fused_attention (bool, optional, defaults to True):
+Inject fused attention into GPTQ model.
bnb_4bit_quant_type (str, optional, defaults to None):
The quantization type to use for BnB 4bit quantization. See:
https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py#L77
@@ -219,7 +220,6 @@ class HuggingFaceAutoLM(BaseLM):
model_kwargs = {}
if use_accelerate:
model_kwargs = _get_accelerate_args(
-low_cpu_mem_usage,
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
@@ -233,11 +233,13 @@ class HuggingFaceAutoLM(BaseLM):
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
gptq_use_triton=gptq_use_triton,
+inject_fused_attention=inject_fused_attention,
load_in_8bit=load_in_8bit,
load_in_4bit=load_in_4bit,
bnb_4bit_quant_type=bnb_4bit_quant_type,
bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+low_cpu_mem_usage=low_cpu_mem_usage,
**model_kwargs,
)
# note: peft_path can be different than pretrained model path
@@ -262,7 +264,9 @@ class HuggingFaceAutoLM(BaseLM):
try:
self.model.to(self._device)
except:
print(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
)
def _create_auto_model(
self,
@@ -280,6 +284,7 @@ class HuggingFaceAutoLM(BaseLM):
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
gptq_use_triton: Optional[bool] = False,
+inject_fused_attention: Optional[bool] = True,
bnb_4bit_quant_type: Optional[str] = None,
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
bnb_4bit_use_double_quant: Optional[bool] = False,
@@ -287,7 +292,9 @@ class HuggingFaceAutoLM(BaseLM):
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
if not quantized:
if load_in_4bit:
assert (
transformers.__version__ >= "4.30.0"
), "load_in_4bit requires transformers >= 4.30.0"
model_kwargs = {}
if transformers.__version__ >= "4.30.0":
model_kwargs["load_in_4bit"] = load_in_4bit
@@ -295,9 +302,13 @@ class HuggingFaceAutoLM(BaseLM):
if bnb_4bit_quant_type:
model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
if bnb_4bit_compute_dtype:
model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(
bnb_4bit_compute_dtype
)
if bnb_4bit_use_double_quant:
model_kwargs[
"bnb_4bit_use_double_quant"
] = bnb_4bit_use_double_quant
model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -312,15 +323,19 @@ class HuggingFaceAutoLM(BaseLM):
)
else:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
pretrained,
model_basename=None if quantized == True else Path(quantized).stem,
device_map=device_map,
max_memory=max_memory,
trust_remote_code=trust_remote_code,
use_safetensors=True
if quantized == True
else quantized.endswith(".safetensors"),
use_triton=gptq_use_triton,
warmup_triton=gptq_use_triton,
+inject_fused_attention=inject_fused_attention,
)
return model
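For context on the new `inject_fused_attention` flag, here is a hedged sketch of the corresponding `auto_gptq` call; the checkpoint path is purely illustrative and not part of this commit.

```python
from auto_gptq import AutoGPTQForCausalLM

# Illustrative only: any GPTQ-quantized checkpoint directory or hub id would do.
model = AutoGPTQForCausalLM.from_quantized(
    "path/to/gptq-quantized-model",
    device_map="auto",
    use_safetensors=True,
    use_triton=False,
    inject_fused_attention=False,  # disable fused attention for architectures that lack support
)
```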
......
@@ -20,6 +20,7 @@ from . import swag
from . import openbookqa
from . import squad
from . import naturalqs
+from . import nqopen
from . import sat
from . import arithmetic
from . import lambada
@@ -151,6 +152,7 @@ TASK_REGISTRY = {
"squad2": squad.SQuAD2,
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
+"nq_open": nqopen.NQOpen,
"headqa": headqa.HeadQAEsDeprecated,  # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn,
@@ -328,11 +330,11 @@ TASK_REGISTRY = {
"csatqa_rch": csatqa.RCH,
"csatqa_li": csatqa.LI,
"haerae_hi": haerae.HI,
"haerae_kgk": haerae.KGK,
"haerae_lw": haerae.LW,
"haerae_rc": haerae.RC,
"haerae_rw": haerae.RW,
"haerae_sn": haerae.SN,
# Requires manual download
# Requires manual download of data.
# "storycloze_2016": storycloze.StoryCloze2016,
......
@@ -16,6 +16,7 @@ _CITATION = """
}
"""
class Babi(Task):
VERSION = 0
DATASET_PATH = "Muennighoff/babi"
@@ -43,18 +44,16 @@ class Babi(Task):
return self.dataset["test"]
def doc_to_text(self, doc):
return doc["passage"] + doc["question"]
def should_decontaminate(self):
return False  # TODO Necessary?
def doc_to_decontamination_query(self, doc):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return " " + doc["answer"]
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
......
@@ -12,7 +12,7 @@ from lm_eval.base import MultipleChoiceTask
_CITATION = """
@article{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
journal={arXiv preprint arXiv:2305.08322},
year={2023}
@@ -21,58 +21,58 @@ _CITATION = """
SUBJECTS = {
"computer_network": "计算机网络",
"operating_system": "操作系统",
"computer_architecture": "计算机组成",
"college_programming": "大学编程",
"college_physics": "大学物理",
"college_chemistry": "大学化学",
"advanced_mathematics": "高等数学",
"probability_and_statistics": "概率统计",
"discrete_mathematics": "离散数学",
"electrical_engineer": "注册电气工程师",
"metrology_engineer": "注册计量师",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理",
"high_school_chemistry": "高中化学",
"high_school_biology": "高中生物",
"middle_school_mathematics": "初中数学",
"middle_school_biology": "初中生物",
"middle_school_physics": "初中物理",
"middle_school_chemistry": "初中化学",
"veterinary_medicine": "兽医学",
"college_economics": "大学经济学",
"business_administration": "工商管理",
"marxism": "马克思主义基本原理",
"mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
"education_science": "教育学",
"teacher_qualification": "教师资格",
"high_school_politics": "高中政治",
"high_school_geography": "高中地理",
"middle_school_politics": "初中政治",
"middle_school_geography": "初中地理",
"modern_chinese_history": "近代史纲要",
"ideological_and_moral_cultivation": "思想道德修养与法律基础",
"logic": "逻辑学",
"law": "法学",
"chinese_language_and_literature": "中国语言文学",
"art_studies": "艺术学",
"professional_tour_guide": "导游资格",
"legal_professional": "法律职业资格",
"high_school_chinese": "高中语文",
"high_school_history": "高中历史",
"middle_school_history": "初中历史",
"civil_servant": "公务员",
"sports_science": "体育学",
"plant_protection": "植物保护",
"basic_medicine": "基础医学",
"clinical_medicine": "临床医学",
"urban_and_rural_planner": "注册城乡规划师",
"accountant": "注册会计师",
"fire_engineer": "注册消防工程师",
"environmental_impact_assessment_engineer": "环境影响评价工程师",
"tax_accountant": "税务师",
"physician": "医师资格",
}
@@ -112,11 +112,11 @@ class CevalSubject(MultipleChoiceTask):
def validation_docs(self):
if self.has_validation_docs():
return map(self._process_doc, self.dataset["val"])
def test_docs(self):
if self.has_test_docs():
return map(self._process_doc, self.dataset["test"])
def _format_subject(self, subject):
words = subject.split("_")
@@ -124,7 +124,7 @@ class CevalSubject(MultipleChoiceTask):
def fewshot_context(self, doc, num_fewshot, **kwargs):
subject = self.DATASET_NAME
description = f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
kwargs["description"] = description
return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -140,9 +140,7 @@ class CevalSubject(MultipleChoiceTask):
"""
question = doc["question"].strip()
choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
prompt = f"{question}\n{choices}答案:"
return prompt
@@ -150,7 +148,7 @@ class CevalSubject(MultipleChoiceTask):
return {
"query": format_example(doc, keys),
"choices": keys,
"gold": ord(doc["answer"]) - ord("A"),
}
def fewshot_examples(self, k, rnd):
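To make the prompt construction above concrete, here is a toy example (not a real C-Eval item) of what the `format_example` logic and the gold-index conversion produce:

```python
doc = {"question": "1 + 1 等于多少?", "A": "1", "B": "2", "C": "3", "D": "4", "answer": "B"}
keys = ["A", "B", "C", "D"]

choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
prompt = f"{doc['question'].strip()}\n{choices}答案:"
gold = ord(doc["answer"]) - ord("A")  # letter answer -> 0-based index, here 1

print(prompt)
# 1 + 1 等于多少?
# A. 1
# B. 2
# C. 3
# D. 4
# 答案:
```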
......
@@ -109,16 +109,16 @@ SUBJECT_MAPPING = {
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
"chinese_history": "中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
"college_actuarial_science": "大学精算学",
"college_education": "大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
"college_medical_statistics": "大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
@@ -127,8 +127,8 @@ SUBJECT_MAPPING = {
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
"elementary_chinese": "小学语文",
"elementary_commonsense": "小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
@@ -159,12 +159,12 @@ SUBJECT_MAPPING = {
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
"security_study": "安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
"world_history": "世界历史",
"world_religions": "世界宗教",
}
......
@@ -16,7 +16,7 @@ class CSATQA(MultipleChoiceTask):
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
instruction = f"""다음을 읽고 정답으로 알맞은 것을 고르시요.
### Context: {doc["context"]}
@@ -25,11 +25,17 @@ class CSATQA(MultipleChoiceTask):
(1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
### Answer: 주어진 문제의 정답은"""
choices = [
doc["option#1"],
doc["option#2"],
doc["option#3"],
doc["option#4"],
doc["option#5"],
]
out_doc = {
"question": instruction,
"choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
"gold": int(doc["gold"]) - 1,
}
return out_doc
@@ -40,18 +46,23 @@ class CSATQA(MultipleChoiceTask):
class WR(CSATQA):
DATASET_NAME = "WR"
class GR(CSATQA):
DATASET_NAME = "GR"
class RCS(CSATQA):
DATASET_NAME = "RCS"
class RCSS(CSATQA):
DATASET_NAME = "RCSS"
class RCH(CSATQA):
DATASET_NAME = "RCH"
class LI(CSATQA):
DATASET_NAME = "LI"
@@ -16,7 +16,7 @@ class Haerae(MultipleChoiceTask):
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
choices = [doc["o1"], doc["o2"], doc["o3"], doc["o4"]]
if doc.get("o5") is not None:
@@ -24,7 +24,7 @@ class Haerae(MultipleChoiceTask):
out_doc = {
"query": doc["query"],
"choices": choices,
"gold": int(doc["gold"]) - 1,
}
return out_doc
......
"""
Latent Retrieval for Weakly Supervised Open Domain Question Answering
https://arxiv.org/pdf/1906.00300.pdf
Natural Questions: a Benchmark for Question Answering Research
https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf
The NQ-Open task, introduced by Lee et. al. 2019, is an open-domain question
answering benchmark that is derived from Natural Questions. The goal is to predict
an English answer string for an input English question. All questions can be
answered using the contents of English Wikipedia.
Homepage: https://github.com/google-research-datasets/natural-questions/tree/master/nq_open
"""
import regex
import string
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{lee-etal-2019-latent,
title = "Latent Retrieval for Weakly Supervised Open Domain Question Answering",
author = "Lee, Kenton and
Chang, Ming-Wei and
Toutanova, Kristina",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1612",
doi = "10.18653/v1/P19-1612",
pages = "6086--6096",
abstract = "Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.",
}
"""
class NQOpen(Task):
VERSION = 0
DATASET_PATH = "nq_open"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
raise NotImplementedError()
def doc_to_text(self, doc):
return f"Q: {doc['question']}\nA:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
return " " + doc["answer"][0]
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, {"until": ["\n", ".", ","]})
return continuation
def _normalize_answer(self, text):
# Lowercase and remove punctuation, strip whitespace
text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
# Remove articles, resulting in duplicate whitespace
text = regex.sub(r"\b(a|an|the)\b", " ", text)
# Remove duplicate whitespace
text = " ".join(text.split())
return text
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
continuation = self._normalize_answer(results[0])
answers = [self._normalize_answer(answer) for answer in doc["answer"]]
return {"em": float(continuation in answers)}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"em": mean,
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"em": True,
}
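A small standalone illustration of the exact-match scoring used by `NQOpen.process_results`; the strings are made up, and the normalization mirrors `_normalize_answer` above:

```python
import regex
import string

def normalize_answer(text):
    # lowercase, strip punctuation, drop articles, collapse whitespace
    text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
    text = regex.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

prediction = "The Eiffel Tower."
gold_answers = ["Eiffel Tower", "La tour Eiffel"]
em = float(normalize_answer(prediction) in [normalize_answer(a) for a in gold_answers])
print(em)  # 1.0
```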
@@ -33,34 +33,43 @@ _CITATION = """
class Pubmed_QA(Task):
VERSION = 0
-DATASET_PATH = "pubmed_qa"
-DATASET_NAME = "pqa_labeled"
+DATASET_PATH = "bigbio/pubmed_qa"
+DATASET_NAME = "pubmed_qa_labeled_fold0_source"
def has_training_docs(self):
-return False
+return True
def has_validation_docs(self):
-return False
+return True
def has_test_docs(self):
return True
+def training_docs(self):
+if self.has_training_docs():
+if self._training_docs is None:
+self._training_docs = self.dataset["train"]
+return self._training_docs
+def validation_docs(self):
+if self.has_validation_docs():
+return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
-# HF is labelled as train but its really just for testing
-return self.dataset["train"]
+return self.dataset["test"]
def doc_to_text(self, doc):
-ctxs = "\n".join(doc["context"]["contexts"])
+ctxs = "\n".join(doc["CONTEXTS"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format(
-ctxs, doc["question"], doc["final_decision"]
+ctxs, doc["QUESTION"], doc["final_decision"]
)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
-return doc["question"] + " " + "\n".join(doc["context"]["contexts"])
+return doc["question"] + " " + "\n".join(doc["CONTEXTS"])
def doc_to_target(self, doc):
return " {}".format(doc["final_decision"])
......
@@ -42,7 +42,7 @@ import re
_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
author = "Shaham, Uri and
Segal, Elad and
Ivgi, Maor and
Efrat, Avia and
@@ -72,9 +72,14 @@ def _download_metric():
import os
import shutil
from huggingface_hub import hf_hub_download
scrolls_metric_path = hf_hub_download(
repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
)
updated_scrolls_metric_path = (
os.path.dirname(scrolls_metric_path)
+ os.path.basename(scrolls_metric_path).replace(".", "_")
+ ".py"
)
shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
return updated_scrolls_metric_path
@@ -92,7 +97,7 @@ def _process_doc_prepended_question(doc):
"input": input,
"outputs": doc["outputs"],
"question": input[0:split],
"text": input[split + 2 :],
}
@@ -102,7 +107,9 @@ def _drop_duplicates_in_input(untokenized_dataset):
indices_to_keep = []
id_to_idx = {}
outputs = []
for i, (id_, output) in enumerate(
zip(untokenized_dataset["id"], untokenized_dataset["output"])
):
if id_ in id_to_idx:
outputs[id_to_idx[id_]].append(output)
continue
@@ -119,9 +126,11 @@ def _num_cpu_cores():
# https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
try:
import psutil
return psutil.cpu_count(logical=False)
except ImportError:
import os
return len(os.sched_getaffinity(0))
@@ -135,7 +144,11 @@ class _SCROLLSTask(Task):
def __init__(self, no_metric=False):
super().__init__()
self.metric = (
load_metric(_download_metric(), config_name=self.DATASET_NAME)
if not no_metric
else None
)
def has_training_docs(self):
return True
@@ -176,7 +189,10 @@ class _SCROLLSTask(Task):
that are less than `max_tokens` when tokenized by each tokenizer
"""
tokenizers = [
AutoTokenizer.from_pretrained(tokenizer)
for tokenizer in self.PRUNE_TOKENIZERS
]
cache = {}
def _filter(sample):
@@ -210,18 +226,21 @@ class _SCROLLSTask(Task):
def _make_compute_metrics(self, value):
def compute_metrics(samples):
predictions, references = zip(*samples)  # unzip, if you will
computed = self.metric.compute(
predictions=predictions, references=references
)
return computed[value]
return compute_metrics
def aggregation(self):
return {
key: self._make_compute_metrics(value)
for key, value in self._scrolls_metrics().items()
}
class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
def __init__(self):
super().__init__(no_metric=True)
@@ -229,18 +248,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
return None
def aggregation(self):
return {"em": mean, "acc": mean, "acc_norm": mean}
def higher_is_better(self):
return {"em": True, "acc": True, "acc_norm": True}
def process_results(self, doc, results):
gold = doc["gold"]
@@ -264,22 +275,25 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
class _SCROLLSSummaryTask(_SCROLLSTask):
def _process_doc(self, doc):
return [doc]
def _scrolls_metrics(self):
return {
"rouge1": "rouge/rouge1",
"rouge2": "rouge/rouge2",
"rougeL": "rouge/rougeL",
}
def process_results(self, doc, results):
return {
"rouge1": (results[0], doc["outputs"]),
"rouge2": (results[0], doc["outputs"]),
"rougeL": (results[0], doc["outputs"]),
}
def construct_requests(self, doc, ctx):
return [rf.greedy_until(ctx, {"until": ["\n"]})]
def doc_to_text(self, doc):
return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@@ -294,8 +308,12 @@ class Qasper(_SCROLLSTask):
def _process_doc(self, doc):
doc = _process_doc_prepended_question(doc)
doc["is_yes_no"] = reduce(
lambda prev, cur: prev
and squad_metrics.normalize_answer(cur) in ["yes", "no"],
doc["outputs"],
True,
)
return [doc]
def _scrolls_metrics(self):
@@ -308,9 +326,7 @@ class Qasper(_SCROLLSTask):
prediction = "Unanswerable"
else:
prediction = results[0]
return {"f1": (prediction, doc["outputs"])}
def construct_requests(self, doc, ctx):
if doc["is_yes_no"]:
@@ -318,7 +334,7 @@ class Qasper(_SCROLLSTask):
ll_no, _ = rf.loglikelihood(ctx, " no")
return [ll_yes, ll_no]
else:
return [rf.greedy_until(ctx, {"until": ["\n"]})]
class QuALITY(_SCROLLSMultipleChoiceTask):
@@ -340,8 +356,10 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
choices_text = doc["text"][:split]
doc["text"] = doc["text"][split:].strip()
doc["choices"] = [
QuALITY._normalize_answer(choice)
for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
]
doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
return [doc]
@@ -368,12 +386,10 @@ class NarrativeQA(_SCROLLSTask):
return self._process_doc(doc)[0]["text"]
def process_results(self, doc, results):
return {"f1": (results[0], doc["outputs"])}
def construct_requests(self, doc, ctx):
return [rf.greedy_until(ctx, {"until": ["\n"]})]
class ContractNLI(_SCROLLSMultipleChoiceTask):
@@ -439,5 +455,5 @@ def construct_tasks():
"scrolls_contractnli": ContractNLI,
"scrolls_govreport": GovReport,
"scrolls_summscreenfd": SummScreenFD,
"scrolls_qmsum": QMSum,
}
@@ -76,8 +76,16 @@ class TriviaQA(Task):
return continuation
def process_results(self, doc, results):
continuation = (
results[0]
.strip()
.lower()
.translate(str.maketrans("", "", string.punctuation))
)
list_of_candidates = [
alias.lower().translate(str.maketrans("", "", string.punctuation))
for alias in doc["answer"]["aliases"]
]
return {"em": float(continuation in list_of_candidates)}
def aggregation(self):
......
@@ -12,17 +12,27 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument(
"--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
)
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument(
"--max_batch_size",
type=int,
default=None,
help="Maximal batch size to try with --batch_size auto",
)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument(
"--limit",
type=float,
default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -77,7 +87,9 @@ def main():
print(dumped)
if args.output_path:
-os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+dirname = os.path.dirname(args.output_path)
+if dirname:
+os.makedirs(dirname, exist_ok=True)
with open(args.output_path, "w") as f:
f.write(dumped)
......
@@ -9,7 +9,12 @@ from lm_eval import tasks, utils
seq2seq_models = ["google/flan-t5-small"]
causal_models = [
"gpt2",
"facebook/opt-125m",
"EleutherAI/gpt-neo-125m",
"EleutherAI/pythia-160m",
]
model_names = seq2seq_models + causal_models
@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
results = {}
for model in args.models:
model_type = (
"hf-causal-experimental"
if model in causal_models
else "hf-seq2seq"
if model in seq2seq_models
else args.model
)
model_args = f"pretrained={model},{args.model_args}"
# TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
tasks = (
args.tasks
if model in causal_models or model_type == "hf-causal-experimental"
else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
)
# TODO: OOM with auto for seq2seq models, also can OOM with llama
batch_size = (
args.batch_size
if model in causal_models or model_type == "hf-causal-experimental"
else 64
if args.batch_size == "auto"
else args.batch_size
)
output_path = (
f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
)
command = (
f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
)
print(
f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
)
ret = os.system(command)
@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
def main():
args = parse_args()
args.branches = (
args.branches.split(",") if type(args.branches) == str else args.branches
)
args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = (
tasks.ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(
args.tasks.split(",") if type(args.tasks) == str else args.tasks,
tasks.ALL_TASKS,
)
)
global initial_branch
initial_branch = (
subprocess.check_output("git branch --show-current", shell=True)
.decode("ascii")
.strip()
)
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
@@ -132,10 +168,16 @@ def main():
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
print(
f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
)
for branch, branch_results, branch_runtime in runs:
print(
f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
)
print(
f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
)
print("")
print("|branch|runtime|%|")
......
@@ -12,10 +12,8 @@ setuptools.setup(
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/EleutherAI/lm-evaluation-harness",
-packages=setuptools.find_packages(),
+packages=setuptools.find_packages(exclude=["scripts.*", "scripts"]),
package_data={"lm_eval": ["**/*.json"]},
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
......