Unverified commit 2c18e367 authored by Lintang Sutawika, committed by GitHub

Merge pull request #876 from jonabur/output_bugfix

fix bug with output path in CWD
parents 00209e10 93cbffa5
@@ -25,6 +25,3 @@ WORKDIR /lm-evaluation-harness
 RUN pip install --no-cache-dir -e .
 ### Run bash
 CMD ["/bin/bash"]
@@ -309,7 +309,9 @@ class BaseLM(LM):
             if override_bs is not None
             else 0,
             fn=_batch_scheduler
-            if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
             else None,
         ):
             inps = []
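Note: with `--batch_size auto`, `_batch_scheduler` is engaged only when there are requests to schedule and no explicit override. The scheduler itself is not shown in this diff; as a rough illustration of the idea behind automatic batch sizing (a hypothetical helper, not the harness's actual implementation), it amounts to shrinking a candidate size until a trial batch fits in GPU memory:

```python
import torch

def find_workable_batch_size(start_size, run_trial_batch):
    """Halve the candidate batch size until a trial forward pass stops OOM-ing.

    `run_trial_batch` is any callable that executes one batch of the given size.
    Sketch only; the harness wires this logic up through `_batch_scheduler`.
    """
    size = start_size
    while size > 1:
        try:
            run_trial_batch(size)
            return size
        except RuntimeError as e:  # CUDA OOM surfaces as a RuntimeError
            if "out of memory" not in str(e):
                raise
            torch.cuda.empty_cache()
            size //= 2
    return 1
```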
@@ -375,7 +377,9 @@ class BaseLM(LM):
                 # Slice to original seq length
                 contlen = len(cont_toks)
-                inplen = inplen + (logits.shape[0] - padding_length)  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
+                inplen = inplen + (
+                    logits.shape[0] - padding_length
+                )  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
                 logits = logits[inplen - contlen : inplen].unsqueeze(
                     0
                 )  # [1, seq, vocab]
...
@@ -74,7 +74,12 @@ def simple_evaluate(
         if model_args is None:
             model_args = ""
         lm = lm_eval.models.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
         )
     elif isinstance(model, transformers.PreTrainedModel):
         lm = lm_eval.models.get_model("hf-causal")(
@@ -125,7 +130,9 @@ def simple_evaluate(
         "model_args": model_args,
         "num_fewshot": num_fewshot,
         "batch_size": batch_size,
-        "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
+        "batch_sizes": list(lm.batch_sizes.values())
+        if hasattr(lm, "batch_sizes")
+        else [],
         "device": device,
         "no_cache": no_cache,
         "limit": limit,
...
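Note: `create_from_arg_string` merges the `--model_args` string with the explicit settings passed as a dict. A minimal sketch of that parsing, assuming a plain comma-separated `key=value` grammar (illustrative names, not the harness's exact helpers):

```python
def create_kwargs_from_arg_string(arg_string, additional_config):
    """Parse "pretrained=gpt2,dtype=float16" into kwargs, then overlay extras."""
    kwargs = {}
    for pair in filter(None, arg_string.split(",")):
        key, value = pair.split("=", 1)
        kwargs[key.strip()] = value.strip()
    # Explicit settings such as batch_size / max_batch_size / device win out.
    kwargs.update({k: v for k, v in additional_config.items() if v is not None})
    return kwargs

# create_kwargs_from_arg_string("pretrained=gpt2,dtype=float16", {"device": "cuda:0"})
# -> {"pretrained": "gpt2", "dtype": "float16", "device": "cuda:0"}
```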
@@ -4,9 +4,7 @@ from typing import Optional, Union
 from lm_eval.base import BaseLM
 
-def _get_dtype(
-    dtype: Union[str, torch.dtype]
-) -> torch.dtype:
+def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
     """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
     if isinstance(dtype, str) and dtype != "auto":
         # Convert `str` args torch dtype: `float16` -> `torch.float16`
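The body of `_get_dtype` is truncated in this hunk; per the docstring and comment above, the string branch resolves the name as an attribute of the `torch` module. A sketch consistent with that (an assumed completion, not quoted from the diff):

```python
import torch
from typing import Union

def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Convert `dtype` from str to torch.dtype when possible; pass "auto" through."""
    if isinstance(dtype, str) and dtype != "auto":
        # "float16" -> torch.float16, "bfloat16" -> torch.bfloat16, ...
        return getattr(torch, dtype)
    return dtype

assert _get_dtype("bfloat16") is torch.bfloat16
assert _get_dtype(torch.float32) is torch.float32
```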
@@ -33,11 +31,10 @@ class HFLM(BaseLM):
         max_length=None,
         load_in_8bit: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
-        dtype: Optional[Union[str, torch.dtype]]="auto",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
     ):
         super().__init__()
 
         # Initialize model
         if isinstance(pretrained, transformers.PreTrainedModel):
             self.model = pretrained
@@ -45,12 +42,8 @@ class HFLM(BaseLM):
             if tokenizer:
                 assert isinstance(
-                    tokenizer,
-                    transformers.PreTrainedTokenizer
-                ) or isinstance(
-                    tokenizer,
-                    transformers.PreTrainedTokenizerFast
-                )
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                 self.tokenizer = tokenizer
             else:
                 # Get tokenizer
@@ -66,7 +59,8 @@ class HFLM(BaseLM):
         # Initialize device
         assert isinstance(device, str)
         device_list = set(
-            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+            ["cuda", "cpu"]
+            + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
         )
         if device and device in device_list:
             self._device = torch.device(device)
@@ -97,7 +91,9 @@ class HFLM(BaseLM):
             )
         else:
-            raise TypeError('Parameter pretrained should be of type str or transformers.PreTrainedModel')
+            raise TypeError(
+                "Parameter pretrained should be of type str or transformers.PreTrainedModel"
+            )
 
         self.model.eval()
@@ -136,7 +132,6 @@ class HFLM(BaseLM):
             return self.tokenizer.model_max_length
         return self._DEFAULT_MAX_LENGTH
 
     @property
     def max_gen_toks(self):
         return 256
@@ -171,8 +166,10 @@ class HFLM(BaseLM):
     def _model_generate(self, context, max_length, eos_token_id):
         generation_kwargs = {"do_sample": False, "max_length": max_length}
         if eos_token_id is not None:
-            generation_kwargs['eos_token_id'] = eos_token_id
-            generation_kwargs['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
+            generation_kwargs["eos_token_id"] = eos_token_id
+            generation_kwargs[
+                "pad_token_id"
+            ] = eos_token_id  # setting eos_token_id as pad token
         return self.model.generate(context, **generation_kwargs)
...
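Note: passing `pad_token_id=eos_token_id` matters for models such as GPT-2 that define no pad token; without it, `generate` logs a pad-token fallback warning on every call. Roughly (illustrative values, standard `transformers` API):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

context = tokenizer("The capital of France is", return_tensors="pt").input_ids
out = model.generate(
    context,
    do_sample=False,
    max_length=16,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # suppress the pad-token fallback warning
)
print(tokenizer.decode(out[0]))
```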
@@ -266,7 +266,9 @@ class HuggingFaceAutoLM(BaseLM):
             try:
                 self.model.to(self._device)
             except:
-                print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
+                print(
+                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+                )
 
     def _create_auto_model(
         self,
@@ -292,7 +294,9 @@ class HuggingFaceAutoLM(BaseLM):
         """Returns a pre-trained pytorch model from a pre-trained model configuration."""
         if not quantized:
             if load_in_4bit:
-                assert transformers.__version__ >= "4.30.0", "load_in_4bit requires transformers >= 4.30.0"
+                assert (
+                    transformers.__version__ >= "4.30.0"
+                ), "load_in_4bit requires transformers >= 4.30.0"
             model_kwargs = {}
             if transformers.__version__ >= "4.30.0":
                 model_kwargs["load_in_4bit"] = load_in_4bit
@@ -300,9 +304,13 @@ class HuggingFaceAutoLM(BaseLM):
                 if bnb_4bit_quant_type:
                     model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
                 if bnb_4bit_compute_dtype:
-                    model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(bnb_4bit_compute_dtype)
+                    model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(
+                        bnb_4bit_compute_dtype
+                    )
                 if bnb_4bit_use_double_quant:
-                    model_kwargs["bnb_4bit_use_double_quant"] = bnb_4bit_use_double_quant
+                    model_kwargs[
+                        "bnb_4bit_use_double_quant"
+                    ] = bnb_4bit_use_double_quant
             model = self.AUTO_MODEL_CLASS.from_pretrained(
                 pretrained,
                 revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -317,13 +325,16 @@ class HuggingFaceAutoLM(BaseLM):
             )
         else:
             from auto_gptq import AutoGPTQForCausalLM
+
             model = AutoGPTQForCausalLM.from_quantized(
                 pretrained,
                 model_basename=None if quantized == True else Path(quantized).stem,
                 device_map=device_map,
                 max_memory=max_memory,
                 trust_remote_code=trust_remote_code,
-                use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
+                use_safetensors=True
+                if quantized == True
+                else quantized.endswith(".safetensors"),
                 use_triton=gptq_use_triton,
                 warmup_triton=gptq_use_triton,
                 inject_fused_attention=inject_fused_attention,
...
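Note: `quantized` is either `True` (auto-discover the checkpoint) or a filename; the two reformatted ternaries encode both shapes. A condensed restatement of that logic:

```python
from pathlib import Path

def gptq_load_kwargs(quantized):
    """quantized is True or a checkpoint name such as "model.safetensors"."""
    return {
        "model_basename": None if quantized is True else Path(quantized).stem,
        "use_safetensors": True
        if quantized is True
        else str(quantized).endswith(".safetensors"),
    }

# gptq_load_kwargs(True) -> {"model_basename": None, "use_safetensors": True}
# gptq_load_kwargs("model.safetensors") -> {"model_basename": "model", "use_safetensors": True}
```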
@@ -330,11 +330,11 @@ TASK_REGISTRY = {
     "csatqa_rch": csatqa.RCH,
     "csatqa_li": csatqa.LI,
     "haerae_hi": haerae.HI,
-    "haerae_kgk":haerae.KGK,
-    "haerae_lw":haerae.LW,
-    "haerae_rc":haerae.RC,
-    "haerae_rw":haerae.RW,
-    "haerae_sn":haerae.SN,
+    "haerae_kgk": haerae.KGK,
+    "haerae_lw": haerae.LW,
+    "haerae_rc": haerae.RC,
+    "haerae_rw": haerae.RW,
+    "haerae_sn": haerae.SN,
-    # Requires manual download
+    # Requires manual download of data.
     # "storycloze_2016": storycloze.StoryCloze2016,
...
@@ -16,6 +16,7 @@ _CITATION = """
 }
 """
 
+
 class Babi(Task):
     VERSION = 0
     DATASET_PATH = "Muennighoff/babi"
@@ -43,9 +44,7 @@ class Babi(Task):
         return self.dataset["test"]
 
     def doc_to_text(self, doc):
-        return (
-            doc['passage'] + doc['question']
-        )
+        return doc["passage"] + doc["question"]
 
     def should_decontaminate(self):
         return False  # TODO Necessary?
@@ -54,7 +53,7 @@ class Babi(Task):
         return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
 
     def doc_to_target(self, doc):
-        return " " + doc['answer']
+        return " " + doc["answer"]
 
     def construct_requests(self, doc, ctx):
         """Uses RequestFactory to construct Requests and returns an iterable of
...
@@ -21,58 +21,58 @@ _CITATION = """
 SUBJECTS = {
-    "computer_network":"计算机网络",
-    "operating_system":"操作系统",
-    "computer_architecture":"计算机组成",
-    "college_programming":"大学编程",
-    "college_physics":"大学物理",
-    "college_chemistry":"大学化学",
-    "advanced_mathematics":"高等数学",
-    "probability_and_statistics":"概率统计",
-    "discrete_mathematics":"离散数学",
-    "electrical_engineer":"注册电气工程师",
-    "metrology_engineer":"注册计量师",
-    "high_school_mathematics":"高中数学",
-    "high_school_physics":"高中物理",
-    "high_school_chemistry":"高中化学",
-    "high_school_biology":"高中生物",
-    "middle_school_mathematics":"初中数学",
-    "middle_school_biology":"初中生物",
-    "middle_school_physics":"初中物理",
-    "middle_school_chemistry":"初中化学",
-    "veterinary_medicine":"兽医学",
-    "college_economics":"大学经济学",
-    "business_administration":"工商管理",
-    "marxism":"马克思主义基本原理",
-    "mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论",
-    "education_science":"教育学",
-    "teacher_qualification":"教师资格",
-    "high_school_politics":"高中政治",
-    "high_school_geography":"高中地理",
-    "middle_school_politics":"初中政治",
-    "middle_school_geography":"初中地理",
-    "modern_chinese_history":"近代史纲要",
-    "ideological_and_moral_cultivation":"思想道德修养与法律基础",
-    "logic":"逻辑学",
-    "law":"法学",
-    "chinese_language_and_literature":"中国语言文学",
-    "art_studies":"艺术学",
-    "professional_tour_guide":"导游资格",
-    "legal_professional":"法律职业资格",
-    "high_school_chinese":"高中语文",
-    "high_school_history":"高中历史",
-    "middle_school_history":"初中历史",
-    "civil_servant":"公务员",
-    "sports_science":"体育学",
-    "plant_protection":"植物保护",
-    "basic_medicine":"基础医学",
-    "clinical_medicine":"临床医学",
-    "urban_and_rural_planner":"注册城乡规划师",
-    "accountant":"注册会计师",
-    "fire_engineer":"注册消防工程师",
-    "environmental_impact_assessment_engineer":"环境影响评价工程师",
-    "tax_accountant":"税务师",
-    "physician":"医师资格"
+    "computer_network": "计算机网络",
+    "operating_system": "操作系统",
+    "computer_architecture": "计算机组成",
+    "college_programming": "大学编程",
+    "college_physics": "大学物理",
+    "college_chemistry": "大学化学",
+    "advanced_mathematics": "高等数学",
+    "probability_and_statistics": "概率统计",
+    "discrete_mathematics": "离散数学",
+    "electrical_engineer": "注册电气工程师",
+    "metrology_engineer": "注册计量师",
+    "high_school_mathematics": "高中数学",
+    "high_school_physics": "高中物理",
+    "high_school_chemistry": "高中化学",
+    "high_school_biology": "高中生物",
+    "middle_school_mathematics": "初中数学",
+    "middle_school_biology": "初中生物",
+    "middle_school_physics": "初中物理",
+    "middle_school_chemistry": "初中化学",
+    "veterinary_medicine": "兽医学",
+    "college_economics": "大学经济学",
+    "business_administration": "工商管理",
+    "marxism": "马克思主义基本原理",
+    "mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
+    "education_science": "教育学",
+    "teacher_qualification": "教师资格",
+    "high_school_politics": "高中政治",
+    "high_school_geography": "高中地理",
+    "middle_school_politics": "初中政治",
+    "middle_school_geography": "初中地理",
+    "modern_chinese_history": "近代史纲要",
+    "ideological_and_moral_cultivation": "思想道德修养与法律基础",
+    "logic": "逻辑学",
+    "law": "法学",
+    "chinese_language_and_literature": "中国语言文学",
+    "art_studies": "艺术学",
+    "professional_tour_guide": "导游资格",
+    "legal_professional": "法律职业资格",
+    "high_school_chinese": "高中语文",
+    "high_school_history": "高中历史",
+    "middle_school_history": "初中历史",
+    "civil_servant": "公务员",
+    "sports_science": "体育学",
+    "plant_protection": "植物保护",
+    "basic_medicine": "基础医学",
+    "clinical_medicine": "临床医学",
+    "urban_and_rural_planner": "注册城乡规划师",
+    "accountant": "注册会计师",
+    "fire_engineer": "注册消防工程师",
+    "environmental_impact_assessment_engineer": "环境影响评价工程师",
+    "tax_accountant": "税务师",
+    "physician": "医师资格",
 }
@@ -112,11 +112,11 @@ class CevalSubject(MultipleChoiceTask):
     def validation_docs(self):
         if self.has_validation_docs():
-            return map(self._process_doc,self.dataset["val"])
+            return map(self._process_doc, self.dataset["val"])
 
     def test_docs(self):
         if self.has_test_docs():
-            return map(self._process_doc,self.dataset["test"])
+            return map(self._process_doc, self.dataset["test"])
 
     def _format_subject(self, subject):
         words = subject.split("_")
@@ -124,7 +124,7 @@ class CevalSubject(MultipleChoiceTask):
     def fewshot_context(self, doc, num_fewshot, **kwargs):
         subject = self.DATASET_NAME
-        description= f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
+        description = f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
         kwargs["description"] = description
         return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -140,9 +140,7 @@ class CevalSubject(MultipleChoiceTask):
         """
         question = doc["question"].strip()
-        choices = "".join(
-            [f'{key}. {doc[key]}\n' for key in keys]
-        )
+        choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
         prompt = f"{question}\n{choices}答案:"
         return prompt
@@ -150,7 +148,7 @@ class CevalSubject(MultipleChoiceTask):
         return {
             "query": format_example(doc, keys),
             "choices": keys,
-            "gold": ord(doc["answer"])-ord("A"),
+            "gold": ord(doc["answer"]) - ord("A"),
         }
 
     def fewshot_examples(self, k, rnd):
...
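Note: the `gold` expression maps a letter answer onto an index into `keys`:

```python
keys = ["A", "B", "C", "D"]
answer = "C"
gold = ord(answer) - ord("A")  # 67 - 65 = 2
assert keys[gold] == "C"
```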
@@ -32,16 +32,16 @@ SUBJECTS = {
     "chinese_driving_rule": "中国驾驶规则",
     "chinese_food_culture": "中国饮食文化",
     "chinese_foreign_policy": "中国外交政策",
-    "chinese_history":"中国历史",
+    "chinese_history": "中国历史",
     "chinese_literature": "中国文学",
     "chinese_teacher_qualification": "中国教师资格",
     "clinical_knowledge": "临床知识",
-    "college_actuarial_science":"大学精算学",
-    "college_education":"大学教育学",
+    "college_actuarial_science": "大学精算学",
+    "college_education": "大学教育学",
     "college_engineering_hydrology": "大学工程水文学",
     "college_law": "大学法律",
     "college_mathematics": "大学数学",
-    "college_medical_statistics":"大学医学统计",
+    "college_medical_statistics": "大学医学统计",
     "college_medicine": "大学医学",
     "computer_science": "计算机科学",
     "computer_security": "计算机安全",
@@ -50,8 +50,8 @@ SUBJECTS = {
     "economics": "经济学",
     "education": "教育学",
     "electrical_engineering": "电气工程",
-    "elementary_chinese":"小学语文",
-    "elementary_commonsense":"小学常识",
+    "elementary_chinese": "小学语文",
+    "elementary_commonsense": "小学常识",
     "elementary_information_and_technology": "小学信息技术",
     "elementary_mathematics": "初等数学",
     "ethnology": "民族学",
@@ -82,12 +82,12 @@ SUBJECTS = {
     "professional_medicine": "专业医学",
     "professional_psychology": "专业心理学",
     "public_relations": "公共关系",
-    "security_study":"安全研究",
+    "security_study": "安全研究",
     "sociology": "社会学",
     "sports_science": "体育学",
     "traditional_chinese_medicine": "中医中药",
     "virology": "病毒学",
-    "world_history":"世界历史",
+    "world_history": "世界历史",
     "world_religions": "世界宗教",
 }
@@ -128,11 +128,11 @@ class CmmluSubject(MultipleChoiceTask):
     def validation_docs(self):
         if self.has_validation_docs():
-            return map(self._process_doc,self.dataset["dev"])
+            return map(self._process_doc, self.dataset["dev"])
 
     def test_docs(self):
         if self.has_test_docs():
-            return map(self._process_doc,self.dataset["test"])
+            return map(self._process_doc, self.dataset["test"])
 
     def _format_subject(self, subject):
         words = subject.split("_")
@@ -140,7 +140,7 @@ class CmmluSubject(MultipleChoiceTask):
     def fewshot_context(self, doc, num_fewshot, **kwargs):
         subject = self.DATASET_NAME
-        description= f"以下是关于{SUBJECTS[subject]}的单项选择题,请直接给出正确答案的选项。"
+        description = f"以下是关于{SUBJECTS[subject]}的单项选择题,请直接给出正确答案的选项。"
         kwargs["description"] = description
         return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -156,9 +156,7 @@ class CmmluSubject(MultipleChoiceTask):
         """
         question = doc["Question"].strip()
-        choices = "".join(
-            [f'{key}. {doc[key]}\n' for key in keys]
-        )
+        choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
         prompt = f"{question}\n{choices}答案:"
         return prompt
@@ -166,7 +164,7 @@ class CmmluSubject(MultipleChoiceTask):
         return {
             "query": format_example(doc, keys),
             "choices": keys,
-            "gold": ord(doc["Answer"])-ord("A"),
+            "gold": ord(doc["Answer"]) - ord("A"),
         }
 
     def fewshot_examples(self, k, rnd):
...
@@ -25,11 +25,17 @@ class CSATQA(MultipleChoiceTask):
 (1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
 ### Answer: 주어진 문제의 정답은"""
-        choices = [doc["option#1"], doc["option#2"], doc["option#3"], doc["option#4"], doc["option#5"]]
+        choices = [
+            doc["option#1"],
+            doc["option#2"],
+            doc["option#3"],
+            doc["option#4"],
+            doc["option#5"],
+        ]
         out_doc = {
             "question": instruction,
-            "choices": ["(1)", "(2)","(3)","(4)","(5)"],
-            "gold": int(doc['gold'])-1,
+            "choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
+            "gold": int(doc["gold"]) - 1,
         }
         return out_doc
@@ -41,17 +47,22 @@ class CSATQA(MultipleChoiceTask):
 class WR(CSATQA):
     DATASET_NAME = "WR"
 
+
 class GR(CSATQA):
     DATASET_NAME = "GR"
 
+
 class RCS(CSATQA):
     DATASET_NAME = "RCS"
 
+
 class RCSS(CSATQA):
     DATASET_NAME = "RCSS"
 
+
 class RCH(CSATQA):
     DATASET_NAME = "RCH"
 
+
 class LI(CSATQA):
     DATASET_NAME = "LI"
@@ -24,7 +24,7 @@ class Haerae(MultipleChoiceTask):
         out_doc = {
             "query": doc["query"],
             "choices": choices,
-            "gold": int(doc['gold'])-1,
+            "gold": int(doc["gold"]) - 1,
         }
         return out_doc
...
@@ -86,10 +86,10 @@ class NQOpen(Task):
     def _normalize_answer(self, text):
         # Lowercase and remove punctuation, strip whitespace
-        text = text.strip().lower().translate(str.maketrans('', '', string.punctuation))
+        text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
         # Remove articles, resulting in duplicate whitespace
-        text = regex.sub(r'\b(a|an|the)\b', ' ', text)
+        text = regex.sub(r"\b(a|an|the)\b", " ", text)
         # Remove duplicate whitespace
         text = " ".join(text.split())
@@ -109,9 +109,7 @@ class NQOpen(Task):
         continuation = self._normalize_answer(results[0])
         answers = [self._normalize_answer(answer) for answer in doc["answer"]]
-        return {
-            "em": float(continuation in answers)
-        }
+        return {"em": float(continuation in answers)}
 
     def aggregation(self):
         """
...
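Note: the normalization makes exact match robust to case, punctuation, articles, and spacing:

```python
import re
import string
# Sketch of the same pipeline; the task itself uses the third-party `regex`
# module, but the stdlib `re` behaves identically for this pattern.

def normalize_answer(text):
    text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)  # drop articles
    return " ".join(text.split())  # squeeze whitespace

assert normalize_answer("The Eiffel Tower!") == "eiffel tower"
assert normalize_answer("eiffel   tower") == "eiffel tower"
```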
@@ -72,9 +72,14 @@ def _download_metric():
     import os
     import shutil
     from huggingface_hub import hf_hub_download
-    scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py")
+
+    scrolls_metric_path = hf_hub_download(
+        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+    )
     updated_scrolls_metric_path = (
-        os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
+        os.path.dirname(scrolls_metric_path)
+        + os.path.basename(scrolls_metric_path).replace(".", "_")
+        + ".py"
     )
     shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
     return updated_scrolls_metric_path
@@ -92,7 +97,7 @@ def _process_doc_prepended_question(doc):
         "input": input,
         "outputs": doc["outputs"],
         "question": input[0:split],
-        "text": input[split + 2:]
+        "text": input[split + 2 :],
     }
@@ -102,7 +107,9 @@ def _drop_duplicates_in_input(untokenized_dataset):
     indices_to_keep = []
     id_to_idx = {}
     outputs = []
-    for i, (id_, output) in enumerate(zip(untokenized_dataset["id"], untokenized_dataset["output"])):
+    for i, (id_, output) in enumerate(
+        zip(untokenized_dataset["id"], untokenized_dataset["output"])
+    ):
         if id_ in id_to_idx:
             outputs[id_to_idx[id_]].append(output)
             continue
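Note: this loop keeps the first row seen for each `id` and folds the outputs of later duplicates into that row's list. A self-contained restatement of the pattern (the function's tail falls outside this hunk, so the bookkeeping shown for `id_to_idx` and `indices_to_keep` is a reasonable completion rather than a quote):

```python
def drop_duplicates(ids, outputs_in):
    indices_to_keep, id_to_idx, outputs = [], {}, []
    for i, (id_, output) in enumerate(zip(ids, outputs_in)):
        if id_ in id_to_idx:
            outputs[id_to_idx[id_]].append(output)  # merge into first occurrence
            continue
        id_to_idx[id_] = len(outputs)
        indices_to_keep.append(i)
        outputs.append([output])
    return indices_to_keep, outputs

# drop_duplicates(["a", "b", "a"], [1, 2, 3]) -> ([0, 1], [[1, 3], [2]])
```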
@@ -119,9 +126,11 @@ def _num_cpu_cores():
     # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
     try:
         import psutil
+
         return psutil.cpu_count(logical=False)
     except ImportError:
         import os
+
         return len(os.sched_getaffinity(0))
@@ -135,7 +144,11 @@ class _SCROLLSTask(Task):
     def __init__(self, no_metric=False):
         super().__init__()
-        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) if not no_metric else None
+        self.metric = (
+            load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            if not no_metric
+            else None
+        )
 
     def has_training_docs(self):
         return True
@@ -176,7 +189,10 @@ class _SCROLLSTask(Task):
         that are less than `max_tokens` when tokenized by each tokenizer
         """
-        tokenizers = [AutoTokenizer.from_pretrained(tokenizer) for tokenizer in self.PRUNE_TOKENIZERS]
+        tokenizers = [
+            AutoTokenizer.from_pretrained(tokenizer)
+            for tokenizer in self.PRUNE_TOKENIZERS
+        ]
         cache = {}
 
         def _filter(sample):
@@ -210,18 +226,21 @@ class _SCROLLSTask(Task):
     def _make_compute_metrics(self, value):
         def compute_metrics(samples):
             predictions, references = zip(*samples)  # unzip, if you will
-            computed = self.metric.compute(predictions=predictions, references=references)
+            computed = self.metric.compute(
+                predictions=predictions, references=references
+            )
             return computed[value]
 
         return compute_metrics
 
     def aggregation(self):
         return {
-            key: self._make_compute_metrics(value) for key, value in self._scrolls_metrics().items()
+            key: self._make_compute_metrics(value)
+            for key, value in self._scrolls_metrics().items()
        }
 
 
 class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
     def __init__(self):
         super().__init__(no_metric=True)
@@ -229,18 +248,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
         return None
 
     def aggregation(self):
-        return {
-            "em": mean,
-            "acc": mean,
-            "acc_norm": mean
-        }
+        return {"em": mean, "acc": mean, "acc_norm": mean}
 
     def higher_is_better(self):
-        return {
-            "em": True,
-            "acc": True,
-            "acc_norm": True
-        }
+        return {"em": True, "acc": True, "acc_norm": True}
 
     def process_results(self, doc, results):
         gold = doc["gold"]
@@ -264,22 +275,25 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
 class _SCROLLSSummaryTask(_SCROLLSTask):
     def _process_doc(self, doc):
         return [doc]
 
     def _scrolls_metrics(self):
-        return {"rouge1": "rouge/rouge1", "rouge2": "rouge/rouge2", "rougeL": "rouge/rougeL"}
+        return {
+            "rouge1": "rouge/rouge1",
+            "rouge2": "rouge/rouge2",
+            "rougeL": "rouge/rougeL",
+        }
 
     def process_results(self, doc, results):
         return {
             "rouge1": (results[0], doc["outputs"]),
             "rouge2": (results[0], doc["outputs"]),
-            "rougeL": (results[0], doc["outputs"])
+            "rougeL": (results[0], doc["outputs"]),
         }
 
     def construct_requests(self, doc, ctx):
-        return [rf.greedy_until(ctx, {'until': ["\n"]})]
+        return [rf.greedy_until(ctx, {"until": ["\n"]})]
 
     def doc_to_text(self, doc):
         return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@@ -294,8 +308,12 @@ class Qasper(_SCROLLSTask):
     def _process_doc(self, doc):
         doc = _process_doc_prepended_question(doc)
-        doc["is_yes_no"] = reduce(lambda prev, cur: prev and squad_metrics.normalize_answer(cur)
-                                  in ["yes", "no"], doc["outputs"], True)
+        doc["is_yes_no"] = reduce(
+            lambda prev, cur: prev
+            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
+            doc["outputs"],
+            True,
+        )
         return [doc]
 
     def _scrolls_metrics(self):
@@ -308,9 +326,7 @@ class Qasper(_SCROLLSTask):
             prediction = "Unanswerable"
         else:
             prediction = results[0]
-        return {
-            "f1": (prediction, doc["outputs"])
-        }
+        return {"f1": (prediction, doc["outputs"])}
 
     def construct_requests(self, doc, ctx):
         if doc["is_yes_no"]:
@@ -318,7 +334,7 @@ class Qasper(_SCROLLSTask):
             ll_no, _ = rf.loglikelihood(ctx, " no")
             return [ll_yes, ll_no]
         else:
-            return [rf.greedy_until(ctx, {'until': ["\n"]})]
+            return [rf.greedy_until(ctx, {"until": ["\n"]})]
 
 
 class QuALITY(_SCROLLSMultipleChoiceTask):
@@ -340,8 +356,10 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
         choices_text = doc["text"][:split]
 
         doc["text"] = doc["text"][split:].strip()
-        doc["choices"] = [QuALITY._normalize_answer(choice) for choice in re.split(
-            QuALITY._multiple_choice_pattern, choices_text)[1:]]
+        doc["choices"] = [
+            QuALITY._normalize_answer(choice)
+            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
+        ]
         doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
 
         return [doc]
@@ -368,12 +386,10 @@ class NarrativeQA(_SCROLLSTask):
         return self._process_doc(doc)[0]["text"]
 
     def process_results(self, doc, results):
-        return {
-            "f1": (results[0], doc["outputs"])
-        }
+        return {"f1": (results[0], doc["outputs"])}
 
     def construct_requests(self, doc, ctx):
-        return [rf.greedy_until(ctx, {'until': ["\n"]})]
+        return [rf.greedy_until(ctx, {"until": ["\n"]})]
 
 
 class ContractNLI(_SCROLLSMultipleChoiceTask):
@@ -439,5 +455,5 @@ def construct_tasks():
         "scrolls_contractnli": ContractNLI,
         "scrolls_govreport": GovReport,
         "scrolls_summscreenfd": SummScreenFD,
-        "scrolls_qmsum": QMSum
+        "scrolls_qmsum": QMSum,
     }
@@ -76,8 +76,16 @@ class TriviaQA(Task):
         return continuation
 
     def process_results(self, doc, results):
-        continuation = results[0].strip().lower().translate(str.maketrans('', '', string.punctuation))
-        list_of_candidates = [alias.lower().translate(str.maketrans('', '', string.punctuation)) for alias in doc["answer"]["aliases"]]
+        continuation = (
+            results[0]
+            .strip()
+            .lower()
+            .translate(str.maketrans("", "", string.punctuation))
+        )
+        list_of_candidates = [
+            alias.lower().translate(str.maketrans("", "", string.punctuation))
+            for alias in doc["answer"]["aliases"]
+        ]
         return {"em": float(continuation in list_of_candidates)}
 
     def aggregation(self):
...
@@ -12,17 +12,27 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
+    )
     parser.add_argument("--provide_description", action="store_true")
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=str, default=None)
-    parser.add_argument("--max_batch_size", type=int, default=None,
-                        help="Maximal batch size to try with --batch_size auto")
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=float, default=None,
-                        help="Limit the number of examples per task. "
-                        "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
     parser.add_argument("--data_sampling", type=float, default=None)
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -77,7 +87,9 @@ def main():
     print(dumped)
 
     if args.output_path:
-        os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+        dirname = os.path.dirname(args.output_path)
+        if dirname:
+            os.makedirs(dirname, exist_ok=True)
         with open(args.output_path, "w") as f:
             f.write(dumped)
...
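This hunk is the bug the PR title refers to: for a bare filename such as `results.json`, `os.path.dirname` returns the empty string, and `os.makedirs("", exist_ok=True)` raises `FileNotFoundError`, so writing results into the current working directory used to crash. Guarding on a non-empty dirname fixes it:

```python
import os

for path in ("results.json", "out/results.json"):
    dirname = os.path.dirname(path)  # "" for a bare filename in the CWD
    if dirname:  # os.makedirs("") raises FileNotFoundError
        os.makedirs(dirname, exist_ok=True)
    with open(path, "w") as f:
        f.write("{}")
```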
@@ -9,7 +9,12 @@ from lm_eval import tasks, utils
 seq2seq_models = ["google/flan-t5-small"]
-causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
+causal_models = [
+    "gpt2",
+    "facebook/opt-125m",
+    "EleutherAI/gpt-neo-125m",
+    "EleutherAI/pythia-160m",
+]
 model_names = seq2seq_models + causal_models
@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
     results = {}
 
     for model in args.models:
-        model_type = "hf-causal-experimental" if model in causal_models \
-            else "hf-seq2seq" if model in seq2seq_models else args.model
+        model_type = (
+            "hf-causal-experimental"
+            if model in causal_models
+            else "hf-seq2seq"
+            if model in seq2seq_models
+            else args.model
+        )
         model_args = f"pretrained={model},{args.model_args}"
         # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
-            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        tasks = (
+            args.tasks
+            if model in causal_models or model_type == "hf-causal-experimental"
+            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        )
         # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
-            else 64 if args.batch_size == "auto" else args.batch_size
-        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
-        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
-            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
-            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
-        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+        batch_size = (
+            args.batch_size
+            if model in causal_models or model_type == "hf-causal-experimental"
+            else 64
+            if args.batch_size == "auto"
+            else args.batch_size
+        )
+        output_path = (
+            f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+        )
+        command = (
+            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+        )
+        print(
+            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
+        )
 
         ret = os.system(command)
@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
 def main():
     args = parse_args()
 
-    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
+    args.branches = (
+        args.branches.split(",") if type(args.branches) == str else args.branches
+    )
     args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
+    args.tasks = (
+        tasks.ALL_TASKS
+        if args.tasks == "all_tasks"
+        else utils.pattern_match(
+            args.tasks.split(",") if type(args.tasks) == str else args.tasks,
+            tasks.ALL_TASKS,
+        )
+    )
 
     global initial_branch
-    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+    initial_branch = (
+        subprocess.check_output("git branch --show-current", shell=True)
+        .decode("ascii")
+        .strip()
+    )
 
     # TODO: implement proper timing for each task
     # TODO: reduce IO by sharing tasks between models?
@@ -132,10 +168,16 @@ def main():
     print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
     print(f"|--|{'--|' * len(args.models)}")
     for task in args.tasks:
-        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+        print(
+            f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
+        )
         for branch, branch_results, branch_runtime in runs:
-            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
-            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+            print(
+                f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
+            )
+            print(
+                f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
+            )
 
     print("")
     print("|branch|runtime|%|")
...
@@ -13,9 +13,7 @@ setuptools.setup(
     long_description_content_type="text/markdown",
     url="https://github.com/EleutherAI/lm-evaluation-harness",
     packages=setuptools.find_packages(),
-    package_data={
-        "lm_eval": ["**/*.json"]
-    },
+    package_data={"lm_eval": ["**/*.json"]},
     include_package_data=True,
     classifiers=[
         "Development Status :: 3 - Alpha",
...