Merge pull request #876 from jonabur/output_bugfix

fix bug with output path in CWD

Merge pull request #876 from jonabur/output_bugfix
fix bug with output path in CWD
2c18e367 · Lintang Sutawika · GitHub · 00209e10 · 93cbffa5 · 2c18e367
Unverified commit 2c18e367 — authored Sep 21, 2023 by Lintang Sutawika; committed by GitHub on Sep 21, 2023
18 changed files
--- a/Dockerfile
+++ b/Dockerfile
 FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04


-### Install python 3.10 and set it as default python interpreter 
+### Install python 3.10 and set it as default python interpreter
 RUN  apt update &&  apt install software-properties-common -y && \
 add-apt-repository ppa:deadsnakes/ppa -y &&  apt update && \
 apt install curl -y && \
@@ -13,7 +13,7 @@ curl -Ss https://bootstrap.pypa.io/get-pip.py | python3.10 && \
 apt-get clean && rm -rf /var/lib/apt/lists/


-### Copy files 
+### Copy files
 COPY . /lm-evaluation-harness/

 ### Set working directory
@@ -22,9 +22,6 @@ WORKDIR /lm-evaluation-harness


 ### Install requirements
-RUN pip install --no-cache-dir -e . 
+RUN pip install --no-cache-dir -e .
 ### Run bash
 CMD ["/bin/bash"]
-
-
-
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ We’d like your help to test it out! you can help by:
 1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
 2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.

-If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with: 
+If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
 - A shell command to run the task in the `master` branch, and what the score is
 - A shell command to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.


--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -309,7 +309,9 @@ class BaseLM(LM):
            if override_bs is not None
            else 0,
            fn=_batch_scheduler
-            if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
            else None,
        ):
            inps = []
@@ -375,7 +377,9 @@ class BaseLM(LM):

                # Slice to original seq length
                contlen = len(cont_toks)
-                inplen = inplen + (logits.shape[0] - padding_length) # if "virtual tokens" (from prompt tuning) are added, inplen is larger
+                inplen = inplen + (
+                    logits.shape[0] - padding_length
+                )  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
                logits = logits[inplen - contlen : inplen].unsqueeze(
                    0
                )  # [1, seq, vocab]

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -74,14 +74,19 @@ def simple_evaluate(
        if model_args is None:
            model_args = ""
        lm = lm_eval.models.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
        )
    elif isinstance(model, transformers.PreTrainedModel):
        lm = lm_eval.models.get_model("hf-causal")(
-                pretrained=model,
-                batch_size=batch_size,
-                max_batch_size=max_batch_size,
-                )
+            pretrained=model,
+            batch_size=batch_size,
+            max_batch_size=max_batch_size,
+        )
        no_cache = True
    else:
        assert isinstance(model, lm_eval.base.LM)
@@ -125,7 +130,9 @@ def simple_evaluate(
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
-        "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
+        "batch_sizes": list(lm.batch_sizes.values())
+        if hasattr(lm, "batch_sizes")
+        else [],
        "device": device,
        "no_cache": no_cache,
        "limit": limit,

--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -4,9 +4,7 @@ from typing import Optional, Union
 from lm_eval.base import BaseLM


-def _get_dtype(
-    dtype: Union[str, torch.dtype]
-) -> torch.dtype:
+def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
@@ -33,11 +31,10 @@ class HFLM(BaseLM):
        max_length=None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
-        dtype: Optional[Union[str, torch.dtype]]="auto",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
    ):
        super().__init__()

-
        # Initialize model
        if isinstance(pretrained, transformers.PreTrainedModel):
            self.model = pretrained
@@ -45,28 +42,25 @@ class HFLM(BaseLM):

            if tokenizer:
                assert isinstance(
-                        tokenizer,
-                        transformers.PreTrainedTokenizer
-                        ) or isinstance(
-                        tokenizer,
-                        transformers.PreTrainedTokenizerFast
-                        )
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                self.tokenizer = tokenizer
            else:
                # Get tokenizer
                model_name = self.model.name_or_path
                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                        model_name,
-                        revision=revision,
-                        trust_remote_code=trust_remote_code,
-                        )
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                )

        elif isinstance(pretrained, str):

            # Initialize device
            assert isinstance(device, str)
            device_list = set(
-                ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                ["cuda", "cpu"]
+                + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
            )
            if device and device in device_list:
                self._device = torch.device(device)
@@ -83,21 +77,23 @@ class HFLM(BaseLM):

            # Initialize new model and tokenizer instances
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
-                    pretrained,
-                    load_in_8bit=load_in_8bit,
-                    low_cpu_mem_usage=low_cpu_mem_usage,
-                    revision=revision,
-                    torch_dtype=_get_dtype(dtype),
-                    trust_remote_code=trust_remote_code,
-                    ).to(self.device)
+                pretrained,
+                load_in_8bit=load_in_8bit,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+                revision=revision,
+                torch_dtype=_get_dtype(dtype),
+                trust_remote_code=trust_remote_code,
+            ).to(self.device)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    tokenizer if tokenizer else pretrained,
-                    revision=revision,
-                    trust_remote_code=trust_remote_code,
-                    )
+                tokenizer if tokenizer else pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+            )

        else:
-            raise TypeError('Parameter pretrained should be of type str or transformers.PreTrainedModel')
+            raise TypeError(
+                "Parameter pretrained should be of type str or transformers.PreTrainedModel"
+            )

        self.model.eval()

@@ -124,7 +120,7 @@ class HFLM(BaseLM):

    @property
    def max_length(self):
-        if self._max_length: # if max length manually set, return it
+        if self._max_length:  # if max length manually set, return it
            return self._max_length
        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
        for attr in seqlen_config_attrs:
@@ -136,7 +132,6 @@ class HFLM(BaseLM):
            return self.tokenizer.model_max_length
        return self._DEFAULT_MAX_LENGTH

-
    @property
    def max_gen_toks(self):
        return 256
@@ -171,8 +166,10 @@ class HFLM(BaseLM):
    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
-            generation_kwargs['eos_token_id'] = eos_token_id
-            generation_kwargs['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
+            generation_kwargs["eos_token_id"] = eos_token_id
+            generation_kwargs[
+                "pad_token_id"
+            ] = eos_token_id  # setting eos_token_id as pad token
        return self.model.generate(context, **generation_kwargs)



--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -266,7 +266,9 @@ class HuggingFaceAutoLM(BaseLM):
            try:
                self.model.to(self._device)
            except:
-                print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
+                print(
+                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+                )

    def _create_auto_model(
        self,
@@ -292,7 +294,9 @@ class HuggingFaceAutoLM(BaseLM):
        """Returns a pre-trained pytorch model from a pre-trained model configuration."""
        if not quantized:
            if load_in_4bit:
-                assert transformers.__version__ >= "4.30.0", "load_in_4bit requires transformers >= 4.30.0"
+                assert (
+                    transformers.__version__ >= "4.30.0"
+                ), "load_in_4bit requires transformers >= 4.30.0"
            model_kwargs = {}
            if transformers.__version__ >= "4.30.0":
                model_kwargs["load_in_4bit"] = load_in_4bit
@@ -300,9 +304,13 @@ class HuggingFaceAutoLM(BaseLM):
                    if bnb_4bit_quant_type:
                        model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
                    if bnb_4bit_compute_dtype:
-                        model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(bnb_4bit_compute_dtype)
+                        model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(
+                            bnb_4bit_compute_dtype
+                        )
                    if bnb_4bit_use_double_quant:
-                        model_kwargs["bnb_4bit_use_double_quant"] = bnb_4bit_use_double_quant
+                        model_kwargs[
+                            "bnb_4bit_use_double_quant"
+                        ] = bnb_4bit_use_double_quant
            model = self.AUTO_MODEL_CLASS.from_pretrained(
                pretrained,
                revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -317,13 +325,16 @@ class HuggingFaceAutoLM(BaseLM):
            )
        else:
            from auto_gptq import AutoGPTQForCausalLM
+
            model = AutoGPTQForCausalLM.from_quantized(
                pretrained,
                model_basename=None if quantized == True else Path(quantized).stem,
                device_map=device_map,
                max_memory=max_memory,
                trust_remote_code=trust_remote_code,
-                use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
+                use_safetensors=True
+                if quantized == True
+                else quantized.endswith(".safetensors"),
                use_triton=gptq_use_triton,
                warmup_triton=gptq_use_triton,
                inject_fused_attention=inject_fused_attention,

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -330,11 +330,11 @@ TASK_REGISTRY = {
    "csatqa_rch": csatqa.RCH,
    "csatqa_li": csatqa.LI,
    "haerae_hi": haerae.HI,
-    "haerae_kgk":haerae.KGK,
-    "haerae_lw":haerae.LW,
-    "haerae_rc":haerae.RC,
-    "haerae_rw":haerae.RW,
-    "haerae_sn":haerae.SN,
+    "haerae_kgk": haerae.KGK,
+    "haerae_lw": haerae.LW,
+    "haerae_rc": haerae.RC,
+    "haerae_rw": haerae.RW,
+    "haerae_sn": haerae.SN,
    # Requires manual download
    # Requires manual download of data.
    # "storycloze_2016": storycloze.StoryCloze2016,

--- a/lm_eval/tasks/babi.py
+++ b/lm_eval/tasks/babi.py
@@ -16,6 +16,7 @@ _CITATION = """
 }
 """

+
 class Babi(Task):
    VERSION = 0
    DATASET_PATH = "Muennighoff/babi"
@@ -43,18 +44,16 @@ class Babi(Task):
            return self.dataset["test"]

    def doc_to_text(self, doc):
-        return (
-            doc['passage'] + doc['question']
-        )
+        return doc["passage"] + doc["question"]

    def should_decontaminate(self):
-        return False # TODO Necessary?
+        return False  # TODO Necessary?

    def doc_to_decontamination_query(self, doc):
        return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
-        return " " + doc['answer']
+        return " " + doc["answer"]

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of

--- a/lm_eval/tasks/ceval.py
+++ b/lm_eval/tasks/ceval.py
@@ -12,7 +12,7 @@ from lm_eval.base import MultipleChoiceTask

 _CITATION = """
 @article{huang2023ceval,
-    title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models}, 
+    title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
    author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
    journal={arXiv preprint arXiv:2305.08322},
    year={2023}
@@ -21,58 +21,58 @@ _CITATION = """


 SUBJECTS = {
-    "computer_network":"计算机网络",
-    "operating_system":"操作系统",
-    "computer_architecture":"计算机组成",
-    "college_programming":"大学编程",
-    "college_physics":"大学物理",
-    "college_chemistry":"大学化学",
-    "advanced_mathematics":"高等数学",
-    "probability_and_statistics":"概率统计",
-    "discrete_mathematics":"离散数学",
-    "electrical_engineer":"注册电气工程师",
-    "metrology_engineer":"注册计量师",
-    "high_school_mathematics":"高中数学",
-    "high_school_physics":"高中物理",
-    "high_school_chemistry":"高中化学",
-    "high_school_biology":"高中生物",
-    "middle_school_mathematics":"初中数学",
-    "middle_school_biology":"初中生物",
-    "middle_school_physics":"初中物理",
-    "middle_school_chemistry":"初中化学",
-    "veterinary_medicine":"兽医学",
-    "college_economics":"大学经济学",
-    "business_administration":"工商管理",
-    "marxism":"马克思主义基本原理",
-    "mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论",
-    "education_science":"教育学",
-    "teacher_qualification":"教师资格",
-    "high_school_politics":"高中政治",
-    "high_school_geography":"高中地理",
-    "middle_school_politics":"初中政治",
-    "middle_school_geography":"初中地理",
-    "modern_chinese_history":"近代史纲要",
-    "ideological_and_moral_cultivation":"思想道德修养与法律基础",
-    "logic":"逻辑学",
-    "law":"法学",
-    "chinese_language_and_literature":"中国语言文学",
-    "art_studies":"艺术学",
-    "professional_tour_guide":"导游资格",
-    "legal_professional":"法律职业资格",
-    "high_school_chinese":"高中语文",
-    "high_school_history":"高中历史",
-    "middle_school_history":"初中历史",
-    "civil_servant":"公务员",
-    "sports_science":"体育学",
-    "plant_protection":"植物保护",
-    "basic_medicine":"基础医学",
-    "clinical_medicine":"临床医学",
-    "urban_and_rural_planner":"注册城乡规划师",
-    "accountant":"注册会计师",
-    "fire_engineer":"注册消防工程师",
-    "environmental_impact_assessment_engineer":"环境影响评价工程师",
-    "tax_accountant":"税务师",
-    "physician":"医师资格"
+    "computer_network": "计算机网络",
+    "operating_system": "操作系统",
+    "computer_architecture": "计算机组成",
+    "college_programming": "大学编程",
+    "college_physics": "大学物理",
+    "college_chemistry": "大学化学",
+    "advanced_mathematics": "高等数学",
+    "probability_and_statistics": "概率统计",
+    "discrete_mathematics": "离散数学",
+    "electrical_engineer": "注册电气工程师",
+    "metrology_engineer": "注册计量师",
+    "high_school_mathematics": "高中数学",
+    "high_school_physics": "高中物理",
+    "high_school_chemistry": "高中化学",
+    "high_school_biology": "高中生物",
+    "middle_school_mathematics": "初中数学",
+    "middle_school_biology": "初中生物",
+    "middle_school_physics": "初中物理",
+    "middle_school_chemistry": "初中化学",
+    "veterinary_medicine": "兽医学",
+    "college_economics": "大学经济学",
+    "business_administration": "工商管理",
+    "marxism": "马克思主义基本原理",
+    "mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
+    "education_science": "教育学",
+    "teacher_qualification": "教师资格",
+    "high_school_politics": "高中政治",
+    "high_school_geography": "高中地理",
+    "middle_school_politics": "初中政治",
+    "middle_school_geography": "初中地理",
+    "modern_chinese_history": "近代史纲要",
+    "ideological_and_moral_cultivation": "思想道德修养与法律基础",
+    "logic": "逻辑学",
+    "law": "法学",
+    "chinese_language_and_literature": "中国语言文学",
+    "art_studies": "艺术学",
+    "professional_tour_guide": "导游资格",
+    "legal_professional": "法律职业资格",
+    "high_school_chinese": "高中语文",
+    "high_school_history": "高中历史",
+    "middle_school_history": "初中历史",
+    "civil_servant": "公务员",
+    "sports_science": "体育学",
+    "plant_protection": "植物保护",
+    "basic_medicine": "基础医学",
+    "clinical_medicine": "临床医学",
+    "urban_and_rural_planner": "注册城乡规划师",
+    "accountant": "注册会计师",
+    "fire_engineer": "注册消防工程师",
+    "environmental_impact_assessment_engineer": "环境影响评价工程师",
+    "tax_accountant": "税务师",
+    "physician": "医师资格",
 }


@@ -112,11 +112,11 @@ class CevalSubject(MultipleChoiceTask):

    def validation_docs(self):
        if self.has_validation_docs():
-            return map(self._process_doc,self.dataset["val"])
+            return map(self._process_doc, self.dataset["val"])

    def test_docs(self):
        if self.has_test_docs():
-            return map(self._process_doc,self.dataset["test"])
+            return map(self._process_doc, self.dataset["test"])

    def _format_subject(self, subject):
        words = subject.split("_")
@@ -124,7 +124,7 @@ class CevalSubject(MultipleChoiceTask):

    def fewshot_context(self, doc, num_fewshot, **kwargs):
        subject = self.DATASET_NAME
-        description= f"以下是中国关于{SUBJECTS[subject]}的单项选择题，请选出其中的正确答案。"
+        description = f"以下是中国关于{SUBJECTS[subject]}的单项选择题，请选出其中的正确答案。"
        kwargs["description"] = description
        return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)

@@ -140,9 +140,7 @@ class CevalSubject(MultipleChoiceTask):
            """

            question = doc["question"].strip()
-            choices = "".join(
-                [f'{key}. {doc[key]}\n' for key in keys]
-            )
+            choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
            prompt = f"{question}\n{choices}答案："
            return prompt

@@ -150,7 +148,7 @@ class CevalSubject(MultipleChoiceTask):
        return {
            "query": format_example(doc, keys),
            "choices": keys,
-            "gold": ord(doc["answer"])-ord("A"),
+            "gold": ord(doc["answer"]) - ord("A"),
        }

    def fewshot_examples(self, k, rnd):

--- a/lm_eval/tasks/cmmlu.py
+++ b/lm_eval/tasks/cmmlu.py
@@ -32,16 +32,16 @@ SUBJECTS = {
    "chinese_driving_rule": "中国驾驶规则",
    "chinese_food_culture": "中国饮食文化",
    "chinese_foreign_policy": "中国外交政策",
-    "chinese_history":"中国历史",
+    "chinese_history": "中国历史",
    "chinese_literature": "中国文学",
    "chinese_teacher_qualification": "中国教师资格",
    "clinical_knowledge": "临床知识",
-    "college_actuarial_science":"大学精算学",
-    "college_education":"大学教育学",
+    "college_actuarial_science": "大学精算学",
+    "college_education": "大学教育学",
    "college_engineering_hydrology": "大学工程水文学",
    "college_law": "大学法律",
    "college_mathematics": "大学数学",
-    "college_medical_statistics":"大学医学统计",
+    "college_medical_statistics": "大学医学统计",
    "college_medicine": "大学医学",
    "computer_science": "计算机科学",
    "computer_security": "计算机安全",
@@ -50,8 +50,8 @@ SUBJECTS = {
    "economics": "经济学",
    "education": "教育学",
    "electrical_engineering": "电气工程",
-    "elementary_chinese":"小学语文",
-    "elementary_commonsense":"小学常识",
+    "elementary_chinese": "小学语文",
+    "elementary_commonsense": "小学常识",
    "elementary_information_and_technology": "小学信息技术",
    "elementary_mathematics": "初等数学",
    "ethnology": "民族学",
@@ -82,12 +82,12 @@ SUBJECTS = {
    "professional_medicine": "专业医学",
    "professional_psychology": "专业心理学",
    "public_relations": "公共关系",
-    "security_study":"安全研究",
+    "security_study": "安全研究",
    "sociology": "社会学",
    "sports_science": "体育学",
    "traditional_chinese_medicine": "中医中药",
    "virology": "病毒学",
-    "world_history":"世界历史",
+    "world_history": "世界历史",
    "world_religions": "世界宗教",
 }

@@ -128,11 +128,11 @@ class CmmluSubject(MultipleChoiceTask):

    def validation_docs(self):
        if self.has_validation_docs():
-            return map(self._process_doc,self.dataset["dev"])
+            return map(self._process_doc, self.dataset["dev"])

    def test_docs(self):
        if self.has_test_docs():
-            return map(self._process_doc,self.dataset["test"])
+            return map(self._process_doc, self.dataset["test"])

    def _format_subject(self, subject):
        words = subject.split("_")
@@ -140,7 +140,7 @@ class CmmluSubject(MultipleChoiceTask):

    def fewshot_context(self, doc, num_fewshot, **kwargs):
        subject = self.DATASET_NAME
-        description= f"以下是关于{SUBJECTS[subject]}的单项选择题，请直接给出正确答案的选项。"
+        description = f"以下是关于{SUBJECTS[subject]}的单项选择题，请直接给出正确答案的选项。"
        kwargs["description"] = description
        return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)

@@ -156,9 +156,7 @@ class CmmluSubject(MultipleChoiceTask):
            """

            question = doc["Question"].strip()
-            choices = "".join(
-                [f'{key}. {doc[key]}\n' for key in keys]
-            )
+            choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
            prompt = f"{question}\n{choices}答案："
            return prompt

@@ -166,7 +164,7 @@ class CmmluSubject(MultipleChoiceTask):
        return {
            "query": format_example(doc, keys),
            "choices": keys,
-            "gold": ord(doc["Answer"])-ord("A"),
+            "gold": ord(doc["Answer"]) - ord("A"),
        }

    def fewshot_examples(self, k, rnd):

--- a/lm_eval/tasks/csatqa.py
+++ b/lm_eval/tasks/csatqa.py
@@ -16,7 +16,7 @@ class CSATQA(MultipleChoiceTask):

    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])
-    
+
    def _process_doc(self, doc):
        instruction = f"""다음을 읽고 정답으로 알맞은 것을 고르시요.
 ### Context: {doc["context"]}
@@ -25,11 +25,17 @@ class CSATQA(MultipleChoiceTask):
 (1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
 ### Answer: 주어진 문제의 정답은"""

-        choices = [doc["option#1"], doc["option#2"], doc["option#3"], doc["option#4"], doc["option#5"]]
+        choices = [
+            doc["option#1"],
+            doc["option#2"],
+            doc["option#3"],
+            doc["option#4"],
+            doc["option#5"],
+        ]
        out_doc = {
            "question": instruction,
-            "choices": ["(1)", "(2)","(3)","(4)","(5)"],
-            "gold": int(doc['gold'])-1,
+            "choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
+            "gold": int(doc["gold"]) - 1,
        }
        return out_doc

@@ -40,18 +46,23 @@ class CSATQA(MultipleChoiceTask):

 class WR(CSATQA):
    DATASET_NAME = "WR"
-    
+
+
 class GR(CSATQA):
    DATASET_NAME = "GR"

+
 class RCS(CSATQA):
    DATASET_NAME = "RCS"
-    
+
+
 class RCSS(CSATQA):
    DATASET_NAME = "RCSS"
-    
+
+
 class RCH(CSATQA):
    DATASET_NAME = "RCH"

+
 class LI(CSATQA):
    DATASET_NAME = "LI"
--- a/lm_eval/tasks/haerae.py
+++ b/lm_eval/tasks/haerae.py
@@ -16,7 +16,7 @@ class Haerae(MultipleChoiceTask):

    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])
-    
+
    def _process_doc(self, doc):
        choices = [doc["o1"], doc["o2"], doc["o3"], doc["o4"]]
        if doc.get("o5") is not None:
@@ -24,7 +24,7 @@ class Haerae(MultipleChoiceTask):
        out_doc = {
            "query": doc["query"],
            "choices": choices,
-            "gold": int(doc['gold'])-1,
+            "gold": int(doc["gold"]) - 1,
        }
        return out_doc


--- a/lm_eval/tasks/nqopen.py
+++ b/lm_eval/tasks/nqopen.py
@@ -58,7 +58,7 @@ class NQOpen(Task):

    def test_docs(self):
        raise NotImplementedError()
-    
+
    def doc_to_text(self, doc):
        return f"Q: {doc['question']}\nA:"

@@ -86,10 +86,10 @@ class NQOpen(Task):

    def _normalize_answer(self, text):
        # Lowercase and remove punctuation, strip whitespace
-        text = text.strip().lower().translate(str.maketrans('', '', string.punctuation))
+        text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))

        # Remove articles, resulting in duplicate whitespace
-        text = regex.sub(r'\b(a|an|the)\b', ' ', text)
+        text = regex.sub(r"\b(a|an|the)\b", " ", text)

        # Remove duplicate whitespace
        text = " ".join(text.split())
@@ -108,10 +108,8 @@ class NQOpen(Task):
        """
        continuation = self._normalize_answer(results[0])
        answers = [self._normalize_answer(answer) for answer in doc["answer"]]
-        
-        return {
-            "em": float(continuation in answers)
-        }
+
+        return {"em": float(continuation in answers)}

    def aggregation(self):
        """
@@ -121,8 +119,8 @@ class NQOpen(Task):
        """
        return {
            "em": mean,
-        } 
-        
+        }
+
    def higher_is_better(self):
        """
        :returns: {str: bool}
@@ -131,4 +129,4 @@ class NQOpen(Task):
        """
        return {
            "em": True,
-        } 
+        }
--- a/lm_eval/tasks/scrolls.py
+++ b/lm_eval/tasks/scrolls.py
@@ -42,7 +42,7 @@ import re
 _CITATION = """
 @inproceedings{shaham-etal-2022-scrolls,
    title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
-    author = "Shaham, Uri  and 
+    author = "Shaham, Uri  and
      Segal, Elad  and
      Ivgi, Maor  and
      Efrat, Avia  and
@@ -72,9 +72,14 @@ def _download_metric():
    import os
    import shutil
    from huggingface_hub import hf_hub_download
-    scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py")
+
+    scrolls_metric_path = hf_hub_download(
+        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+    )
    updated_scrolls_metric_path = (
-        os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
+        os.path.dirname(scrolls_metric_path)
+        + os.path.basename(scrolls_metric_path).replace(".", "_")
+        + ".py"
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path
@@ -92,7 +97,7 @@ def _process_doc_prepended_question(doc):
        "input": input,
        "outputs": doc["outputs"],
        "question": input[0:split],
-        "text": input[split + 2:]
+        "text": input[split + 2 :],
    }


@@ -102,7 +107,9 @@ def _drop_duplicates_in_input(untokenized_dataset):
    indices_to_keep = []
    id_to_idx = {}
    outputs = []
-    for i, (id_, output) in enumerate(zip(untokenized_dataset["id"], untokenized_dataset["output"])):
+    for i, (id_, output) in enumerate(
+        zip(untokenized_dataset["id"], untokenized_dataset["output"])
+    ):
        if id_ in id_to_idx:
            outputs[id_to_idx[id_]].append(output)
            continue
@@ -119,9 +126,11 @@ def _num_cpu_cores():
    # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
    try:
        import psutil
+
        return psutil.cpu_count(logical=False)
    except ImportError:
        import os
+
        return len(os.sched_getaffinity(0))


@@ -135,7 +144,11 @@ class _SCROLLSTask(Task):

    def __init__(self, no_metric=False):
        super().__init__()
-        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) if not no_metric else None
+        self.metric = (
+            load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            if not no_metric
+            else None
+        )

    def has_training_docs(self):
        return True
@@ -176,7 +189,10 @@ class _SCROLLSTask(Task):
        that are less than `max_tokens` when tokenized by each tokenizer
        """

-        tokenizers = [AutoTokenizer.from_pretrained(tokenizer) for tokenizer in self.PRUNE_TOKENIZERS]
+        tokenizers = [
+            AutoTokenizer.from_pretrained(tokenizer)
+            for tokenizer in self.PRUNE_TOKENIZERS
+        ]
        cache = {}

        def _filter(sample):
@@ -210,18 +226,21 @@ class _SCROLLSTask(Task):
    def _make_compute_metrics(self, value):
        def compute_metrics(samples):
            predictions, references = zip(*samples)  # unzip, if you will
-            computed = self.metric.compute(predictions=predictions, references=references)
+            computed = self.metric.compute(
+                predictions=predictions, references=references
+            )
            return computed[value]
+
        return compute_metrics

    def aggregation(self):
        return {
-            key: self._make_compute_metrics(value) for key, value in self._scrolls_metrics().items()
+            key: self._make_compute_metrics(value)
+            for key, value in self._scrolls_metrics().items()
        }


 class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
-
    def __init__(self):
        super().__init__(no_metric=True)

@@ -229,18 +248,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
        return None

    def aggregation(self):
-        return {
-            "em": mean,
-            "acc": mean,
-            "acc_norm": mean
-        }
+        return {"em": mean, "acc": mean, "acc_norm": mean}

    def higher_is_better(self):
-        return {
-            "em": True,
-            "acc": True,
-            "acc_norm": True
-        }
+        return {"em": True, "acc": True, "acc_norm": True}

    def process_results(self, doc, results):
        gold = doc["gold"]
@@ -264,22 +275,25 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):


 class _SCROLLSSummaryTask(_SCROLLSTask):
-
    def _process_doc(self, doc):
        return [doc]

    def _scrolls_metrics(self):
-        return {"rouge1": "rouge/rouge1", "rouge2": "rouge/rouge2", "rougeL": "rouge/rougeL"}
+        return {
+            "rouge1": "rouge/rouge1",
+            "rouge2": "rouge/rouge2",
+            "rougeL": "rouge/rougeL",
+        }

    def process_results(self, doc, results):
        return {
            "rouge1": (results[0], doc["outputs"]),
            "rouge2": (results[0], doc["outputs"]),
-            "rougeL": (results[0], doc["outputs"])
+            "rougeL": (results[0], doc["outputs"]),
        }

    def construct_requests(self, doc, ctx):
-        return [rf.greedy_until(ctx, {'until': ["\n"]})]
+        return [rf.greedy_until(ctx, {"until": ["\n"]})]

    def doc_to_text(self, doc):
        return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@@ -294,8 +308,12 @@ class Qasper(_SCROLLSTask):

    def _process_doc(self, doc):
        doc = _process_doc_prepended_question(doc)
-        doc["is_yes_no"] = reduce(lambda prev, cur: prev and squad_metrics.normalize_answer(cur)
-                                  in ["yes", "no"], doc["outputs"], True)
+        doc["is_yes_no"] = reduce(
+            lambda prev, cur: prev
+            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
+            doc["outputs"],
+            True,
+        )
        return [doc]

    def _scrolls_metrics(self):
@@ -308,9 +326,7 @@ class Qasper(_SCROLLSTask):
            prediction = "Unanswerable"
        else:
            prediction = results[0]
-        return {
-            "f1": (prediction, doc["outputs"])
-        }
+        return {"f1": (prediction, doc["outputs"])}

    def construct_requests(self, doc, ctx):
        if doc["is_yes_no"]:
@@ -318,7 +334,7 @@ class Qasper(_SCROLLSTask):
            ll_no, _ = rf.loglikelihood(ctx, " no")
            return [ll_yes, ll_no]
        else:
-            return [rf.greedy_until(ctx, {'until': ["\n"]})]
+            return [rf.greedy_until(ctx, {"until": ["\n"]})]


 class QuALITY(_SCROLLSMultipleChoiceTask):
@@ -340,8 +356,10 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
        choices_text = doc["text"][:split]

        doc["text"] = doc["text"][split:].strip()
-        doc["choices"] = [QuALITY._normalize_answer(choice) for choice in re.split(
-            QuALITY._multiple_choice_pattern, choices_text)[1:]]
+        doc["choices"] = [
+            QuALITY._normalize_answer(choice)
+            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
+        ]
        doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))

        return [doc]
@@ -368,12 +386,10 @@ class NarrativeQA(_SCROLLSTask):
        return self._process_doc(doc)[0]["text"]

    def process_results(self, doc, results):
-        return {
-            "f1": (results[0], doc["outputs"])
-        }
+        return {"f1": (results[0], doc["outputs"])}

    def construct_requests(self, doc, ctx):
-        return [rf.greedy_until(ctx, {'until': ["\n"]})]
+        return [rf.greedy_until(ctx, {"until": ["\n"]})]


 class ContractNLI(_SCROLLSMultipleChoiceTask):
@@ -439,5 +455,5 @@ def construct_tasks():
        "scrolls_contractnli": ContractNLI,
        "scrolls_govreport": GovReport,
        "scrolls_summscreenfd": SummScreenFD,
-        "scrolls_qmsum": QMSum
+        "scrolls_qmsum": QMSum,
    }
--- a/lm_eval/tasks/triviaqa.py
+++ b/lm_eval/tasks/triviaqa.py
@@ -76,8 +76,16 @@ class TriviaQA(Task):
        return continuation

    def process_results(self, doc, results):
-        continuation = results[0].strip().lower().translate(str.maketrans('', '', string.punctuation))
-        list_of_candidates = [alias.lower().translate(str.maketrans('', '', string.punctuation)) for alias in doc["answer"]["aliases"]]
+        continuation = (
+            results[0]
+            .strip()
+            .lower()
+            .translate(str.maketrans("", "", string.punctuation))
+        )
+        list_of_candidates = [
+            alias.lower().translate(str.maketrans("", "", string.punctuation))
+            for alias in doc["answer"]["aliases"]
+        ]
        return {"em": float(continuation in list_of_candidates)}

    def aggregation(self):

--- a/main.py
+++ b/main.py
@@ -12,17 +12,27 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
+    )
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--batch_size", type=str, default=None)
-    parser.add_argument("--max_batch_size", type=int, default=None,
-                        help="Maximal batch size to try with --batch_size auto")
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=float, default=None,
-                        help="Limit the number of examples per task. "
-                             "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
    parser.add_argument("--data_sampling", type=float, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -77,7 +87,9 @@ def main():
    print(dumped)

    if args.output_path:
-        os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+        dirname = os.path.dirname(args.output_path)
+        if dirname:
+            os.makedirs(dirname, exist_ok=True)
        with open(args.output_path, "w") as f:
            f.write(dumped)


--- a/scripts/regression.py
+++ b/scripts/regression.py
@@ -9,7 +9,12 @@ from lm_eval import tasks, utils


 seq2seq_models = ["google/flan-t5-small"]
-causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
+causal_models = [
+    "gpt2",
+    "facebook/opt-125m",
+    "EleutherAI/gpt-neo-125m",
+    "EleutherAI/pythia-160m",
+]
 model_names = seq2seq_models + causal_models


@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
    results = {}

    for model in args.models:
-        model_type = "hf-causal-experimental" if model in causal_models \
-            else "hf-seq2seq" if model in seq2seq_models else args.model
+        model_type = (
+            "hf-causal-experimental"
+            if model in causal_models
+            else "hf-seq2seq"
+            if model in seq2seq_models
+            else args.model
+        )
        model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
+        tasks = (
+            args.tasks
+            if model in causal_models or model_type == "hf-causal-experimental"
            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        )
        # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
-            else 64 if args.batch_size == "auto" else args.batch_size
-        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
-
-        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
-                  f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
-                  f"--batch_size {batch_size} --no_cache --output_path {output_path}"
-
-        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+        batch_size = (
+            args.batch_size
+            if model in causal_models or model_type == "hf-causal-experimental"
+            else 64
+            if args.batch_size == "auto"
+            else args.batch_size
+        )
+        output_path = (
+            f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+        )
+
+        command = (
+            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+        )
+
+        print(
+            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
+        )

        ret = os.system(command)

@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
 def main():
    args = parse_args()

-    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
+    args.branches = (
+        args.branches.split(",") if type(args.branches) == str else args.branches
+    )
    args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
+    args.tasks = (
+        tasks.ALL_TASKS
+        if args.tasks == "all_tasks"
+        else utils.pattern_match(
+            args.tasks.split(",") if type(args.tasks) == str else args.tasks,
+            tasks.ALL_TASKS,
+        )
+    )

    global initial_branch
-    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+    initial_branch = (
+        subprocess.check_output("git branch --show-current", shell=True)
+        .decode("ascii")
+        .strip()
+    )

    # TODO: implement proper timing for each task
    # TODO: reduce IO by sharing tasks between models?
@@ -132,10 +168,16 @@ def main():
    print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
    print(f"|--|{'--|' * len(args.models)}")
    for task in args.tasks:
-        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+        print(
+            f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
+        )
        for branch, branch_results, branch_runtime in runs:
-            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
-            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+            print(
+                f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
+            )
+            print(
+                f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
+            )

    print("")
    print("|branch|runtime|%|")

--- a/setup.py
+++ b/setup.py
@@ -13,9 +13,7 @@ setuptools.setup(
    long_description_content_type="text/markdown",
    url="https://github.com/EleutherAI/lm-evaluation-harness",
    packages=setuptools.find_packages(),
-    package_data={
-        "lm_eval": ["**/*.json"]
-    },
+    package_data={"lm_eval": ["**/*.json"]},
    include_package_data=True,
    classifiers=[
        "Development Status :: 3 - Alpha",