Commit 8d59330b authored by lintangsutawika

resolved merge conflict

parents 110e5a28 d4a913c4
import json
from importlib.util import find_spec
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger
@register_model("openvino")
class OptimumLM(HFLM):
"""
Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
Intel® architectures using OpenVINO™ runtime.
To use an OpenVINO config, use `--model_args ov_config` to point to a json file with an OpenVINO config:
`lm_eval --model openvino --model_args pretrained=gpt2,ov_config=config.json --task lambada_openai`
Example json file contents: {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"}
"""
def __init__(
......@@ -48,16 +57,25 @@ class OptimumLM(HFLM):
from optimum.intel.openvino import OVModelForCausalLM
model_kwargs = kwargs if kwargs else {}
if "ov_config" in model_kwargs:
if not Path(model_kwargs["ov_config"]).exists():
raise ValueError(
"ov_config should point to a .json file containing an OpenVINO config"
)
with open(model_kwargs["ov_config"]) as f:
model_kwargs["ov_config"] = json.load(f)
eval_logger.info(
f"Using custom OpenVINO config: {model_kwargs['ov_config']}"
)
else:
model_kwargs["ov_config"] = {}
model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
else:
export = True
kwargs["ov_config"] = {
"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1",
"CACHE_DIR": "",
}
self._model = OVModelForCausalLM.from_pretrained(
pretrained,
......
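# Illustrative sketch (not part of this diff): what a custom OpenVINO config file can
# contain and how the loader above treats it. The file name and the non-CACHE_DIR keys
# are example values taken from the class docstring, not requirements.
import json
from pathlib import Path

ov_config_path = Path("config.json")  # e.g. {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"}
if not ov_config_path.exists():
    raise ValueError("ov_config should point to a .json file containing an OpenVINO config")
with open(ov_config_path) as f:
    ov_config = json.load(f)
ov_config.setdefault("CACHE_DIR", "")  # same default OptimumLM.__init__ applies when the key is absent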
......@@ -21,10 +21,14 @@ from lm_eval.utils import (
try:
import ray
from vllm import LLM, SamplingParams
if parse_version(version("vllm")) > parse_version("0.3.0"):
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
pass
eval_logger = eval_logger
......@@ -34,7 +38,7 @@ class VLLM(TemplateLM):
def __init__(
self,
pretrained="gpt2",
pretrained: str,
dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
revision: Optional[str] = None,
trust_remote_code: Optional[bool] = False,
......@@ -55,6 +59,7 @@ class VLLM(TemplateLM):
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
lora_local_path: str = None,
**kwargs,
):
super().__init__()
......@@ -127,6 +132,14 @@ class VLLM(TemplateLM):
self._max_gen_toks = max_gen_toks
if lora_local_path is not None:
assert parse_version(version("vllm")) > parse_version(
"0.3.0"
), "lora adapters only compatible with vllm > v0.3.0."
self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
else:
self.lora_request = None
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
......@@ -223,6 +236,14 @@ class VLLM(TemplateLM):
# flatten results
return undistribute(results)
if self.lora_request is not None:
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
)
else:
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
......
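# Illustrative sketch (not part of this diff): how the new `lora_local_path` argument is
# intended to be used. The model name and adapter path are hypothetical, and the module
# path is assumed from this repository's layout; vllm > 0.3.0 is required, as asserted above.
from lm_eval.models.vllm_causallms import VLLM

lm = VLLM(
    pretrained="meta-llama/Llama-2-7b-hf",    # example base model
    lora_local_path="/path/to/lora_adapter",  # directory holding the LoRA adapter weights
)
# Subsequent generate_until calls pass LoRARequest("finetuned", 1, lora_local_path)
# to vllm's LLM.generate, as in the branch shown above.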
# FDA
### Paper
Title: Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes
Abstract: A long-standing goal of the data management community is to develop general, automated systems that ingest semi-structured documents and output queryable tables without human effort or domain-specific customization. Given the sheer variety of potential documents, state-of-the-art systems make simplifying assumptions and use domain-specific training. In this work, we ask whether we can maintain generality by using large language models (LLMs). LLMs, which are pretrained on broad data, can perform diverse downstream tasks simply conditioned on natural language task descriptions. We propose and evaluate EVAPORATE, a simple, prototype system powered by LLMs. We identify two fundamentally different strategies for implementing this system: prompt the LLM to directly extract values from documents or prompt the LLM to synthesize code that performs the extraction. Our evaluations show a cost-quality tradeoff between these two approaches. Code synthesis is cheap, but far less accurate than directly processing each document with the LLM. To improve quality while maintaining low cost, we propose an extended code synthesis implementation, EVAPORATE-CODE+, which achieves better quality than direct extraction. Our key insight is to generate many candidate functions and ensemble their extractions using weak supervision. EVAPORATE-CODE+ not only outperforms the state-of-the-art systems, but does so using a sublinear pass over the documents with the LLM. This equates to a 110× reduction in the number of tokens the LLM needs to process, averaged across 16 real-world evaluation settings of 10k documents each.
A task for evaluating LMs on information extraction, as implemented in the Based evaluation harness.
Homepage: https://github.com/HazyResearch/based-evaluation-harness
Description:
> FDA (Information Extraction). The task is to extract key-value pairs from a set of PDFs scraped from the FDA website. We use the dataset and labels collected in Arora et al. 2023. We break apart the documents into chunks of 1,920 tokens. For every key-value pair that appears in the chunk, we create a zero-shot prompt using the simple prompt template: {chunk} \n {key}: We allow the model to generate a fixed number of tokens after the prompt and check (with case insensitivity) if the value is contained within the generation. We report accuracy, the fraction of prompts for which the generation contains the value.
### Citation
```
@misc{arora2024simple,
title={Simple linear attention language models balance the recall-throughput tradeoff},
author={Simran Arora and Sabri Eyuboglu and Michael Zhang and Aman Timalsina and Silas Alberti and Dylan Zinsley and James Zou and Atri Rudra and Christopher Ré},
year={2024},
eprint={2402.18668},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{arora2023language,
title={Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes},
author={Simran Arora and Brandon Yang and Sabri Eyuboglu and Avanika Narayan and Andrew Hojel and Immanuel Trummer and Christopher Ré},
year={2023},
eprint={2304.09433},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Tasks
* `fda`: the FDA task as implemented in the paper "Simple linear attention language models balance the recall-throughput tradeoff". Designed for zero-shot evaluation of small LMs.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
task: fda
class: !function task.FDA
"""
"""
import re
from typing import List
import numpy as np
from lm_eval.api.instance import Instance
from lm_eval.api.task import ConfigurableTask
class FDA(ConfigurableTask):
VERSION = 0
DATASET_PATH = "hazyresearch/based-fda"
DATASET_NAME = "default"
def __init__(self):
super().__init__(config={"metadata": {"version": self.VERSION}})
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
return doc["text"]
def doc_to_target(self, doc):
return doc["value"]
def construct_requests(self, doc, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
return [
Instance(
request_type="generate_until",
doc=doc,
arguments=(ctx, {"until": ["\n"], "max_gen_toks": 48}),
idx=0,
**kwargs,
)
]
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# continuation, (logprob_unanswerable, _) = results
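# `results` is the list of generated continuations for this doc's single
# generate_until request; score the first (and only) generation below.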
continuation = results
return {"contains": contains_score(continuation[0], [doc["value"]])}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"contains": np.mean, # Exact match (the normalized answer exactly match the gold answer)
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"contains": True, # Exact match (the normalized answer exactly match the gold answer
}
def contains_score(prediction: str, labels: List[str]):
return max(
int(bool(re.search(re.compile(re.escape(label), re.IGNORECASE), prediction)))
for label in labels
)
# MATH
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: This task corresponds to the MATH (`hendrycks_math`) implementation at https://github.com/EleutherAI/lm-evaluation-harness/tree/master. For the variant which uses the custom 4-shot prompt from the Minerva paper (https://arxiv.org/abs/2206.14858) and SymPy answer checking as done by Minerva, see `lm_eval/tasks/minerva_math`.
Homepage: https://github.com/hendrycks/math
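For reference, a short sketch of the answer extraction and comparison this variant performs, using the helpers defined in this task's `utils.py` (the import path and example strings are illustrative):

```
# illustrative import path; the helpers below are defined in this task's utils.py
from utils import is_equiv, last_boxed_only_string, remove_boxed

solution = r"Combining the terms gives $\boxed{\frac{1}{2}}$."  # hypothetical gold solution
gold = remove_boxed(last_boxed_only_string(solution))           # -> "\frac{1}{2}"

# strip_string-based normalization lets superficially different answers compare equal
assert is_equiv(gold, "1/2")  # a/b is rewritten to \frac{a}{b}
assert is_equiv(gold, "0.5")  # 0.5 is mapped to \frac{1}{2}
```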
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
```
### Groups and Tasks
#### Groups
- `hendrycks_math`: the MATH benchmark from Hendrycks et al., run zero-shot or few-shot.
#### Tasks
- `hendrycks_math_algebra`
- `hendrycks_math_counting_and_prob`
- `hendrycks_math_geometry`
- `hendrycks_math_intermediate_algebra`
- `hendrycks_math_num_theory`
- `hendrycks_math_prealgebra`
- `hendrycks_math_precalc`
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* Answer extraction code is taken from the original MATH benchmark paper's repository.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: hendrycks_math
task:
- hendrycks_math_algebra
- hendrycks_math_counting_and_prob
- hendrycks_math_geometry
- hendrycks_math_intermediate_algebra
- hendrycks_math_num_theory
- hendrycks_math_prealgebra
- hendrycks_math_precalc
group:
- math_word_problems
task: hendrycks_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: generate_until
training_split: train
test_split: test
doc_to_text: "Problem: {{problem}}\nAnswer:"
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
until:
- "Problem:"
do_sample: false
temperature: 0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
include: hendrycks_math_algebra.yaml
dataset_name: counting_and_probability
task: hendrycks_math_counting_and_prob
include: hendrycks_math_algebra.yaml
dataset_name: geometry
task: hendrycks_math_geometry
include: hendrycks_math_algebra.yaml
dataset_name: intermediate_algebra
task: hendrycks_math_intermediate_algebra
include: hendrycks_math_algebra.yaml
dataset_name: number_theory
task: hendrycks_math_num_theory
include: hendrycks_math_algebra.yaml
dataset_name: prealgebra
task: hendrycks_math_prealgebra
include: hendrycks_math_algebra.yaml
dataset_name: precalculus
task: hendrycks_math_precalc
from typing import Dict, List
import datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict) -> dict:
out_doc = {
"problem": doc["problem"],
"solution": doc["solution"],
"answer": remove_boxed(last_boxed_only_string(doc["solution"])),
}
return out_doc
return dataset.map(_process_doc)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
retval = 0
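# if the generation contains $-delimited math, keep only the span between the first
# and last "$"; otherwise compare the raw generation against the boxed gold answer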
indices = [pos for pos, char in enumerate(results[0]) if char == "$"]
if len(indices) <= 1:
answer = results[0]
else:
answer = results[0][indices[0] + 1 : indices[-1]]
if is_equiv(answer, remove_boxed(last_boxed_only_string(doc["solution"]))):
retval = 1
results = {
"exact_match": retval,
}
return results
# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
def is_equiv(str1, str2, verbose=False):
if str1 is None and str2 is None:
print("WARNING: Both None")
return True
if str1 is None or str2 is None:
return False
try:
ss1 = strip_string(str1)
ss2 = strip_string(str2)
if verbose:
print(ss1, ss2)
return ss1 == ss2
except Exception:
return str1 == str2
def remove_boxed(s):
if "\\boxed " in s:
left = "\\boxed "
assert s[: len(left)] == left
return s[len(left) :]
left = "\\boxed{"
assert s[: len(left)] == left
assert s[-1] == "}"
return s[len(left) : -1]
def last_boxed_only_string(string):
idx = string.rfind("\\boxed")
if "\\boxed " in string:
return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
if idx < 0:
idx = string.rfind("\\fbox")
if idx < 0:
return None
i = idx
right_brace_idx = None
num_left_braces_open = 0
while i < len(string):
if string[i] == "{":
num_left_braces_open += 1
if string[i] == "}":
num_left_braces_open -= 1
if num_left_braces_open == 0:
right_brace_idx = i
break
i += 1
if right_brace_idx is None:
retval = None
else:
retval = string[idx : right_brace_idx + 1]
return retval
def fix_fracs(string):
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except AssertionError:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string
def fix_a_slash_b(string):
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
a = int(a)
b = int(b)
assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string
except (ValueError, AssertionError):
return string
def remove_right_units(string):
# "\\text{ " only ever occurs (at least in the val set) when describing units
if "\\text{ " in string:
splits = string.split("\\text{ ")
assert len(splits) == 2
return splits[0]
else:
return string
def fix_sqrt(string):
if "\\sqrt" not in string:
return string
splits = string.split("\\sqrt")
new_string = splits[0]
for split in splits[1:]:
if split[0] != "{":
a = split[0]
new_substr = "\\sqrt{" + a + "}" + split[1:]
else:
new_substr = "\\sqrt" + split
new_string += new_substr
return new_string
def strip_string(string):
# linebreaks
string = string.replace("\n", "")
# remove inverse spaces
string = string.replace("\\!", "")
# replace \\ with \
string = string.replace("\\\\", "\\")
# replace tfrac and dfrac with frac
string = string.replace("tfrac", "frac")
string = string.replace("dfrac", "frac")
# remove \left and \right
string = string.replace("\\left", "")
string = string.replace("\\right", "")
# Remove circ (degrees)
string = string.replace("^{\\circ}", "")
string = string.replace("^\\circ", "")
# remove dollar signs
string = string.replace("\\$", "")
# remove units (on the right)
string = remove_right_units(string)
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "") # noqa: W605
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
string = string.replace("{.", "{0.")
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == ".":
string = "0" + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
if len(string.split("=")) == 2:
if len(string.split("=")[0]) <= 2:
string = string.split("=")[1]
# fix sqrt3 --> sqrt{3}
string = fix_sqrt(string)
# remove spaces
string = string.replace(" ", "")
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string = fix_fracs(string)
# manually change 0.5 --> \frac{1}{2}
if string == "0.5":
string = "\\frac{1}{2}"
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string = fix_a_slash_b(string)
return string
......@@ -12,7 +12,6 @@ generation_kwargs:
temperature: 0.0
max_gen_toks: 1280
process_results: !function utils.process_results
num_fewshot: 0
metric_list:
- metric: prompt_level_strict_acc
aggregation: mean
......
......@@ -28,16 +28,11 @@ Eprint = {arXiv:2206.14858},
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `minerva_math`
### Groups and Tasks
#### Groups
- `math_word_problems`
- `generate_until`
- `minerva_math`
#### Tasks
......
......@@ -20,4 +20,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
version: 2.0
......@@ -4,8 +4,6 @@ import datasets
def preprocess(text):
if text is None:
return " "
text = text.strip()
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
......@@ -20,11 +18,15 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
"id": doc["id"],
"query": "Question: " + preprocess(doc["instruction"]) + "\nAnswer:",
"choices": [
preprocess(doc["option_a"]),
preprocess(doc["option_b"]),
preprocess(doc["option_c"]),
preprocess(doc["option_d"]),
preprocess(doc["option_e"]),
preprocess(option)
for option in [
doc["option_a"],
doc["option_b"],
doc["option_c"],
doc["option_d"],
doc["option_e"],
]
if option
],
"gold": ["A", "B", "C", "D", "E"].index(doc["answer"]),
}
......
# Pile-10k
### Paper
Title: `NeelNanda/pile-10k`
Abstract: The first 10K elements of [The Pile](https://pile.eleuther.ai/), useful for debugging models trained on it. See the [HuggingFace page for the full Pile](https://huggingface.co/datasets/the_pile) for more info. Inspired by [stas' great resource](https://huggingface.co/datasets/stas/openwebtext-10k), which does the same for OpenWebText.
Homepage: [https://huggingface.co/datasets/NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k)
### Citation
```
@misc{Nanda2022Pile10K,
author = {Nanda, Neel},
title = {{NeelNanda/pile-10k} \textendash\ Datasets at Hugging Face},
year = {2022},
howpublished = {\url{https://huggingface.co/datasets/NeelNanda/pile-10k}},
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `pile_10k`: `The first 10K elements of The Pile, useful for debugging models trained on it.`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?