Commit f66fc06f authored by haileyschoelkopf

fix merge conflicts

parents b13753cd d714fc95
......@@ -50,7 +50,7 @@ def process_docs(dataset, set_answer_type="bool"):
obs_list["abstract"].append(abstract)
obs_list["question"].append(question)
obs_list["answer_type"].append(answer_type)
-if type(answer) == list:
+if isinstance(answer, list):
answer = ", ".join(answer)
obs_list["answer"].append(answer)
......
group: scrolls
task:
- scrolls_qasper
- scrolls_quality
- scrolls_narrativeqa
- scrolls_contractnli
- scrolls_govreport
- scrolls_summscreenfd
- scrolls_qmsum
- task: scrolls_qasper
class: !function task.Qasper
- task: scrolls_quality
class: !function task.QuALITY
- task: scrolls_narrativeqa
class: !function task.NarrativeQA
- task: scrolls_contractnli
class: !function task.ContractNLI
- task: scrolls_govreport
class: !function task.GovReport
- task: scrolls_summscreenfd
class: !function task.SummScreenFD
- task: scrolls_qmsum
class: !function task.QMSum
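For context, the !function tag in these YAML files is resolved by a custom PyYAML constructor (see the load_yaml_config changes later in this commit), which turns a string like task.Qasper into the actual Python object. A simplified, hypothetical sketch of the mechanism:

import yaml

def import_function(loader, node):
    # Simplified: the real constructor loads the module from a path
    # relative to the YAML file (via loader.name); here we assume the
    # module is already importable.
    module_name, attr = loader.construct_scalar(node).rsplit(".", 1)
    return getattr(__import__(module_name), attr)

yaml.add_constructor("!function", import_function)
cfg = yaml.full_load("task: scrolls_qasper\nclass: !function task.Qasper")
# cfg["class"] is now the Qasper class object, not a string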
......@@ -115,8 +115,10 @@ class _SCROLLSTask(Task):
PRUNE_MAX_TOKENS = None
PRUNE_NUM_PROC = None
-def __post_init__(self):
-self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+def __init__(self):
+super().__init__()
+if self.DATASET_NAME is not None:
+self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
def has_training_docs(self):
return True
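Worth noting: __post_init__ has no special meaning outside dataclasses, so unless something called it explicitly, the metric setup never ran; moving it into __init__ makes it run on construction, and the DATASET_NAME guard keeps the abstract base class (where it is None) from loading a metric. A minimal sketch of the pattern, with hypothetical classes:

class BaseTask:
    DATASET_NAME = None  # abstract base: no dataset bound

    def __init__(self):
        if self.DATASET_NAME is not None:
            # only concrete subclasses pay the metric-loading cost
            self.metric = f"metric for {self.DATASET_NAME}"

class Qasper(BaseTask):
    DATASET_NAME = "qasper"

assert not hasattr(BaseTask(), "metric")
assert Qasper().metric == "metric for qasper"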
......@@ -224,9 +226,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
def process_results(self, doc, results):
gold = doc["gold"]
-acc = 1.0 if np.argmax(results) == gold else 0.0
+lls, _ = zip(*results)
+acc = 1.0 if np.argmax(lls) == gold else 0.0
completion_len = np.array([float(len(i)) for i in doc["choices"]])
-acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
+acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
return {
"acc": acc,
......@@ -279,7 +282,6 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@register_task("scrolls_qasper")
class Qasper(_SCROLLSTask):
"""A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
https://arxiv.org/abs/2105.03011
......@@ -337,7 +339,6 @@ class Qasper(_SCROLLSTask):
)
@register_task("scrolls_quality")
class QuALITY(_SCROLLSMultipleChoiceTask):
"""QuALITY: Question Answering with Long Input Texts, Yes!
https://arxiv.org/abs/2112.08608
......@@ -366,7 +367,6 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
return [doc]
@register_task("scrolls_narrativeqa")
class NarrativeQA(_SCROLLSTask):
"""The NarrativeQA Reading Comprehension Challenge
https://arxiv.org/abs/1712.07040
......@@ -400,7 +400,6 @@ class NarrativeQA(_SCROLLSTask):
)
@register_task("scrolls_contractnli")
class ContractNLI(_SCROLLSMultipleChoiceTask):
"""ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
https://arxiv.org/abs/2110.01799
......@@ -419,7 +418,6 @@ class ContractNLI(_SCROLLSMultipleChoiceTask):
return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"
@register_task("scrolls_govreport")
class GovReport(_SCROLLSSummaryTask):
"""Efficient Attentions for Long Document Summarization
https://arxiv.org/abs/2104.02112
......@@ -433,7 +431,6 @@ class GovReport(_SCROLLSSummaryTask):
DATASET_NAME = "gov_report"
@register_task("scrolls_summscreenfd")
class SummScreenFD(_SCROLLSSummaryTask):
"""SummScreen: A Dataset for Abstractive Screenplay Summarization
https://arxiv.org/abs/2104.07091
......@@ -442,7 +439,6 @@ class SummScreenFD(_SCROLLSSummaryTask):
DATASET_NAME = "summ_screen_fd"
@register_task("scrolls_qmsum")
class QMSum(_SCROLLSSummaryTask):
"""QMSum: A New Benchmark for Query-based Multi-domain
Meeting Summarization
......
task: squadv2
class: !function task.SQuAD2
......@@ -21,7 +21,6 @@ from packaging import version
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task
_CITATION = """
@misc{rajpurkar2018know,
......@@ -47,7 +46,6 @@ def _squad_agg(key, items):
return _squad_metric(predictions=predictions, references=references).get(key, 0)
@register_task("squadv2")
class SQuAD2(Task):
VERSION = 3
DATASET_PATH = "squad_v2"
......
......@@ -7,6 +7,7 @@ training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: !function "t5_utils.doc_to_text"
+process_results: !function "t5_utils.process_results"
doc_to_target: label
generation_kwargs:
until:
......@@ -15,9 +16,5 @@ metric_list:
- metric: accuracy
aggregation: mean
higher_is_better: true
-filter_list:
-- name: "wsc_postprocessor"
-filter:
-- function: !function t5_utils.WSCPostprocess
metadata:
-version: 0.0
+version: 1.0
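This YAML change retires the WSCPostprocess filter in favor of a process_results hook: scoring now happens per document in t5_utils.process_results (rewritten below), which takes one document plus its model responses and returns a metric dict. The interface, as a hypothetical stand-in:

from typing import List

def process_results(docs: dict, resps: List) -> dict:
    # toy body; the real function is in the t5_utils.py hunk below
    prediction = resps[0].strip().lower()
    reference = docs["span1_text"].strip().lower()
    return {"accuracy": 1.0 if prediction == reference else 0.0}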
import re
-from lm_eval.api.filter import Filter
+from typing import List
def doc_to_text(x):
text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
......@@ -24,14 +23,14 @@ def _wsc_inputs(x):
[
" ".join(words[:pronoun_index]),
"X",
" ".join(words[pronoun_index + 1 :]),
" ".join(words[pronoun_index + 1:]),
]
)
# Handle some special cases.
if (
x["text"]
== 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
x["text"]
== 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
):
return (
"The boy continued to whip the pony , and eventually the pony threw "
......@@ -40,8 +39,8 @@ def _wsc_inputs(x):
# Using the span2_index, we get 'use' instead of 'it'.
if (
x["text"]
== "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
x["text"]
== "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
):
return (
"When they had eventually calmed down a bit , and had gotten home, "
......@@ -52,56 +51,53 @@ def _wsc_inputs(x):
return create_input()
-class WSCPostprocess(Filter):
-def __init__(self, **kwargs):
-self.determiners = {
-"a",
-"an",
-"few",
-"her",
-"his",
-"each",
-"every",
-"many",
-"much",
-"my",
-"our",
-"some",
-"that",
-"the",
-"their",
-"these",
-"this",
-"those",
-"which",
-"whose",
-"your",
-}
-def clean(self, s):
-"""Ignore capitalization and determiners."""
-s = s.strip().lower()
-return " ".join([w for w in s.split(" ") if w not in self.determiners])
-def apply(self, resps, docs):
-filtered_resps = []
-for prediction, reference in zip(*(resps, docs["span1_text"])):
-prediction = self.clean(prediction[0])
-reference = self.clean(reference)
-if ("'" in prediction) != ("'" in reference):
-# Make sure we don't mark cases where the prediction is "Bob" and the
-# referent is "Bob's hat" as predicting the referent.
-predicted_referent = False
-else:
-prediction_words = set(prediction.split(" "))
-referent_words = set(reference.split(" "))
-# Handle cases where the prediction is "fuzzy bunny" and the referent is
-# "bunny".
-predicted_referent = prediction_words.issubset(
-referent_words
-) or referent_words.issubset(prediction_words)
-filtered_resps.append(predicted_referent)
-return filtered_resps
+DETERMINERS = {
+"a",
+"an",
+"few",
+"her",
+"his",
+"each",
+"every",
+"many",
+"much",
+"my",
+"our",
+"some",
+"that",
+"the",
+"their",
+"these",
+"this",
+"those",
+"which",
+"whose",
+"your",
+}
+def clean(s: str) -> str:
+"""Ignore capitalization and determiners."""
+s = s.strip().lower()
+return " ".join([w for w in s.split(" ") if w not in DETERMINERS])
+def process_results(docs: dict, resps: List):
+prediction = clean(resps[0])
+reference = clean(docs["span1_text"])
+if ("'" in prediction) != ("'" in reference):
+# Make sure we don't mark cases where the prediction is "Bob" and the
+# referent is "Bob's hat" as predicting the referent.
+predicted_referent = False
+else:
+prediction_words = set(prediction.split(" "))
+referent_words = set(reference.split(" "))
+# Handle cases where the prediction is "fuzzy bunny" and the referent is
+# "bunny".
+predicted_referent = prediction_words.issubset(
+referent_words
+) or referent_words.issubset(prediction_words)
+acc = 1.0 if predicted_referent == docs["label"] else 0.0
+return {"accuracy": acc}
......@@ -51,7 +51,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
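The encoding="utf-8" argument here recurs across this commit: without it, open() falls back to locale.getpreferredencoding(), often cp1252 on Windows, and non-Latin-1 task data then raises UnicodeEncodeError or round-trips incorrectly. For example:

with open("xwinograd_jp.yaml", "w", encoding="utf-8") as f:  # hypothetical path
    f.write("doc_to_text: 彼はそれを見た\n")  # would fail under cp1252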
......
......@@ -472,6 +472,10 @@ def get_git_commit_hash():
return git_hash
+def ignore_constructor(loader, node):
+return node
def import_function(loader, node):
function_name = loader.construct_scalar(node)
yaml_path = os.path.dirname(loader.name)
......@@ -489,11 +493,14 @@ def import_function(loader, node):
return function
-# Add the import_function constructor to the YAML loader
-yaml.add_constructor("!function", import_function)
-def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
+def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
+if mode == "simple":
+constructor_fn = ignore_constructor
+elif mode == "full":
+constructor_fn = import_function
+# Add the import_function constructor to the YAML loader
+yaml.add_constructor("!function", constructor_fn)
if yaml_config is None:
with open(yaml_path, "rb") as file:
yaml_config = yaml.full_load(file)
......@@ -521,7 +528,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
path = os.path.join(yaml_dir, path)
try:
-included_yaml_config = load_yaml_config(path)
+included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
final_yaml_config.update(included_yaml_config)
except Exception as ex:
# If failed to load, ignore
......
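The new mode flag lets callers parse task YAML without importing the Python callables behind !function tags: "simple" leaves them as raw YAML nodes (via ignore_constructor), while "full", the default, resolves them to objects. Hedged usage sketch with a hypothetical path:

cfg_light = load_yaml_config(yaml_path="scrolls/scrolls.yaml", mode="simple")
cfg_full = load_yaml_config(yaml_path="scrolls/scrolls.yaml", mode="full")
# cfg_light keeps `class: !function task.Qasper` unresolved; cfg_full
# imports task.Qasper and stores the class object itself.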
......@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.0"
version = "0.4.1"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
......@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
anthropic = ["anthropic"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
gptq = ["auto-gptq[triton]>=0.6.0"]
ifeval = ["langdetect", "immutabledict"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
openai = ["openai==1.3.9", "tiktoken"]
-promptsource = [
-"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-]
+optimum = ["optimum[openvino]"]
+promptsource = ["promptsource>=0.2.3"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm<=0.2.5"]
......
......@@ -23,7 +23,7 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
-with open(args.benchmark_path) as file:
+with open(args.benchmark_path, encoding="utf-8") as file:
TASK_LIST = yaml.full_load(file)
for task in tqdm(TASK_LIST):
eval_logger.info(f"Processing {task}")
......@@ -57,5 +57,5 @@ if __name__ == "__main__":
file_save_path = os.path.join(file_path, full_file_name)
eval_logger.info(f"Save to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(config_dict, yaml_file)
......@@ -119,7 +119,7 @@ class Buckets:
def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
pile_statistics = json.load(open("pile_statistics.json", "r"))
pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
pile_document_count = pile_statistics["Document Count"]
start_offsets = pile_statistics["File Start Offsets"]
......@@ -212,4 +212,4 @@ if __name__ == "__main__":
info_dict = {"title": "dataset ngrams", "ngram_size": 13}
info_dict_path = os.path.join(args.working_directory, "info.json")
json.dump(info_dict, open(info_dict_path, "w"))
json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
......@@ -79,7 +79,7 @@ if __name__ == "__main__":
stats_file_path = "pile_statistics.json"
if os.path.exists(stats_file_path):
stats = json.load(open(stats_file_path, "r"))
stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
else:
document_count, total_document_size_chars, start_offsets = get_stats()
stats = {
......@@ -88,7 +88,7 @@ if __name__ == "__main__":
"Total Pile Characters": total_document_size_chars,
"File Start Offsets": start_offsets,
}
json.dump(stats, open(stats_file_path, "w"), indent=4)
json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)
print(f"document_count: {stats['Document Count']}")
print(f"total_chars: {stats['Total Pile Characters']}")
......
......@@ -61,14 +61,14 @@ if __name__ == "__main__":
if not filenames:
continue
path_readme = os.path.join(dirpath, "README.md")
with open(path_readme, "w") as f:
with open(path_readme, "w", encoding="utf-8") as f:
# get path name, only last folder
path_name = dirpath.split("/")[-1]
f.write(f"# {path_name} \n\n")
for filename in sorted([f for f in filenames if f.endswith(".json")]):
path = os.path.join(dirpath, filename)
with open(path, "r") as f:
with open(path, "r", encoding="utf-8") as f:
result_dict = json.load(f)
with open(path_readme, "a") as f:
with open(path_readme, "a", encoding="utf-8") as f:
f.write(f"## {filename} \n")
f.write(f"{make_table(result_dict)} \n")
......@@ -11,14 +11,13 @@ import datasets
import pandas as pd
from lm_eval import tasks
-from lm_eval.tasks import TASK_REGISTRY
+from lm_eval.utils import load_yaml_config
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
datasets.disable_caching()
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager
def load_changed_files(file_path: str) -> List[str]:
......@@ -74,11 +73,11 @@ def maketable(df):
]
values = []
if not df:
-_tasks = tasks.TASK_REGISTRY.items()
+_tasks = task_manager.TASK_REGISTRY.items()
_tasks = sorted(_tasks, key=lambda x: x[0])
else:
task_classes = new_tasks()
-_tasks = [(x, TASK_REGISTRY.get(x)) for x in task_classes]
+_tasks = [(x, task_manager.TASK_REGISTRY.get(x)) for x in task_classes]
count = 0
for tname, Task in _tasks:
task = Task()
......
......@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
ret = os.system(command)
-results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+results[model] = (
+json.load(open(output_path, encoding="utf-8"))
+if ret == 0
+else {"results": {}}
+)
end_time = time.time()
......
......@@ -5,7 +5,7 @@ import random
import numpy as np
from lm_eval import tasks
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
from lm_eval.utils import eval_logger, join_iters
......@@ -39,22 +39,21 @@ def main():
args = parse_args()
np.random.seed(args.seed)
-initialize_tasks(args.verbosity)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
-include_path(args.include_path)
+task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if args.tasks == "all_tasks":
-task_names = tasks.ALL_TASKS
+task_names = task_manager.all_tasks
else:
task_names = args.tasks.split(",")
-task_dict = tasks.get_task_dict(task_names)
+task_dict = tasks.get_task_dict(task_names, task_manager)
os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items():
-if type(task) == tuple:
-group_name, task = task
+if isinstance(task, tuple):
+_, task = task
rnd = random.Random()
rnd.seed(args.seed)
......
......@@ -69,18 +69,20 @@ def main():
model_args = re.sub(
"/|=",
"__",
json.load(open(Path(args.data_path, model, "results.json")))["config"][
"model_args"
],
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
)["config"]["model_args"],
)
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
"r",
encoding="utf-8",
) as file:
data = json.loads(file.read())
configs = json.load(open(Path(args.data_path, model, "results.json")))[
"configs"
]
configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
)["configs"]
config = configs[task]
if model_index == 0: # Only need to assemble data for the first model
......@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
-config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+config = (
+json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+)
return list(config[0].keys())
......
......@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()
class Test_HFLM:
torch.use_deterministic_algorithms(True)
+task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
version_minor = sys.version_info.minor
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until: list[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task = task_list["wikitext"] # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: list[Instance] = rolling_task.instances
......
import random
import tempfile
import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model
SUPPORTED_ARCHITECTURES_TASKS = {
"facebook/opt-125m": "lambada_openai",
"hf-internal-testing/tiny-random-gpt2": "wikitext",
}
@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
with tempfile.TemporaryDirectory() as tmpdirname:
model = OVModelForCausalLM.from_pretrained(
model_id, export=True, use_cache=True
)
model.save_pretrained(tmpdirname)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(tmpdirname)
lm = get_model("openvino").create_from_arg_string(
f"pretrained={tmpdirname}",
{
"batch_size": 1,
"device": "cpu",
},
)
def ll_fn(reqs):
for ctx, cont in [req.args for req in reqs]:
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != " "
assert cont[0] == " " or ctx[-1] == "\n"
res = []
random.seed(42)
for _ in reqs:
res.append((-random.random(), False))
return res
def ll_perp_fn(reqs):
for (string,) in [req.args for req in reqs]:
assert isinstance(string, str)
res = []
random.seed(42)
for _ in reqs:
res.append(-random.random())
return res
lm.loglikelihood = ll_fn
lm.loglikelihood_rolling = ll_perp_fn
limit = 10
evaluator.simple_evaluate(
model=lm,
tasks=[task],
num_fewshot=0,
limit=limit,
bootstrap_iters=10,
)