gaoqiong / lm-evaluation-harness · Commit 33a215c7 (unverified)

Merge pull request #994 from AlexKoff88/ak/openvino_integration

Added support of OpenVINO inference

Authored Nov 22, 2023 by Hailey Schoelkopf; committed by GitHub on Nov 22, 2023.
Parents: c1bd72c7, a97a5a9e
Showing 6 changed files with 198 additions and 1 deletion (+198 −1):
- README.md (+5 −0)
- lm_eval/models/__init__.py (+2 −1)
- lm_eval/models/gpt2.py (+113 −0)
- lm_eval/tasks/bigbench.py (+1 −0)
- setup.py (+1 −0)
- tests/test_openvino.py (+76 −0)
README.md — view file @ 33a215c7

@@ -81,6 +81,11 @@ To evaluate models that are loaded via `AutoSeq2SeqLM` in Huggingface, you inste

> **Warning**: Choosing the wrong model may result in erroneous outputs despite not erroring.

### OpenVINO models converted via HuggingFace Optimum

```bash
python main.py --model optimum-causal --model_args pretrained=<model_path_or_name> --task lambada_openai
```

### Commercial APIs

Our library also supports language models served via the OpenAI API:
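For readers wondering where `<model_path_or_name>` comes from: the new test added in this commit exports a Hugging Face checkpoint to OpenVINO with `optimum-intel` and saves it to a local directory, which can then be passed as `pretrained`. A minimal sketch of that export step, mirroring `tests/test_openvino.py` (the model id and output path below are illustrative, not part of the diff):

```python
# Export a Hugging Face causal LM to OpenVINO IR and save it locally,
# mirroring the setup used in tests/test_openvino.py in this commit.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "facebook/opt-125m"   # any supported causal LM checkpoint
output_dir = "./ov-opt-125m"     # illustrative local path

model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
model.save_pretrained(output_dir)
AutoTokenizer.from_pretrained(model_id).save_pretrained(output_dir)
# output_dir can now be passed as --model_args pretrained=./ov-opt-125m
```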
lm_eval/models/__init__.py — view file @ 33a215c7

```diff
@@ -16,7 +16,8 @@ MODEL_REGISTRY = {
     "anthropic": anthropic_llms.AnthropicLM,
     "textsynth": textsynth.TextSynthLM,
     "dummy": dummy.DummyLM,
-    "gguf": gguf.GGUFLM
+    "gguf": gguf.GGUFLM,
+    "optimum-causal": gpt2.OPTIMUMLM,
 }
```
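The new `"optimum-causal"` key is what `--model optimum-causal` resolves to. A short sketch of how the registry entry is consumed programmatically, the same way the new test constructs the model (the pretrained path is illustrative):

```python
# Resolve the registered backend and build it from a CLI-style argument string,
# as done in tests/test_openvino.py.
import lm_eval.models as models

lm = models.get_model("optimum-causal").create_from_arg_string(
    "pretrained=./ov-opt-125m",   # illustrative path to an exported OpenVINO model
    {"batch_size": 1, "device": "cpu"},
)
```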
lm_eval/models/gpt2.py — view file @ 33a215c7

@@ -175,3 +175,116 @@ class HFLM(BaseLM):

```python
# for backwards compatibility
GPT2LM = HFLM


class OPTIMUMLM(BaseLM):
    def __init__(
        self,
        device="cpu",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
    ):
        super().__init__()

        import optimum
        from optimum.intel.openvino import OVModelForCausalLM

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        device_list = set(
            ["cuda", "cpu"]
            + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
        )
        if device and device in device_list:
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = OVModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_cache=True,
        )

        try:
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                pretrained if tokenizer is None else tokenizer,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )
        except Exception:
            print(
                "Tokenizer is missing. Please save it into the same folder as the model."
            )

        self.vocab_size = self.tokenizer.vocab_size

        # setup for automatic batch size detection
        if batch_size == "auto":
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        return self.gpt2(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            # setting eos_token_id as pad token
            generation_kwargs["pad_token_id"] = eos_token_id
        return self.gpt2.generate(context, **generation_kwargs)
```
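`OPTIMUMLM` implements the low-level `BaseLM` interface, so the harness drives it entirely through `tok_encode`, `_model_call`, and `_model_generate`. A minimal usage sketch, assuming an OpenVINO-exported model saved at `./ov-opt-125m` as in the export sketch above (the path and prompt are illustrative):

```python
# Drive the new OpenVINO backend through the BaseLM interface directly.
import torch
from lm_eval.models.gpt2 import OPTIMUMLM

lm = OPTIMUMLM(pretrained="./ov-opt-125m", device="cpu", batch_size=1)

ids = lm.tok_encode("The capital of France is")
logits = lm._model_call(torch.tensor([ids]))   # shape [batch, sequence, vocab]

out = lm._model_generate(
    torch.tensor([ids]),
    max_length=len(ids) + 8,
    eos_token_id=lm.eot_token_id,
)
print(lm.tok_decode(out[0].tolist()))
```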
lm_eval/tasks/bigbench.py — view file @ 33a215c7

```diff
@@ -10,6 +10,7 @@ import functools
 import numpy as np
 import re
 import importlib.resources
+import importlib_resources

 from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
```
setup.py — view file @ 33a215c7

```diff
@@ -50,5 +50,6 @@ setuptools.setup(
         "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
         "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
         "anthropic": ["anthropic"],
+        "openvino": ["openvino", "nncf", "onnx", "optimum-intel @ git+https://github.com/huggingface/optimum-intel.git"],
     },
 )
```
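With this extra in place, the OpenVINO dependencies can be pulled into a source checkout using standard setuptools extras syntax, e.g. `pip install -e ".[openvino]"` (the install command itself is not part of this diff).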
tests/test_openvino.py — new file (0 → 100644), view file @ 33a215c7

```python
import os
import tempfile

import lm_eval.base as base
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}


@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
        model.save_pretrained(tmpdirname)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdirname)

        lm = models.get_model("optimum-causal").create_from_arg_string(
            f"pretrained={tmpdirname}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        task_dict = tasks.get_task_dict([task])

        def ll_fn(reqs):
            for ctx, cont in reqs:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            res = []
            random.seed(42)
            for _ in reqs:
                res.append((-random.random(), False))

            return res

        def ll_perp_fn(reqs):
            for (string,) in reqs:
                assert isinstance(string, str)

            res = []
            random.seed(42)
            for _ in reqs:
                res.append(-random.random())

            return res

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.evaluate(
            lm=lm,
            task_dict=task_dict,
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
            description_dict=None,
        )
```
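Note the design of the test: it swaps `loglikelihood` and `loglikelihood_rolling` for seeded random stubs, so it exercises the OpenVINO export, registry lookup, tokenizer loading, and evaluator plumbing without scoring real completions. Running it end to end requires the new `openvino` extra, e.g. via `pytest tests/test_openvino.py`.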