Commit 5b62a529 authored by haileyschoelkopf

add WIP hf vlm class

parent 67a990e7
lm_eval/models/__init__.py
@@ -2,6 +2,7 @@ from . import (
     anthropic_llms,
     dummy,
     gguf,
+    hf_vlms,
     huggingface,
     mamba_lm,
     nemo_lm,

lm_eval/models/hf_vlms.py (new file)
import copy
from typing import List, Optional, Tuple, Union

import transformers
from tqdm import tqdm

from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import Collator, stop_sequences_criteria


@register_model("hf-multimodal")
class HFMultimodalLM(HFLM):
    """
    An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
    """

    # TODO: what's the right way to handle this? maybe phase out the direct
    # class-equality checks on AUTO_MODEL_CLASS in HFLM?
    AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq

def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.ProcessorMixin,
]
],
revision: Optional[str] = "main",
trust_remote_code: Optional[bool] = False,
**kwargs,
) -> None:
"""
Helper method during initialization.
For the multimodal variant, we initialize not just
`self.tokenizer` but also `self.processor`.
"""
        if tokenizer:
            if isinstance(tokenizer, str):
                self.processor = transformers.AutoProcessor.from_pretrained(
                    tokenizer,
                    revision=revision,
                    trust_remote_code=trust_remote_code,
                    # use_fast=use_fast_tokenizer,
                )
            else:
                # a pre-constructed processor was passed in directly
                assert isinstance(tokenizer, transformers.ProcessorMixin)
                self.processor = tokenizer
            self.tokenizer = self.processor.tokenizer
            return

        # otherwise, get the processor (and its tokenizer) based on `pretrained`
if isinstance(pretrained, str):
model_name = pretrained
else:
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.processor = transformers.AutoProcessor.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
# use_fast=use_fast_tokenizer,
)
self.tokenizer = self.processor.tokenizer
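
        # NOTE: for Vision2Seq checkpoints, AutoProcessor returns a ProcessorMixin that
        # bundles the image processor together with the text tokenizer; we keep the full
        # processor around for building multimodal batches, and expose its `.tokenizer`
        # so the text-only helpers inherited from HFLM continue to work.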
# def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
# """
# Method to apply a chat template to a list of chat history between user and model.
# """
# return self.tokenizer.apply_chat_template(
# chat_history, tokenize=False, add_generation_prompt=True
# )
# def tok_encode(
# self, string: str, left_truncate_len=None, add_special_tokens=None
# ) -> List[int]:
# """ """
# # default for None - empty dict, use predefined tokenizer param
# # used for all models except for CausalLM or predefined value
# special_tokens_kwargs = {}
# # by default for CausalLM - false or self.add_bos_token is set
# if add_special_tokens is None:
# if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# special_tokens_kwargs = {
# "add_special_tokens": False or self.add_bos_token
# }
# # otherwise the method explicitly defines the value
# else:
# special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
# encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
# # left-truncate the encoded context to be at most `left_truncate_len` tokens long
# if left_truncate_len:
# encoding = encoding[-left_truncate_len:]
# return encoding
# def tok_batch_encode(
# self,
# strings: List[str],
# padding_side: str = "left",
# left_truncate_len: int = None,
# truncation: bool = False,
# ) -> Tuple[torch.Tensor, torch.Tensor]:
# # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
# old_padding_side = self.tokenizer.padding_side
# self.tokenizer.padding_side = padding_side
# add_special_tokens = {}
# if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
# encoding = self.tokenizer(
# strings,
# truncation=truncation,
# padding="longest",
# return_tensors="pt",
# **add_special_tokens,
# )
# if left_truncate_len:
# encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
# encoding["attention_mask"] = encoding["attention_mask"][
# :, -left_truncate_len:
# ]
# self.tokenizer.padding_side = old_padding_side
# return encoding["input_ids"], encoding["attention_mask"]
# def tok_decode(self, tokens, skip_special_tokens=True):
# return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_generate(self, inputs, stop, **gen_kwargs):
# TODO: handle max_length
# gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024
if "temperature" not in gen_kwargs:
gen_kwargs["temperature"] = 0
if "top_p" not in gen_kwargs:
gen_kwargs["top_p"] = None
if "num_beams" not in gen_kwargs:
gen_kwargs["num_beams"] = 1
stopping_criteria = stop_sequences_criteria(
self.tokenizer,
stop,
inputs["input_ids"].shape[1],
inputs["input_ids"].shape[0],
)
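        # `stop_sequences_criteria` builds a HF StoppingCriteriaList that halts generation
        # once any stop string appears in the newly generated tokens, using the prompt
        # length and batch size passed above so the prompt itself is ignored.
        # NOTE: for these decoder-only VLMs, `generate` returns prompt + continuation tokens;
        # the caller (generate_until) slices the prompt off before decoding.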
return self.model.generate(
**inputs,
# max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=True if gen_kwargs["temperature"] > 0 else False,
temperature=gen_kwargs["temperature"],
top_p=gen_kwargs["top_p"],
num_beams=gen_kwargs["num_beams"],
max_new_tokens=gen_kwargs["max_new_tokens"],
use_cache=True,
pad_token_id=self.tokenizer.eos_token_id,
)
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
raise NotImplementedError(
"model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks"
)
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
raise NotImplementedError(
"model type `hf-multimodal` does not support loglikelihood or multiple choice. Use 'hf' model type for text-only loglikelihood tasks"
)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests with text+image input",
)
# TODO: port auto-batch sizing into this.
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(
[reg.args for reg in requests],
_collate,
group_by="gen_kwargs",
group_fn=lambda x: x[1],
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
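        # each chunk yielded here is a list of at most `self.batch_size` request args,
        # sorted longest-context-first by `_collate` and sharing identical gen_kwargs.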
### Up to here: was identical to non-multimodal HFLM generate_until ###
for chunk in chunks:
contexts, all_gen_kwargs, doc_to_visual, doc = zip(
*chunk
) # TODO: what should be passed in here as part of a chunk?
visuals = [
vis(d) for vis, d in zip(doc_to_visual, doc)
] # TODO: I think *fully* flattening is just wrong for bs>1 ?
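            # `doc_to_visual` entries are per-task callables mapping a doc to its visual
            # input(s) (typically a PIL image or list of images), so `visuals` holds one
            # entry per request in the batch and is handed to the processor below.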
### this part onward: same as HFLM ###
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
# add EOS token to stop sequences
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
if not until:
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
### end stuff that's entirely copied verbatim from HFLM ###
max_ctx_len = self.max_length - max_gen_toks # noqa: F841 # TODO: this assumes we are using a causal LM. is that always valid? shouldn't be
self.tokenizer.padding_side = "left"
            # TODO: write this as tok_batch_encode (and allow that to either take a visuals
            # value or None); truncate from the left using max_ctx_len
            inputs = self.processor(
                images=visuals, text=contexts, return_tensors="pt", padding=True
            ).to(self.device, self.model.dtype)

            context_enc = inputs["input_ids"]

            if "max_length" not in kwargs:
                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks

            # pass the cleaned-up kwargs (with `until` popped and `max_length` set) through
            cont = self._model_generate(inputs, stop=until, **kwargs)
### essentially same as HFLM beyond this line!
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
# if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: # TODO: ensure this holds for VLMs
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res.append(s)
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return res
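
For reference, a minimal sketch of how the new model type could be exercised through the harness's Python entrypoint once this lands; the checkpoint and task names below are placeholders and not part of this commit:

    import lm_eval

    # hypothetical invocation: hf-multimodal only supports generate_until-style tasks so far
    results = lm_eval.simple_evaluate(
        model="hf-multimodal",
        model_args="pretrained=llava-hf/llava-1.5-7b-hf",  # placeholder Vision2Seq checkpoint
        tasks=["some_multimodal_task"],  # placeholder task name
        batch_size=8,
    )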
lm_eval/models/huggingface.py
@@ -30,7 +30,6 @@ from lm_eval.api.registry import register_model
 from lm_eval.models.utils import (
     Collator,
     clear_torch_cache,
-    configure_pad_token,
     get_dtype,
     pad_and_concat,
     stop_sequences_criteria,

@@ -254,10 +253,32 @@ class HFLM(TemplateLM):
         self.logits_cache = logits_cache
         self.vocab_size = self.tokenizer.vocab_size
         # select (or create) a pad token to use
-        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
+        if self.tokenizer.pad_token:
+            pass
+        elif self.tokenizer.unk_token:
+            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
+        elif self.tokenizer.eos_token:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        else:
+            if getattr(self.config, "model_type", None) == "qwen":
+                # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
+                self.tokenizer.pad_token = "<|endoftext|>"
+            elif (
+                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+            ):
+                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+                # ---
+                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+                # https://github.com/huggingface/transformers/pull/26963
+                assert self.tokenizer.pad_token_id == 0
+            else:
+                self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
 
+        # TODO: override this for Gemma
         self.add_bos_token = add_bos_token
-        if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]:
+        if getattr(self.config, "model_type", None) == "gemma":
             self.add_bos_token = True
             eval_logger.info(
                 f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."

@@ -418,7 +439,16 @@ class HFLM(TemplateLM):
         Helper method during initialization.
 
         Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
         model type to be used.
+        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
         """
+        # escape hatch: if we're using a subclass that shouldn't follow
+        # the default _get_backend logic,
+        # then skip over the method.
+        # TODO: this seems very much undesirable in some cases--our code in HFLM
+        # references AutoModelForCausalLM at times to check for equality
+        if self.AUTO_MODEL_CLASS is not None:
+            return
+
         assert backend in ["default", "causal", "seq2seq"]
 
         if backend != "default":
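
The effect of the `_get_backend` escape hatch added above: a subclass that pins `AUTO_MODEL_CLASS` up front (as the new HFMultimodalLM does with AutoModelForVision2Seq) bypasses the default causal/seq2seq detection entirely. A minimal sketch of the pattern, using an illustrative class name that is not part of this commit:

    import transformers

    from lm_eval.models.huggingface import HFLM

    class MyVision2SeqLM(HFLM):  # illustrative subclass, not part of this commit
        # because AUTO_MODEL_CLASS is already set, HFLM._get_backend returns early
        # instead of inferring "causal" vs. "seq2seq" from the model config
        AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq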