AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq  # TODO: what's the right way to handle this. maybe phase out the direct class-equality checks in HFLM?
def _create_tokenizer(
self,
...
...
@@ -94,6 +36,9 @@ class HFMultimodalLM(HFLM):
) -> None:
"""
Helper method during initialization.
For the multimodal variant, we initialize not just
`self.tokenizer` but also `self.processor`.
"""
if tokenizer:
...
...
@@ -201,10 +146,10 @@ class HFMultimodalLM(HFLM):
gen_kwargs["max_new_tokens"] = 1024
if "temperature" not in gen_kwargs:
    gen_kwargs["temperature"] = 0
# if "top_p" not in gen_kwargs:
# gen_kwargs["top_p"] = None
# if "num_beams" not in gen_kwargs:
# gen_kwargs["num_beams"] = 1
if "top_p" not in gen_kwargs:
    gen_kwargs["top_p"] = None
if "num_beams" not in gen_kwargs:
    gen_kwargs["num_beams"] = 1
stopping_criteria = stop_sequences_criteria(
self.tokenizer,
...
...
@@ -319,12 +264,14 @@ class HFMultimodalLM(HFLM):
print(f"Prompt:\n\n{contexts}\n")
self.tokenizer.padding_side = "left"
inputs = self.processor(  # TODO: write this as tok_batch_encode (and allow that to either take a visuals value or None)