gaoqiong / lm-evaluation-harness / Commits

Commit 6e3b2ea1 authored Jun 28, 2024 by haileyschoelkopf

WIP code

parent 7d7a3a1c

Showing 3 changed files with 344 additions and 0 deletions (+344 -0)
lm_eval/models/__init__.py      +1   -0
lm_eval/models/hf_vlms.py       +336 -0
lm_eval/models/huggingface.py   +7   -0
lm_eval/models/__init__.py    View file @ 6e3b2ea1

...
@@ -2,6 +2,7 @@ from . import (
    anthropic_llms,
    dummy,
    gguf,
    hf_vlms,
    huggingface,
    llava,
    mamba_lm,
...
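Adding hf_vlms to this import list is what makes the new model type visible: importing the module runs the @register_model("hf-multimodal") decorator defined in hf_vlms.py below, so the name resolves through the harness registry. A minimal sketch of that lookup, assuming the registry's get_model helper is available and using a Llava checkpoint name chosen purely for illustration:

    # Illustrative sketch only; not part of this commit.
    from lm_eval.api.registry import get_model

    model_cls = get_model("hf-multimodal")                  # resolves to HFMultimodalLM
    lm = model_cls(pretrained="llava-hf/llava-1.5-7b-hf")   # hypothetical checkpoint choice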
lm_eval/models/hf_vlms.py    0 → 100644 (new file)    View file @ 6e3b2ea1
from typing import List, Optional, Tuple, Union

import transformers
from tqdm import tqdm
from transformers import AutoModelForVision2Seq

from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM


DEFAULT_IMAGE_TOKEN = "<image>"
@register_model("hf-multimodal")
class HFMultimodalLM(HFLM):
    """
    An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
    """

    AUTO_MODEL_CLASS = AutoModelForVision2Seq

    @property
    def max_length(self):
        raise NotImplementedError

    @property
    def tokenizer_name(self) -> str:
        return self.processor.tokenizer.name_or_path.replace("/", "__")

    @property
    def chat_template(self) -> str:
        if self.processor.tokenizer.chat_template is not None:
            return self.processor.tokenizer.chat_template
        return self.processor.tokenizer.default_chat_template

    def _get_config(
        self,
        pretrained: str,
        revision: str = "main",
        trust_remote_code: bool = False,
    ) -> None:
        self._config = transformers.AutoConfig.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
    # def _create_model(
    #     self,
    #     pretrained: Union[str, transformers.PreTrainedModel],
    #     revision="main",
    #     dtype="auto",
    #     trust_remote_code=False,
    #     **kwargs,
    # ) -> None:
    #     """
    #     Initializes an HF or HF-compatible PreTrainedModel from scratch
    #     inside HFLM, using the kwargs passed into self.__init__().
    #     """
    #     model_kwargs = kwargs if kwargs else {}
    #     if parallelize:
    #         # do stuff
    #         pass
    #     if isinstance(pretrained, str):
    #         return self.AUTO_MODEL_CLASS.from_pretrained(
    #             pretrained,
    #             revision=revision,
    #             torch_dtype=get_dtype(dtype),
    #             trust_remote_code=trust_remote_code,
    #             **model_kwargs,
    #         )
    #     assert isinstance(pretrained, transformers.PreTrainedModel)
    #     return pretrained
    def _create_tokenizer(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        tokenizer: Optional[
            Union[
                str,
                transformers.ProcessorMixin,
            ]
        ],
        revision: Optional[str] = "main",
        trust_remote_code: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """
        Helper method during initialization.
        """
        if tokenizer:
            if isinstance(tokenizer, str):
                return transformers.AutoProcessor.from_pretrained(
                    tokenizer,
                    revision=revision,
                    trust_remote_code=trust_remote_code,
                    # use_fast=use_fast_tokenizer,
                )
            else:
                assert isinstance(
                    tokenizer, transformers.PreTrainedTokenizer
                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                return tokenizer

        # Get tokenizer based on 'pretrained'
        if isinstance(pretrained, str):
            model_name = pretrained
        else:
            # get the HF hub name via accessor on model
            model_name = self.model.name_or_path

        self.processor = transformers.AutoProcessor.from_pretrained(
            model_name,
            revision=revision,
            trust_remote_code=trust_remote_code,
            # use_fast=use_fast_tokenizer,
        )

        self.tokenizer = self.processor.tokenizer
    # def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
    #     """
    #     Method to apply a chat template to a list of chat history between user and model.
    #     """
    #     return self.tokenizer.apply_chat_template(
    #         chat_history, tokenize=False, add_generation_prompt=True
    #     )

    # def tok_encode(
    #     self, string: str, left_truncate_len=None, add_special_tokens=None
    # ) -> List[int]:
    #     """ """
    #     # default for None - empty dict, use predefined tokenizer param
    #     # used for all models except for CausalLM or predefined value
    #     special_tokens_kwargs = {}
    #     # by default for CausalLM - false or self.add_bos_token is set
    #     if add_special_tokens is None:
    #         if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
    #             special_tokens_kwargs = {
    #                 "add_special_tokens": False or self.add_bos_token
    #             }
    #     # otherwise the method explicitly defines the value
    #     else:
    #         special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
    #     encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
    #     # left-truncate the encoded context to be at most `left_truncate_len` tokens long
    #     if left_truncate_len:
    #         encoding = encoding[-left_truncate_len:]
    #     return encoding

    # def tok_batch_encode(
    #     self,
    #     strings: List[str],
    #     padding_side: str = "left",
    #     left_truncate_len: int = None,
    #     truncation: bool = False,
    # ) -> Tuple[torch.Tensor, torch.Tensor]:
    #     # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
    #     old_padding_side = self.tokenizer.padding_side
    #     self.tokenizer.padding_side = padding_side
    #     add_special_tokens = {}
    #     if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
    #         add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
    #     encoding = self.tokenizer(
    #         strings,
    #         truncation=truncation,
    #         padding="longest",
    #         return_tensors="pt",
    #         **add_special_tokens,
    #     )
    #     if left_truncate_len:
    #         encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
    #         encoding["attention_mask"] = encoding["attention_mask"][
    #             :, -left_truncate_len:
    #         ]
    #     self.tokenizer.padding_side = old_padding_side
    #     return encoding["input_ids"], encoding["attention_mask"]

    # def tok_decode(self, tokens, skip_special_tokens=True):
    #     return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        raise NotImplementedError(
            "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks"
        )

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        raise NotImplementedError(
            "model type `hf-multimodal` does not support loglikelihood or multiple choice. Use 'hf' model type for text-only loglikelihood tasks"
        )

    def flatten(self, input):
        new_list = []
        for i in input:
            for j in i:
                new_list.append(j)
        return new_list
    def generate_until(self, requests: List[Instance]) -> List[str]:
        res = []

        def _collate(x):
            # the negative sign on len(toks) sorts descending - this has a few advantages:
            # - time estimates will always be over not underestimates, which is more useful for planning
            # - to know the size of a batch when going through the list, you know the first one is always the batch
            #   padded context length. this is useful to simplify the batching logic and more importantly to make
            #   automatic adaptive batches much much easier to implement
            # - any OOMs will happen right away rather than near the end
            toks = self.tok_encode(x[0])
            return -len(toks), x[0]

        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
        re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
        num_iters = (
            len(requests) // self.batch_size
            if len(requests) % self.batch_size == 0
            else len(requests) // self.batch_size + 1
        )
        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
        for chunk in chunks:
            contexts, all_gen_kwargs, doc_to_visual, doc, task = zip(*chunk)
            # TODO: understand what is going on here. can we cut down on number of distinct things we pass around?
            task = task[0]
            # split = split[0]
            visuals = [vis(d) for vis, d in zip(doc_to_visual, doc)]
            # visuals = self.flatten(visuals)

            # we assume all gen kwargs in the batch are the same
            # this is safe to assume because the `grouper` object ensures it.
            gen_kwargs = all_gen_kwargs[0]

            # Set default values for until and max_new_tokens
            until = [self.tok_decode(self.eot_token_id)]

            # Update values from gen_kwargs if present
            if "until" in gen_kwargs:
                until = gen_kwargs.pop("until")
                if isinstance(until, str):
                    until = [until]
                elif not isinstance(until, list):
                    raise ValueError(
                        f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}"
                    )

            assert (
                self.batch_size_per_gpu == 1
            ), "Do not support batch_size_per_gpu > 1 for now"
            context = contexts[0]

            # if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
            print(f"Prompt:\n\n{contexts}\n")

            self.tokenizer.padding_side = "left"
            inputs = self.processor(
                images=visuals, text=contexts, return_tensors="pt", padding=True
            ).to(self._device, self.model.dtype)

            # TODO:
            # gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
            if "max_new_tokens" not in gen_kwargs:
                gen_kwargs["max_new_tokens"] = 1024
            if "temperature" not in gen_kwargs:
                gen_kwargs["temperature"] = 0
            if "top_p" not in gen_kwargs:
                gen_kwargs["top_p"] = None
            if "num_beams" not in gen_kwargs:
                gen_kwargs["num_beams"] = 1

            try:
                cont = self.model.generate(
                    **inputs,
                    do_sample=True if gen_kwargs["temperature"] > 0 else False,
                    temperature=gen_kwargs["temperature"],
                    top_p=gen_kwargs["top_p"],
                    num_beams=gen_kwargs["num_beams"],
                    max_new_tokens=gen_kwargs["max_new_tokens"],
                    use_cache=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            except Exception as e:
                print(f"Error {e} in generating")
                cont = ""

            cont_toks_list = cont.tolist()
            for cont_toks, context in zip(cont_toks_list, contexts):
                # discard context + left-padding toks if using causal decoder-only LM
                # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:  # TODO: ensure this holds for VLMs
                cont_toks = cont_toks[inputs["input_ids"].shape[1] :]

                s = self.tok_decode(cont_toks)

                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
                for term in until:
                    if len(term) > 0:
                        # ignore '' separator,
                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                        s = s.split(term)[0]

                if "1.5" in self.pretrained:
                    text_outputs = s.split("ASSISTANT:")[-1].strip()
                elif "mistral" in self.pretrained:
                    text_outputs = s.split("[/INST]")[-1].strip()
                else:
                    text_outputs = s.split("ASSISTANT:")[-1].strip()

                # if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
                print("hi hi")
                print(f"Generated text:\n\n{text_outputs}\n")

                res.append(text_outputs)
                self.cache_hook.add_partial(
                    "generate_until", (context, gen_kwargs), text_outputs
                )
                pbar.update(1)

        # reorder this group of results back to original unsorted form
        res = re_ords.get_original(res)

        pbar.close()

        return res
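At its core, generate_until above wraps a standard AutoProcessor plus AutoModelForVision2Seq round trip. The following standalone sketch (not part of the commit) shows that underlying flow outside the harness; the checkpoint name, Llava-1.5-style prompt format, and placeholder image are illustrative assumptions:

    # Sketch of the processor -> generate -> decode flow that generate_until wraps.
    import torch
    from PIL import Image
    from transformers import AutoModelForVision2Seq, AutoProcessor

    model_id = "llava-hf/llava-1.5-7b-hf"   # assumed example checkpoint
    device = "cuda" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id, torch_dtype=torch.float16 if device == "cuda" else torch.float32
    ).to(device)

    image = Image.new("RGB", (336, 336), "white")             # placeholder image
    prompt = "USER: <image>\nDescribe the image. ASSISTANT:"  # assumed Llava-1.5 prompt format

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, model.dtype)
    out = model.generate(**inputs, max_new_tokens=64, do_sample=False)

    # As in generate_until above, drop the prompt tokens before decoding.
    gen_only = out[:, inputs["input_ids"].shape[1]:]
    print(processor.batch_decode(gen_only, skip_special_tokens=True)[0])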
lm_eval/models/huggingface.py    View file @ 6e3b2ea1
...
@@ -439,7 +439,14 @@ class HFLM(TemplateLM):
        Helper method during initialization.

        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
        model type to be used.
        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
        """
        # escape hatch: if we're using a subclass that shouldn't follow
        # the default _get_backend logic,
        # then skip over the method.
        if self.AUTO_MODEL_CLASS is not None:
            return

        assert backend in ["default", "causal", "seq2seq"]

        if backend != "default":
...
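This early return is the hook the new HFMultimodalLM relies on: a subclass that pins AUTO_MODEL_CLASS at class-definition time (as hf_vlms.py does with AutoModelForVision2Seq) now skips the causal/seq2seq backend detection entirely. A minimal sketch of such a subclass, with the class name chosen purely for illustration:

    # Sketch only: an HFLM subclass that pre-sets AUTO_MODEL_CLASS hits the
    # added early return in _get_backend and keeps its chosen auto-class untouched.
    import transformers
    from lm_eval.models.huggingface import HFLM

    class MyVision2SeqLM(HFLM):  # hypothetical subclass name
        AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq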