gaoqiong / lm-evaluation-harness / Commits / 1fa02395

Unverified commit 1fa02395, authored Jun 22, 2023 by Lintang Sutawika, committed by GitHub on Jun 22, 2023.

Merge pull request #565 from fattorib/seq2seq-refactor

[Refactor] Seq2Seq Models with Multi-Device Support

Parents: 9a8fee14, d3cfdcf6

Showing 9 changed files with 728 additions and 27 deletions (+728 -27).
Changed files:

  lm_eval/api/task.py                            +10   -2
  lm_eval/evaluator.py                            +1   -3
  lm_eval/models/__init__.py                      +1   -0
  lm_eval/models/anthropic_llms.py                +1   -2
  lm_eval/models/hf_causal.py                    +34  -18
  lm_eval/models/huggingface.py                 +559   -0   (new file)
  lm_eval/tasks/super_glue/boolq/default.yaml     +8   -2
  lm_eval/tasks/super_glue/boolq/seq2seq.yaml    +18   -0   (new file)
  lm_eval/utils.py                               +96   -0
lm_eval/api/task.py

@@ -98,13 +98,16 @@ class TaskConfig(dict):
        if type(self.gold_alias) == str:
            self.gold_alias = self.template_aliases + self.gold_alias

        if self.generation_kwargs or self.output_type == "greedy_until":
            if self.generation_kwargs:
                assert (
                    self.output_type == "greedy_until"
                ), "passed `generation_kwargs`, but not using a generation request type!"
            elif self.output_type == "greedy_until":
                # ensure that we greedily generate in absence of explicit arguments otherwise
                self.generation_kwargs = {"do_sample": False, "temperature": 0.0}

    # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?

    def __getitem__(self, item):
        return getattr(self, item)

@@ -123,6 +126,9 @@ class TaskConfig(dict):
        for k, v in list(cfg_dict.items()):
            if v is None:
                cfg_dict.pop(k)
            elif isinstance(v, Callable):
                # TODO: this should handle Promptsource template objects as a separate case?
                cfg_dict[k] = str(v)
        return cfg_dict

@@ -877,7 +883,9 @@ class ConfigurableTask(Task):
        for key, result in zip(self._metric_fn_list.keys(), results):
            _dict = self._metric_fn_list[key].compute(
-               references=[gold], predictions=[result], **self._metric_kwargs[key]
+               references=[gold],
+               predictions=[result],
+               **self._metric_fn_kwargs[key],
            )

            result_dict = {**result_dict, **_dict}
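The new branch in TaskConfig ties `generation_kwargs` to the `greedy_until` output type and fills in greedy defaults when none are given. Below is a minimal standalone restatement of that rule for illustration only; `resolve_generation_kwargs` is a hypothetical helper, not part of the TaskConfig API.

from typing import Optional


# Hypothetical helper restating the validation added to TaskConfig above;
# the real check runs inside TaskConfig itself, not as a free function.
def resolve_generation_kwargs(
    output_type: str, generation_kwargs: Optional[dict]
) -> Optional[dict]:
    if generation_kwargs:
        # generation arguments only make sense for generation-style requests
        assert (
            output_type == "greedy_until"
        ), "passed `generation_kwargs`, but not using a generation request type!"
        return generation_kwargs
    if output_type == "greedy_until":
        # greedily generate in the absence of explicit arguments
        return {"do_sample": False, "temperature": 0.0}
    return generation_kwargs


print(resolve_generation_kwargs("greedy_until", None))
# {'do_sample': False, 'temperature': 0.0}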
lm_eval/evaluator.py

@@ -183,9 +183,7 @@ def evaluate(
    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION
-       configs[task_name] = dict(task.dump_config())
-       # TODO: don't access a private attribute here ; for non-YAML tasks handle this case
+       configs[task_name] = dict(task.dump_config())

    # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
    # task_docs = list(task_doc_func())
lm_eval/models/__init__.py

@@ -2,5 +2,6 @@ from . import hf_causal
from . import openai_completions
from . import textsynth
from . import dummy
+from . import huggingface

# TODO: implement __all__
lm_eval/models/anthropic_llms.py

@@ -26,7 +26,6 @@ def anthropic_completion(
            max_tokens_to_sample=max_tokens_to_sample,
            temperature=temperature,
        )
-       print(response)
        return response["completion"]
    except RuntimeError:
        # TODO: I don't actually know what error Anthropic raises when it times out

@@ -99,7 +98,7 @@ class AnthropicLM(LM):
                model=self.model,
                prompt=inp,
                max_tokens_to_sample=self.max_gen_toks,
-               temperature=0.0,
+               temperature=0.0,  # TODO: implement non-greedy sampling for Anthropic
                stop=until,
            )
            res.append(response)
lm_eval/models/hf_causal.py

@@ -11,12 +11,14 @@ from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
+from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator
from typing import Optional, Union


@register_model("hf-causal")
-class HFLM(LM):
+class HFCausalLM(LM):
    def __init__(
        self,
        device="cuda",
@@ -35,6 +37,7 @@ class HFLM(LM):
        assert isinstance(batch_size, int)

        gpus = torch.cuda.device_count()
        if gpus <= 1:
            if device:
                if device not in ["cuda", "cpu"]:
@@ -66,7 +69,7 @@ class HFLM(LM):
        ).to(self.device)
        self.model.eval()
-       print(self.model.dtype)
+       eval_logger.info(self.model.dtype)

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
@@ -90,6 +93,14 @@ class HFLM(LM):
                )
                self._rank = accelerator.local_process_index
                self._world_size = accelerator.num_processes

            # manually set model to use gpu, for case where many GPUs available but
            # only seek to use one
            self._device = (
                torch.device(f"cuda:{accelerator.local_process_index}")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )
            self.model.to(self.device)
        else:
            self.model = accelerator.prepare(self.model)
            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
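The block above either pins the model to the GPU matching the local process index or hands it to accelerate for distributed execution. A minimal sketch of that pattern in isolation (illustrative only; `place_model` is a placeholder name, not harness code):

import torch
from accelerate import Accelerator


def place_model(model, use_accelerate_prepare: bool):
    """Illustrative stand-in for the device-placement logic in the __init__ above."""
    accelerator = Accelerator()
    if not use_accelerate_prepare:
        # single-process case: pick the GPU matching this process, else CPU
        device = (
            torch.device(f"cuda:{accelerator.local_process_index}")
            if torch.cuda.is_available()
            else torch.device("cpu")
        )
        model.to(device)
    else:
        # let accelerate wrap the model for distributed / multi-GPU execution
        model = accelerator.prepare(model)
        device = torch.device(f"cuda:{accelerator.local_process_index}")
    return model, device, accelerator.local_process_index, accelerator.num_processes


model, device, rank, world_size = place_model(torch.nn.Linear(2, 2), use_accelerate_prepare=False)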
@@ -157,27 +168,33 @@ class HFLM(LM):
            logits returned from the model
        """
        with torch.no_grad():
-           return self.model(inps)[0]
+           return self.model(inps).logits

-   def _model_generate(self, context, max_length, eos_token_id, **generation_kwargs):
+   def _model_generate(self, context, max_length, stop, **generation_kwargs):
        # we require users to pass do_sample=True explicitly
        # for non-greedy gen. This should be reevaluated when considering beam search.
        if "do_sample" not in generation_kwargs.keys():
            generation_kwargs["do_sample"] = False

        # build stopping criteria
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, 1, context.shape[0]
        )
        if hasattr(self, "accelerator"):
            return self.accelerator.unwrap_model(self.model).generate(
                context,
                max_length=max_length,
-               pad_token_id=eos_token_id,
-               eos_token_id=eos_token_id,
+               stopping_criteria=stopping_criteria,
+               pad_token_id=self.eot_token_id,
+               use_cache=True,
                **generation_kwargs,
            )
        else:
            return self.model.generate(
                context,
                max_length=max_length,
-               pad_token_id=eos_token_id,
-               eos_token_id=eos_token_id,
+               stopping_criteria=stopping_criteria,
+               pad_token_id=self.eot_token_id,
+               use_cache=True,
                **generation_kwargs,
            )
@@ -197,9 +214,6 @@ class HFLM(LM):
        return self._loglikelihood_tokens(new_reqs)

    def loglikelihood_rolling(self, requests):
-       # TODO: Implement caching once we've confirmed the perplexity implementation
-       # TODO: automatic batch size detection for vectorization
        loglikelihoods = []

        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
            rolling_token_windows = list(
@@ -368,6 +382,7 @@ class HFLM(LM):
        re_ord = utils.Reorderer([req.args for req in requests], _collate)

        for context, gen_kwargs in tqdm(re_ord.get_reordered()):
            until = None
            if isinstance(gen_kwargs, dict):
                gen_kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
                if "until" in gen_kwargs.keys():
@@ -389,12 +404,13 @@ class HFLM(LM):
                else:
                    max_gen_toks = self.max_gen_toks
-               try:
-                   (primary_until,) = self.tok_encode(until[0])
-               except Exception:
-                   # if our primary until would be multiple tokens long, we'll have errors.
-                   # TODO: handling this better will let us stop generating earlier + often.
-                   primary_until = self.eot_token_id
+               primary_until = until[0]
+               # try:
+               #     (primary_until,) = self.tok_encode(until[0])
+               # except Exception:
+               #     # if our primary until would be multiple tokens long, we'll have errors.
+               #     # TODO: handling this better will let us stop generating earlier + often.
+               #     primary_until = self.eot_token_id

                context_enc = torch.tensor(
                    [self.tok_encode(context)[max_gen_toks - self.max_length :]]
@@ -403,7 +419,7 @@ class HFLM(LM):
                cont = self._model_generate(
                    context=context_enc,
                    max_length=context_enc.shape[1] + max_gen_toks,
-                   eos_token_id=primary_until,
+                   stop=primary_until,
                    **gen_kwargs,
                )
lm_eval/models/huggingface.py  (new file, mode 100644)

This diff is collapsed in the web view; the new 559-line module is not shown here.
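Since the new module is not expanded in this view, here is a rough sketch only, an assumed structure rather than the actual file contents, of how a seq2seq wrapper in the spirit of this PR can pair AutoModelForSeq2SeqLM with the stopping-criteria helper added to lm_eval/utils.py below. The class name and the "t5-small" checkpoint are placeholders.

# Illustrative sketch only -- NOT the contents of lm_eval/models/huggingface.py.
import torch
import transformers

from lm_eval.utils import stop_sequences_criteria  # helper added in this commit


class Seq2SeqSketch:
    def __init__(self, pretrained: str = "t5-small", device: str = "cpu"):
        self.device = torch.device(device)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
        self.model = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
        self.model.to(self.device)
        self.model.eval()

    def greedy_until(self, context: str, stop: list, max_gen_toks: int = 32) -> str:
        enc = self.tokenizer(context, return_tensors="pt").to(self.device)
        # encoder-decoder generation starts from a fresh decoder, so the
        # initial decoder input length seen by the stop criteria is 1
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, 1, enc["input_ids"].shape[0]
        )
        out = self.model.generate(
            **enc,
            max_new_tokens=max_gen_toks,
            do_sample=False,
            stopping_criteria=stopping_criteria,
        )
        return self.tokenizer.decode(out[0], skip_special_tokens=True)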
lm_eval/tasks/super_glue/boolq/default.yaml

group:
  - super-glue-lm-eval-v1
-task: "default"
+task: "boolq"
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[labe]}}"
+doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}"  # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
lm_eval/tasks/super_glue/boolq/seq2seq.yaml  (new file, mode 100644)

group:
  - super-glue-lm-eval-v1
task: "boolq-seq2seq"
dataset_path: super_glue
dataset_name: boolq
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}"  # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
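Given the compute(references=[gold], predictions=[result], **self._metric_fn_kwargs[key]) call shown in lm_eval/api/task.py above, the ignore_case / ignore_punctuation keys in this YAML plausibly flow straight into an exact_match metric. A small example of that behaviour using the Hugging Face evaluate package, assuming (not confirmed by this diff) that the metric resolves to evaluate's exact_match:

import evaluate  # pip install evaluate

exact_match = evaluate.load("exact_match")
score = exact_match.compute(
    predictions=["Yes."],   # what a seq2seq model might generate
    references=["yes"],     # gold answer rendered from {{answer_choices[label]}}
    ignore_case=True,       # mirrors the metric_list kwargs in the YAML
    ignore_punctuation=True,
)
print(score)  # {'exact_match': 1.0}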
lm_eval/utils.py

@@ -14,6 +14,7 @@ from typing import List, Union
import gc

import torch
+import transformers
from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
@@ -422,6 +423,51 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    return islice(raw_iterator, rank, limit, world_size)


def pad_and_concat(max_length: int, tensors: List[torch.Tensor], padding_side="right"):
    """
    Method for padding a list of tensors given the maximum tensor
    length in the batch. Used for batching inputs and continuations in
    seq2seq models.
    """
    assert (
        padding_side == "left" or padding_side == "right"
    ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"

    for i, tensor in enumerate(tensors):
        tensor_len = tensor.shape[0]
        if tensor_len < max_length:
            if padding_side == "right":
                # right-pad
                tensors[i] = torch.cat(
                    [
                        tensor,  # [seq]
                        torch.zeros(
                            max_length - tensor_len,
                            dtype=torch.long,
                            device=tensor.device,
                        ),  # [padding_length - seq]
                    ],
                    dim=0,
                ).unsqueeze(0)
            else:
                # left-pad
                tensors[i] = torch.cat(
                    [
                        torch.zeros(
                            max_length - tensor_len,
                            dtype=torch.long,
                            device=tensor.device,
                        ),  # [padding_length - seq]
                        tensor,  # [seq]
                    ],
                    dim=0,
                ).unsqueeze(0)
        else:
            tensors[i] = tensor.unsqueeze(0)

    return torch.cat(tensors, dim=0)


def clear_torch_cache():
    gc.collect()
    torch.cuda.empty_cache()
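A quick usage sketch for the new helper: two ragged token-id tensors are zero-padded to a shared length and stacked into one batch, which is how seq2seq inputs and continuations get batched.

import torch
from lm_eval.utils import pad_and_concat

a = torch.tensor([1, 2, 3], dtype=torch.long)
b = torch.tensor([4, 5], dtype=torch.long)

print(pad_and_concat(3, [a, b], padding_side="right"))
# tensor([[1, 2, 3],
#         [4, 5, 0]])

print(pad_and_concat(3, [a.clone(), b.clone()], padding_side="left"))
# tensor([[1, 2, 3],
#         [0, 4, 5]])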
@@ -435,3 +481,53 @@ def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    else:
        _torch_dtype = dtype
    return _torch_dtype


# Multi-token stopping criteria
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence."""

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
            :, -self.sequence_id_len :
        ]
        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
        for i, done in enumerate(self.done_tracker):
            if not done:
                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
        return False not in self.done_tracker


def stop_sequences_criteria(
    tokenizer: transformers.PreTrainedTokenizer,
    stop_sequences: List[str],
    initial_decoder_input_length: int,
    batch_size: int,
) -> transformers.StoppingCriteriaList:
    return transformers.StoppingCriteriaList(
        [
            *[
                MultiTokenEOSCriteria(
                    sequence, tokenizer, initial_decoder_input_length, batch_size
                )
                for sequence in stop_sequences
            ],
        ]
    )
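And a short usage sketch for the stopping criteria (the gpt2 checkpoint, prompt, and stop strings are arbitrary examples): the criteria decode only the last few generated tokens of each sequence and halt generation once every sequence in the batch has produced one of the stop strings. Note that the stop string itself still appears in the returned tokens.

import transformers
from lm_eval.utils import stop_sequences_criteria

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Question: Is the sky blue?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt")

# stop as soon as the model starts a new "Question:" or emits a blank line
criteria = stop_sequences_criteria(
    tokenizer,
    ["Question:", "\n\n"],
    initial_decoder_input_length=inputs["input_ids"].shape[1],
    batch_size=inputs["input_ids"].shape[0],
)

out = model.generate(
    **inputs,
    max_new_tokens=32,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
    stopping_criteria=criteria,
)
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:]))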