gaoqiong / lm-evaluation-harness / Commits

Commit d4c5315a
Authored May 05, 2023 by Benjamin Fattori
Parent: 2da74953

    sync working changes with upstream
Showing 4 changed files with 153 additions and 75 deletions
lm_eval/api/task.py       +4   -3
lm_eval/evaluator.py      +87  -30
lm_eval/models/gpt2.py    +37  -18
main.py                   +25  -24
lm_eval/api/task.py

@@ -248,7 +248,7 @@ class Task(abc.ABC):
     def doc_to_target(self, doc):
         pass

-    def build_all_requests(self, limit=None):
+    def build_all_requests(self, limit=None, rank=None, world_size=None):
         """Build a set of Instances for a task, and store them in task.instances"""
         if self.has_test_docs():
             docs = self.test_docs()
@@ -260,8 +260,9 @@ class Task(abc.ABC):
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

         instances = []
-        for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
-            # sample fewshot context
+        # for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
+        for doc_id, doc in itertools.islice(enumerate(docs), rank, None, world_size):
+            # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
                 doc, self._config.num_fewshot, rnd=random.Random()
             )
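Note on the sharding change above: the new loop, itertools.islice(enumerate(docs), rank, None, world_size), hands each rank every world_size-th document while enumerate() still assigns the global doc_id. A minimal standalone sketch of that behaviour (not part of the commit; the document list and rank count are made up for illustration):

import itertools

docs = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]  # hypothetical documents
world_size = 2                                        # hypothetical number of ranks

for rank in range(world_size):
    # each rank keeps every world_size-th (doc_id, doc) pair, starting at its own offset
    shard = list(itertools.islice(enumerate(docs), rank, None, world_size))
    print(rank, shard)

# rank 0 -> [(0, 'doc_a'), (2, 'doc_c'), (4, 'doc_e')]
# rank 1 -> [(1, 'doc_b'), (3, 'doc_d')]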
lm_eval/evaluator.py

@@ -7,7 +7,7 @@ import lm_eval.models
 import lm_eval.tasks
 import lm_eval.api
 from lm_eval.utils import positional_deprecated, run_task_tests, make_table
+import torch

 @positional_deprecated
 def simple_evaluate(
@@ -79,19 +79,23 @@ def simple_evaluate(
         decontamination_ngrams_path=decontamination_ngrams_path,
     )

-    # add info about the model and few shot config
-    results["config"] = {
-        "model": model,
-        "model_args": model_args,
-        "num_fewshot": num_fewshot,
-        "batch_size": batch_size,
-        "device": device,
-        "no_cache": no_cache,
-        "limit": limit,
-        "bootstrap_iters": bootstrap_iters,
-    }
+    if lm.rank == 0:
+        # add info about the model and few shot config
+        results["config"] = {
+            "model": model,
+            "model_args": model_args,
+            "num_fewshot": num_fewshot,
+            "batch_size": batch_size,
+            "device": device,
+            "no_cache": no_cache,
+            "limit": limit,
+            "bootstrap_iters": bootstrap_iters,
+        }
+    else:
+        return None

     return results

 decontaminate_suffix = "_decontaminate"
@@ -143,10 +147,20 @@ def evaluate(
         # rnd.shuffle(task_docs)
         # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
-        task.build_all_requests(limit=limit)
+        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

         # aggregate Instances by LM method requested to get output.
         reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE  #TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)

+        if lm.world_size > 1:
+            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
+            gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
+
+            # compute number of pseudobatches to pad with (FSDP/DDP require even batches + can't use join)
+            # we assume rank 0 always has largest iterator
+            numpad = gathered_item[0] - gathered_item[lm.rank]
+            if numpad > 0:
+                print(f"{task_name} / balancing iterators across ranks / rank: {lm.rank} / +{numpad} sample")
+
     ### Run LM on inputs, get all outputs ###
     # execute each type of request
@@ -157,6 +171,10 @@ def evaluate(
         for req in reqs:
             cloned_reqs.extend([req] * req.repeats)

+        if (lm.rank > 0) and (numpad > 0):
+            for _ in range(numpad):
+                cloned_reqs.extend([req] * req.repeats)
+
         # run requests through model
         resps = getattr(lm, reqtype)(cloned_reqs)
@@ -164,6 +182,9 @@ def evaluate(
         for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

+        if lm.world_size > 1:
+            lm.accelerator.wait_for_everyone()
+
     ### Postprocess outputs ###
     # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
     for task_name, task in task_dict.items():
@@ -187,25 +208,61 @@ def evaluate(
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)

+    if lm.world_size > 1:
+        # if multigpu, then gather data across all ranks
+        vals_torch = collections.defaultdict(list)
+        for (task_name, key, metric), items in vals.items():
+            numitem = 0
+            if type(items[0]) == tuple:
+                numitem = len(items[0])
+
+            # distributed gather requires all ranks to have same dimensionality -> pad out with float32 min value
+            pad_value = torch.finfo(torch.float32).min
+            metrics_tensor = torch.tensor(items, device=lm.device)
+
+            original_dtype = metrics_tensor.dtype  # store original dtype
+            torch_device_tensor = lm.accelerator.pad_across_processes(metrics_tensor.to(torch.float32), pad_index=pad_value)
+            gathered_item = lm.accelerator.gather(torch_device_tensor)
+
+            #TODO: This is required when we get a tensor with a tuple of info like (ppl, _bytes) from wikitext
+            if numitem > 0:
+                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+            else:
+                gathered_filtered = gathered_item[gathered_item != pad_value]
+
+            gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+            # reconvert if we were passed a tuple of values
+            if numitem > 0:
+                gathered_item = [tuple(g) for g in gathered_item]
+
+            if lm.rank == 0:
+                vals_torch[(task_name, key, metric)] = gathered_item
+
+        vals = vals_torch
+
-    ### Aggregate results over all datapoints ###
-    # aggregate results ; run bootstrap CIs
-    for (task_name, key, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)
-
-        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
-        # so we run them less iterations. still looking for a cleaner way to do this
-        stderr = lm_eval.api.metrics.stderr_for_metric(
-            metric=task.aggregation()[metric],
-            bootstrap_iters=min(bootstrap_iters, 1000)
-            if metric in ["bleu", "chrf", "ter"]
-            else bootstrap_iters,
-        )
-        if stderr is not None:
-            results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
-
-    return {"results": dict(results), "versions": dict(versions)}
+    if lm.rank == 0:
+        ### Aggregate results over all datapoints ###
+        # aggregate results ; run bootstrap CIs
+        for (task_name, key, metric), items in vals.items():
+            task = task_dict[task_name]
+            results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)
+
+            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
+            # so we run them less iterations. still looking for a cleaner way to do this
+            stderr = lm_eval.api.metrics.stderr_for_metric(
+                metric=task.aggregation()[metric],
+                bootstrap_iters=min(bootstrap_iters, 1000)
+                if metric in ["bleu", "chrf", "ter"]
+                else bootstrap_iters,
+            )
+            if stderr is not None:
+                results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
+
+        return {"results": dict(results), "versions": dict(versions)}
+    else:
+        return None
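Note on the gather logic above: because FSDP/DDP collectives need every rank to contribute tensors of the same shape, the per-rank metric lists are padded to a common length with the float32 minimum, gathered, and the padding is filtered back out before aggregation on rank 0. A single-process sketch of that pad-and-filter step (not part of the commit; F.pad and torch.cat stand in for accelerator.pad_across_processes and accelerator.gather):

import torch
import torch.nn.functional as F

pad_value = torch.finfo(torch.float32).min

# pretend two ranks produced different numbers of per-document metrics
rank0 = torch.tensor([0.25, 0.75, 0.5])
rank1 = torch.tensor([1.0])  # shorter iterator on rank 1

# pad_across_processes would right-pad rank1 to the common length; emulate it locally
rank1 = F.pad(rank1, (0, rank0.numel() - rank1.numel()), value=pad_value)

gathered = torch.cat([rank0, rank1])        # gather concatenates the per-rank tensors
filtered = gathered[gathered != pad_value]  # drop the padding before aggregating
print(filtered.tolist())                    # [0.25, 0.75, 0.5, 1.0]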
lm_eval/models/gpt2.py

@@ -8,6 +8,8 @@ import torch.nn.functional as F
 from lm_eval import utils
 from lm_eval.api.model import LM, register_model
+from accelerate import Accelerator
+from itertools import islice


 @register_model("hf-causal", "gpt2")
 class HFLM(LM):
@@ -26,20 +28,22 @@ class HFLM(LM):
         assert isinstance(device, str)
         assert isinstance(pretrained, str)
         assert isinstance(batch_size, int)

-        if device:
-            if device not in ["cuda", "cpu"]:
-                device = int(device)
-            self._device = torch.device(device)
-            print(f"Using device '{device}'")
-        else:
-            print("Device not specified")
-            print(f"Cuda Available? {torch.cuda.is_available()}")
-            self._device = (
-                torch.device("cuda")
-                if torch.cuda.is_available()
-                else torch.device("cpu")
-            )
+        gpus = torch.cuda.device_count()
+        if gpus <= 1:
+            if device:
+                if device not in ["cuda", "cpu"]:
+                    device = int(device)
+                self._device = torch.device(device)
+                print(f"Using device '{device}'")
+            else:
+                print("Device not specified")
+                print(f"Cuda Available? {torch.cuda.is_available()}")
+                self._device = (
+                    torch.device("cuda")
+                    if torch.cuda.is_available()
+                    else torch.device("cpu")
+                )

         # TODO: update this to be less of a hack once subfolder is fixed in HF
         revision = revision + ("/" + subfolder if subfolder is not None else "")
@@ -59,10 +63,17 @@ class HFLM(LM):
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size

-        # TODO: fix multi-gpu
-        # gpus = torch.cuda.device_count()
-        # if gpus > 1:
-        #     self.gpt2 = nn.DataParallel(self.gpt2)
+        if gpus > 1:
+            accelerator = Accelerator(device_placement=False)
+            self.gpt2 = accelerator.prepare(self.gpt2)
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.accelerator = accelerator
+
+            if self.accelerator.is_local_main_process:
+                print(f"Using {gpus} GPUs with FullyShardedDataParalell and accelerate")
+
+            self._rank = self.accelerator.local_process_index
+            self._world_size = gpus

     @property
     def eot_token_id(self):
@@ -90,6 +101,14 @@ class HFLM(LM):
     def device(self):
         # TODO: fix multi-gpu
         return self._device

+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
     def tok_encode(self, string: str):
         return self.tokenizer.encode(string, add_special_tokens=False)
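Note on the Accelerator wiring above: when more than one GPU is visible, each process creates an Accelerator, wraps the model with prepare(), pins itself to cuda:{local_process_index}, and exposes its rank and world size so evaluate() can shard documents and gather metrics. A minimal sketch of the same wiring outside the harness (assumes the accelerate package is installed; note the commit takes world size from torch.cuda.device_count() rather than from the Accelerator):

import torch
from accelerate import Accelerator

accelerator = Accelerator(device_placement=False)

rank = accelerator.local_process_index   # what HFLM stores as self._rank
world_size = accelerator.num_processes   # the commit instead stores the GPU count as self._world_size
device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu")

print(f"rank {rank} of {world_size} on {device}")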
main.py

@@ -89,30 +89,31 @@ def main():
     print(f"Selected Tasks: {task_names}")

-    results = evaluator.simple_evaluate(
-        model=args.model,
-        model_args=args.model_args,
-        tasks=task_names,
-        num_fewshot=args.num_fewshot,
-        batch_size=args.batch_size,
-        device=args.device,
-        limit=args.limit,
-        decontamination_ngrams_path=args.decontamination_ngrams_path,
-        check_integrity=args.check_integrity,
-    )
-
-    dumped = json.dumps(results, indent=2)
-    print(dumped)
-
-    if args.output_path:
-        with open(args.output_path, "w") as f:
-            f.write(dumped)
-
-    print(
-        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
-    )
-    print(evaluator.make_table(results))
+    if results is not None:
+        results = evaluator.simple_evaluate(
+            model=args.model,
+            model_args=args.model_args,
+            tasks=task_names,
+            num_fewshot=args.num_fewshot,
+            batch_size=args.batch_size,
+            device=args.device,
+            limit=args.limit,
+            decontamination_ngrams_path=args.decontamination_ngrams_path,
+            check_integrity=args.check_integrity,
+        )
+
+        dumped = json.dumps(results, indent=2)
+        print(dumped)
+
+        if args.output_path:
+            with open(args.output_path, "w") as f:
+                f.write(dumped)
+
+        print(
+            f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
+            f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        )
+        print(evaluator.make_table(results))

 if __name__ == "__main__":
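Note on the control flow above: with the evaluator.py change, simple_evaluate() returns the results dict only on rank 0 and None on every other rank, so main() is expected to dump JSON and print the table only when results is not None. A tiny illustrative stub of that gating (not part of the commit; simple_evaluate_stub is a made-up stand-in):

def simple_evaluate_stub(rank):
    # mimics the new behaviour: only rank 0 gets a results dict back
    return {"results": {}, "versions": {}} if rank == 0 else None

for rank in range(2):  # pretend two ranks ran the same script
    results = simple_evaluate_stub(rank)
    if results is not None:
        print(f"rank {rank}: dumping JSON and printing the table")  # only rank 0 reaches this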