gaoqiong / lm-evaluation-harness

Commit d4c5315a, authored May 05, 2023 by Benjamin Fattori

sync working changes with upstream

Parent: 2da74953
Showing 4 changed files with 153 additions and 75 deletions.
lm_eval/api/task.py     +4   -3
lm_eval/evaluator.py    +87  -30
lm_eval/models/gpt2.py  +37  -18
main.py                 +25  -24
lm_eval/api/task.py

@@ -248,7 +248,7 @@ class Task(abc.ABC):
     def doc_to_target(self, doc):
         pass
 
-    def build_all_requests(self, limit=None):
+    def build_all_requests(self, limit=None, rank=None, world_size=None):
         """Build a set of Instances for a task, and store them in task.instances"""
         if self.has_test_docs():
             docs = self.test_docs()
@@ -260,8 +260,9 @@ class Task(abc.ABC):
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
 
         instances = []
-        for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
-            # sample fewshot context
+        # for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
+        for doc_id, doc in itertools.islice(enumerate(docs), rank, None, world_size):
+            # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
                 doc, self._config.num_fewshot, rnd=random.Random()
             )
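For reference, the new build_all_requests loop shards documents across data-parallel ranks by striding over the enumerated iterator: each rank keeps every world_size-th (doc_id, doc) pair, starting at its own rank offset, so the original document index travels with the document. A minimal standalone sketch of that pattern (the doc values and rank count below are made up for illustration):

    import itertools

    docs = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]
    world_size = 2  # number of data-parallel ranks

    for rank in range(world_size):
        # islice(enumerate(docs), rank, None, world_size) keeps every
        # world_size-th pair, starting at this rank's offset
        shard = list(itertools.islice(enumerate(docs), rank, None, world_size))
        print(rank, shard)
    # rank 0 -> [(0, 'doc_a'), (2, 'doc_c'), (4, 'doc_e')]
    # rank 1 -> [(1, 'doc_b'), (3, 'doc_d')]

With this striding, rank 0 always ends up with the largest (or tied-largest) shard, which is what the iterator-balancing logic added to lm_eval/evaluator.py below relies on.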
lm_eval/evaluator.py

@@ -7,7 +7,7 @@ import lm_eval.models
 import lm_eval.tasks
 import lm_eval.api
 from lm_eval.utils import positional_deprecated, run_task_tests, make_table
+import torch
 
 
 @positional_deprecated
 def simple_evaluate(
@@ -79,6 +79,7 @@ def simple_evaluate(
         decontamination_ngrams_path=decontamination_ngrams_path,
     )
 
+    if lm.rank == 0:
         # add info about the model and few shot config
         results["config"] = {
             "model": model,
@@ -92,6 +93,9 @@ def simple_evaluate(
         }
 
         return results
+    else:
+        return None
 
 
 decontaminate_suffix = "_decontaminate"
@@ -143,11 +147,21 @@ def evaluate(
         # rnd.shuffle(task_docs)
         # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
 
-        task.build_all_requests(limit=limit)
+        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
 
         # aggregate Instances by LM method requested to get output.
         reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE  #TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)
 
+        if lm.world_size > 1:
+            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
+            gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
+
+            # compute number of pseudobatches to pad with (FSDP/DDP require even batches + can't use join)
+            # we assume rank 0 always has largest iterator
+            numpad = gathered_item[0] - gathered_item[lm.rank]
+            if numpad > 0:
+                print(f"{task_name} / balancing iterators across ranks / rank: {lm.rank} / +{numpad} sample")
+
     ### Run LM on inputs, get all outputs ###
     # execute each type of request
     for reqtype, reqs in requests.items():
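The block above gathers each rank's instance count and pads every rank up to rank 0's count, since rank 0 holds the largest shard under the striding scheme and FSDP/DDP need every rank to step through the same number of batches. A plain-Python stand-in for that arithmetic (the counts are illustrative; in the real code they come from lm.accelerator.gather):

    # instances per rank after sharding 5 docs over 2 ranks
    gathered_counts = [3, 2]

    for rank, count in enumerate(gathered_counts):
        # pad every rank up to rank 0's count with duplicated "pseudobatch" requests
        numpad = gathered_counts[0] - count
        print(f"rank {rank}: {count} real instances, {numpad} padding instances")
    # rank 0: 3 real instances, 0 padding instances
    # rank 1: 2 real instances, 1 padding instances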
@@ -157,6 +171,10 @@ def evaluate(
         for req in reqs:
             cloned_reqs.extend([req] * req.repeats)
 
+        if (lm.rank > 0) and (numpad > 0):
+            for _ in range(numpad):
+                cloned_reqs.extend([req] * req.repeats)
+
         # run requests through model
         resps = getattr(lm, reqtype)(cloned_reqs)
@@ -164,6 +182,9 @@ def evaluate(
         for x, req in zip(resps, cloned_reqs):
             req.resps.append(x)
 
+    if lm.world_size > 1:
+        lm.accelerator.wait_for_everyone()
+
     ### Postprocess outputs ###
     # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
     for task_name, task in task_dict.items():
@@ -187,8 +208,41 @@ def evaluate(
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)
 
+    if lm.world_size > 1:
+        # if multigpu, then gather data across all ranks
+        vals_torch = collections.defaultdict(list)
+        for (task_name, key, metric), items in vals.items():
+            numitem = 0
+            if type(items[0]) == tuple:
+                numitem = len(items[0])
+
+            # distributed gather requires all ranks to have same dimensionality -> pad out with float32 min value
+            pad_value = torch.finfo(torch.float32).min
+            metrics_tensor = torch.tensor(items, device=lm.device)
+
+            original_dtype = metrics_tensor.dtype  # store original dtype
+            torch_device_tensor = lm.accelerator.pad_across_processes(metrics_tensor.to(torch.float32), pad_index=pad_value)
+            gathered_item = lm.accelerator.gather(torch_device_tensor)
+
+            #TODO: This is required when we get a tensor with a tuple of info like (ppl, _bytes) from wikitext
+            if numitem > 0:
+                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+            else:
+                gathered_filtered = gathered_item[gathered_item != pad_value]
+
+            gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+            # reconvert if we were passed a tuple of values
+            if numitem > 0:
+                gathered_item = [tuple(g) for g in gathered_item]
+
+            if lm.rank == 0:
+                vals_torch[(task_name, key, metric)] = gathered_item
+
+        vals = vals_torch
+
+    if lm.rank == 0:
         ### Aggregate results over all datapoints ###
         # aggregate results ; run bootstrap CIs
         for (task_name, key, metric), items in vals.items():
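Because different ranks can hold different numbers of metric values, the gather above first pads every rank's tensor to a common length with float32's minimum value via accelerator.pad_across_processes, gathers, and then filters the sentinel back out on rank 0. A hedged single-process sketch of that filtering step, assuming only that torch is installed (the two "rank" tensors are stand-ins for what each process would contribute):

    import torch

    pad_value = torch.finfo(torch.float32).min
    rank0_vals = torch.tensor([0.5, 0.75, 1.0])
    rank1_vals = torch.tensor([0.25, 1.5, pad_value])  # shorter list, padded to length 3

    gathered = torch.cat([rank0_vals, rank1_vals])  # stand-in for accelerator.gather
    filtered = gathered[gathered != pad_value]      # drop the sentinel padding
    print(filtered.tolist())                        # [0.5, 0.75, 1.0, 0.25, 1.5]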
@@ -209,3 +263,6 @@ def evaluate(
                 results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
 
         return {"results": dict(results), "versions": dict(versions)}
+    else:
+        return None
lm_eval/models/gpt2.py

@@ -8,6 +8,8 @@ import torch.nn.functional as F
 from lm_eval import utils
 from lm_eval.api.model import LM, register_model
+from accelerate import Accelerator
+from itertools import islice
 
 
 @register_model("hf-causal", "gpt2")
 class HFLM(LM):
@@ -27,6 +29,8 @@ class HFLM(LM):
         assert isinstance(pretrained, str)
         assert isinstance(batch_size, int)
 
+        gpus = torch.cuda.device_count()
+        if gpus <= 1:
             if device:
                 if device not in ["cuda", "cpu"]:
                     device = int(device)
@@ -59,10 +63,17 @@ class HFLM(LM):
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
 
-        # TODO: fix multi-gpu
-        # gpus = torch.cuda.device_count()
-        # if gpus > 1:
-        #     self.gpt2 = nn.DataParallel(self.gpt2)
+        if gpus > 1:
+            accelerator = Accelerator(device_placement=False)
+            self.gpt2 = accelerator.prepare(self.gpt2)
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.accelerator = accelerator
+
+            if self.accelerator.is_local_main_process:
+                print(f"Using {gpus} GPUs with FullyShardedDataParalell and accelerate")
+
+            self._rank = self.accelerator.local_process_index
+            self._world_size = gpus
 
     @property
     def eot_token_id(self):
@@ -91,6 +102,14 @@ class HFLM(LM):
         # TODO: fix multi-gpu
         return self._device
 
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
     def tok_encode(self, string: str):
         return self.tokenizer.encode(string, add_special_tokens=False)
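The constructor changes above wire the model through Hugging Face Accelerate when more than one GPU is visible, and expose the process rank and world size that the evaluator now consumes. A hedged, stripped-down sketch of that setup pattern, assuming torch, accelerate, and transformers are installed and the script is started with `accelerate launch` (the model name and variable names here are placeholders, not the harness's exact wiring):

    import torch
    from accelerate import Accelerator
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    gpus = torch.cuda.device_count()

    rank, world_size = 0, 1  # single-process defaults
    if gpus > 1:
        accelerator = Accelerator(device_placement=False)
        model = accelerator.prepare(model)  # wrap for DDP/FSDP
        device = torch.device(f"cuda:{accelerator.local_process_index}")
        rank = accelerator.local_process_index  # exposed to the evaluator as lm.rank
        world_size = gpus                       # exposed as lm.world_size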
main.py

@@ -89,6 +89,7 @@ def main():
     print(f"Selected Tasks: {task_names}")
 
+    if results is not None:
     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,