gaoqiong / lm-evaluation-harness · Commits

Commit be95d945, authored Sep 13, 2023 by Herbie Bradley
Parent: cbe4ecdc

    Merge working changes in

Showing 3 changed files with 102 additions and 24 deletions:

    lm_eval/api/task.py            +12  -5
    lm_eval/evaluator.py           +79  -6
    lm_eval/models/huggingface.py  +11  -13
lm_eval/api/task.py

@@ -641,6 +641,8 @@ class ConfigurableTask(Task):
            ),
            f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

        # Test One Doc
        # self.features = ["text", "meta"]
        # return None
        self.features = list(self.task_docs.features.keys())
        self.multiple_input = 0
        self.multiple_target = 0

@@ -745,10 +747,11 @@ class ConfigurableTask(Task):
                "num_fewshot > 0 but fewshot_split is None. "
                "using preconfigured rule."
            )
            return super().fewshot_docs()
        else:
            return None

    def apply_filters(self):
        if hasattr(self, "_filters"):
            for f in self._filters:
                f.apply(self._instances, self.task_docs)

@@ -829,6 +832,7 @@ class ConfigurableTask(Task):
            return doc[doc_to_target]
        else:
            target_string = utils.apply_template(doc_to_target, doc)
            # return target_string
            if target_string.isdigit() and self._config.doc_to_choice is not None:
                return ast.literal_eval(target_string)
            elif (

@@ -953,7 +957,6 @@ class ConfigurableTask(Task):
        )

    def process_results(self, doc, results):
        if callable(self.config.process_results):
            return self.config.process_results(doc, results)

@@ -1094,7 +1097,9 @@ class ConfigurableTask(Task):
                        predictions=[result],
                        **self._metric_fn_kwargs[metric],
                    )
-                except TypeError:  # TODO: this is hacky and I don't want to do it
+                except (
+                    TypeError
+                ):  # TODO: this is hacky and I don't want to do it
                    result_score = self._metric_fn_list[metric]([gold_option, result])

@@ -1113,7 +1118,9 @@ class ConfigurableTask(Task):
                    predictions=[result],
                    **self._metric_fn_kwargs[metric],
                )
-            except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
+            except (
+                TypeError
+            ):  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                result_score = self._metric_fn_list[metric]([gold, result])
            if isinstance(result_score, dict):
                # TODO: this handles the case where HF evaluate returns a dict.
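Not part of the commit, but for context: a minimal, hypothetical sketch of the interface fallback that these `except (TypeError)` blocks implement, calling the HF Evaluate keyword-argument interface first and falling back to lm-eval's positional `[gold, result]` interface when that raises TypeError. The metric functions and the `score` helper below are illustrative assumptions, not code from the repository.

def hf_style_metric(references, predictions):
    # HF Evaluate-style interface: keyword lists of references and predictions.
    return float(references == predictions)

def positional_metric(items):
    # lm-eval-style interface: a single [gold, prediction] pair.
    gold, pred = items
    return float(gold == pred)

def score(metric_fn, gold, result, **metric_kwargs):
    try:
        # Try the keyword interface first, as the diff does.
        return metric_fn(references=[gold], predictions=[result], **metric_kwargs)
    except TypeError:
        # Fall back to the positional interface used by the harness's own metrics.
        return metric_fn([gold, result])

print(score(hf_style_metric, "yes", "yes"))   # 1.0 via the keyword interface
print(score(positional_metric, "yes", "no"))  # 0.0 via the positional fallback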
lm_eval/evaluator.py

@@ -5,9 +5,9 @@ import logging
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import torch
from accelerate.utils.operations import _gpu_gather

import lm_eval.api
import lm_eval.api.metrics

@@ -311,6 +311,7 @@ def evaluate(
        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            num_requests = 0
            doc_iterator = (
                itertools.islice(
                    enumerate(task.test_docs()), lm.rank, limit, lm.world_size

@@ -341,6 +342,59 @@ def evaluate(
                samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
                num_requests += 1
            num_requests = torch.tensor(num_requests, device=lm.device)

    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
        if type(task) == tuple:
            group, task = task
        task_score = task.aggregation()[metric](items)
        results[task_name][metric + "," + key] = task_score
        # Need to put back in results
        # pythia | acc
        #        | perplexity
        #        | word_perplexity
        #        | byte_perplexity
        #        | bits_per_byte
        if bool(task_groups):
            group_name = task_groups[task_name]
            if metric not in aggregate[group_name]:
                aggregate[group_name][metric] = [task_score]
            else:
                aggregate[group_name][metric].append(task_score)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this
        if bootstrap_iters > 0:
            stderr = lm_eval.api.metrics.stderr_for_metric(
                metric=task.aggregation()[metric],
                bootstrap_iters=min(bootstrap_iters, 1000)
                if metric in ["bleu", "chrf", "ter"]
                else bootstrap_iters,
            )
            if stderr is not None:
                results[task_name][metric + "_stderr" + "," + key] = stderr(items)

    if bool(aggregate):
        for group in aggregate.keys():
            for metric in aggregate[group].keys():
                aggregate[group][metric] = np.average(aggregate[group][metric])
            versions[group] = "N/A"

    results_dict = {
        "results": dict(sorted(results.items())),
        **({"aggregate": dict(sorted(aggregate.items()))} if bool(aggregate) else {}),
        "configs": dict(sorted(configs.items())),
        "versions": dict(sorted(versions.items())),
    }
    if log_samples:
        results_dict["samples"] = dict(samples)
    print("Rank: ", lm.rank, " Results: ", results_dict)

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
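For context only (an assumption-laden sketch, not the harness's `lm_eval.api.metrics.stderr_for_metric`): the hotfix comment above caps bootstrap iterations for bleu/chrf/ter because they are expensive to resample. The toy version below applies the same capping rule to a simple resampling standard error; the function names and the metric used in the usage line are made up for illustration.

import random
import statistics

def bootstrap_stderr(metric_fn, items, iters):
    # Spread of the aggregated metric over `iters` resamples of the items.
    estimates = [
        metric_fn([random.choice(items) for _ in items]) for _ in range(iters)
    ]
    return statistics.pstdev(estimates)

def stderr_for_metric(metric_name, metric_fn, items, bootstrap_iters=100000):
    # Expensive corpus-level metrics get at most 1000 resamples, mirroring
    # min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters.
    iters = (
        min(bootstrap_iters, 1000)
        if metric_name in ("bleu", "chrf", "ter")
        else bootstrap_iters
    )
    return bootstrap_stderr(metric_fn, items, iters)

scores = [0.2, 0.4, 0.6, 0.8]
print(stderr_for_metric("acc", lambda xs: sum(xs) / len(xs), scores, bootstrap_iters=200))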
@@ -369,17 +423,36 @@ def evaluate(
            # so we pad out with float32 min value
            pad_value = torch.finfo(torch.float32).min
            metrics_tensor = torch.tensor(items, device=lm.device)
            original_dtype = metrics_tensor.dtype  # store original dtype
            # Gather sizes
            torch_device_tensor = lm.accelerator.pad_across_processes(
                metrics_tensor.to(torch.float32), pad_index=pad_value
            )
            gathered_item = lm.accelerator.gather(torch_device_tensor)
            if numitem > 0:
                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
            else:
                gathered_filtered = gathered_item[gathered_item != pad_value]

            # gathered_sizes = lm.accelerator.gather(num_requests)
            # sizes = torch.stack(output_tensors)
            # if lm.rank == 0:
            #     print(gathered_sizes)
            # max_size = 26834
            # # Use max size to pad
            # metrics_tensor = metrics_tensor.to(torch.float32)
            # if max_size != metrics_tensor.shape[0]:
            #     old_size = metrics_tensor.shape
            #     new_size = list(old_size)
            #     new_size[0] = max_size
            #     device_tensor = metrics_tensor.new_zeros(tuple(new_size)) + pad_value
            #     indices = tuple(
            #         slice(0, old_size[0]) if i == 0 else slice(None)
            #         for i in range(len(new_size))
            #     )
            #     device_tensor[indices] = metrics_tensor
            # else:
            #     device_tensor = metrics_tensor
            # gathered_item = lm.accelerator.gather(device_tensor)

            gathered_item = (
                gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
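For context, a single-process sketch of the pad-then-gather-then-filter idea in this hunk: ranks hold different numbers of per-example scores, so shorter tensors are padded with the float32 minimum before gathering and the padding is stripped afterwards. This is an illustrative simulation, not the accelerate API; `torch.cat` stands in for `accelerator.gather`, and the per-rank tensors are made up.

import torch

pad_value = torch.finfo(torch.float32).min

# Pretend these are the per-rank metric tensors of unequal length.
per_rank = [torch.tensor([0.1, 0.9, 0.5]), torch.tensor([0.7])]

# Pad every tensor to the longest length, as pad_across_processes would.
max_len = max(t.shape[0] for t in per_rank)
padded = [
    torch.cat([t, torch.full((max_len - t.shape[0],), pad_value)]) for t in per_rank
]

# Concatenate across "ranks" and drop the padding again.
gathered = torch.cat(padded)
filtered = gathered[gathered != pad_value]
print(filtered.tolist())  # [0.1, 0.9, 0.5, 0.7], padding removed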
lm_eval/models/huggingface.py

import os
from typing import List, Optional, Union

import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)
from peft import __version__ as PEFT_VERSION, PeftModel
import copy
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.logger import eval_logger
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union

def _get_accelerate_args(
    device_map_option: Optional[str] = "auto",

@@ -569,6 +563,10 @@ class HFLM(LM):
                adaptive_batch_size = batch_size

        for (string,) in tqdm(
            [req.args for req in requests], disable=(self.rank != 0)
        ):
            if len(string) == 0:
                loglikelihoods.append(float("-inf"))
                continue

            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
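For context, a hypothetical sketch (not lm-eval's `utils.make_disjoint_window` or the HFLM class) of the idea behind this hunk: split a tokenized string into disjoint windows of at most `max_len` tokens for rolling loglikelihood, and short-circuit empty inputs with a score of negative infinity as the new guard does. The helper names, the toy encoder, and the per-window scorer are assumptions for illustration.

from typing import Callable, List

def disjoint_windows(tokens: List[int], max_len: int) -> List[List[int]]:
    # Consecutive, non-overlapping chunks; the last one may be shorter.
    return [tokens[i : i + max_len] for i in range(0, len(tokens), max_len)]

def rolling_loglikelihood(
    string: str,
    encode: Callable[[str], List[int]],
    score_window: Callable[[List[int]], float],
    max_len: int,
) -> float:
    if len(string) == 0:
        return float("-inf")  # mirrors the empty-string guard added in the diff
    return sum(score_window(w) for w in disjoint_windows(encode(string), max_len))

# Toy usage: "encode" as raw bytes, "score" as -1 per token.
print(rolling_loglikelihood("hello world", lambda s: list(s.encode()), lambda w: -len(w), max_len=4))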