Commit be95d945 authored by Herbie Bradley

Merge working changes in

parent cbe4ecdc
@@ -641,6 +641,8 @@ class ConfigurableTask(Task):
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
# self.features = ["text", "meta"]
# return None
self.features = list(self.task_docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
@@ -746,9 +748,10 @@ class ConfigurableTask(Task):
"using preconfigured rule."
)
return super().fewshot_docs()
else:
return None
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances, self.task_docs)
@@ -829,6 +832,7 @@ class ConfigurableTask(Task):
return doc[doc_to_target]
else:
target_string = utils.apply_template(doc_to_target, doc)
# return target_string
if target_string.isdigit() and self._config.doc_to_choice is not None:
return ast.literal_eval(target_string)
elif (
@@ -953,7 +957,6 @@ class ConfigurableTask(Task):
)
def process_results(self, doc, results):
if callable(self.config.process_results):
return self.config.process_results(doc, results)
@@ -1094,7 +1097,9 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError:  # TODO: this is hacky and I don't want to do it
except (
TypeError
):  # TODO: this is hacky and I don't want to do it
result_score = self._metric_fn_list[metric](
[gold_option, result]
)
@@ -1113,7 +1118,9 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
except (
TypeError
):  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
...
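The two `except (TypeError)` blocks reformatted above bridge the harness's two metric calling conventions: HF Evaluate metrics take `references=`/`predictions=` keyword arguments, while the harness's own metric functions take a single `[gold, prediction]` pair. A minimal standalone sketch of that fallback pattern, using made-up metric functions rather than anything from the harness's registry:

```python
# Minimal sketch of the two metric interfaces the try/except above bridges.
# `hf_style_metric` and `pair_style_metric` are hypothetical stand-ins, not
# functions from lm-evaluation-harness or HF Evaluate.

def hf_style_metric(*, references, predictions):
    # HF Evaluate-style: keyword-only references/predictions.
    return float(references == predictions)

def pair_style_metric(items):
    # Harness-style: a single [gold, prediction] pair.
    gold, pred = items
    return float(gold == pred)

def score(metric_fn, gold, result):
    try:
        # First try the HF Evaluate keyword interface.
        return metric_fn(references=[gold], predictions=[result])
    except TypeError:
        # Fall back to the positional [gold, result] interface.
        return metric_fn([gold, result])

print(score(hf_style_metric, "yes", "yes"))   # 1.0 via the keyword path
print(score(pair_style_metric, "yes", "no"))  # 0.0 via the fallback path
```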
@@ -5,9 +5,9 @@ import logging
import random
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from accelerate.utils.operations import _gpu_gather
import lm_eval.api
import lm_eval.api.metrics
@@ -311,6 +311,7 @@ def evaluate(
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
num_requests = 0
doc_iterator = (
itertools.islice(
enumerate(task.test_docs()), lm.rank, limit, lm.world_size
@@ -341,6 +342,59 @@ def evaluate(
samples[task_name].append(example)
for metric, value in metrics.items():
vals[(task_name, key, metric)].append(value)
num_requests += 1
num_requests = torch.tensor(num_requests, device=lm.device)
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if bool(task_groups):
group_name = task_groups[task_name]
if metric not in aggregate[group_name]:
aggregate[group_name][metric] = [task_score]
else:
aggregate[group_name][metric].append(task_score)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
results_dict = {
"results": dict(sorted(results.items())),
**({"aggregate": dict(sorted(aggregate.items()))} if bool(aggregate) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
if log_samples:
results_dict["samples"] = dict(samples)
print("Rank: ", lm.rank, " Results: ", results_dict)
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
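The aggregation block added in the hunk above writes each task's score into `results` and, when `task_groups` maps the task to a group, also appends it to `aggregate[group][metric]` so the group can later be collapsed with `np.average`. A toy sketch of that bucketing, with invented task and group names (and a `.get()` skip for ungrouped tasks, a small simplification over the diff, which only runs this branch when `task_groups` is populated):

```python
# Toy illustration of the group aggregation added above: collect per-task
# scores into their group, then average. Task and group names are invented.
from collections import defaultdict
import numpy as np

task_groups = {"task_a": "my_group", "task_b": "my_group"}
task_scores = {("task_a", "acc"): 0.62, ("task_b", "acc"): 0.70}

aggregate = defaultdict(dict)
for (task_name, metric), score in task_scores.items():
    group_name = task_groups.get(task_name)
    if group_name is None:
        continue
    # Append the task-level score to the group's running list for this metric.
    aggregate[group_name].setdefault(metric, []).append(score)

# Collapse each metric's list into an unweighted mean, as np.average does above.
for group, metrics in aggregate.items():
    for metric, scores in metrics.items():
        metrics[metric] = np.average(scores)

print(dict(aggregate))  # roughly {'my_group': {'acc': 0.66}}
```

Note that this is an unweighted mean over tasks: a group score does not weight each task by its number of documents.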
@@ -369,17 +423,36 @@ def evaluate(
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype  # store original dtype
# Gather sizes
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
# gathered_sizes = lm.accelerator.gather(num_requests)
# sizes = torch.stack(output_tensors)
# if lm.rank == 0:
# print(gathered_sizes)
# max_size = 26834
# # Use max size to pad
# metrics_tensor = metrics_tensor.to(torch.float32)
# if max_size != metrics_tensor.shape[0]:
# old_size = metrics_tensor.shape
# new_size = list(old_size)
# new_size[0] = max_size
# device_tensor = metrics_tensor.new_zeros(tuple(new_size)) + pad_value
# indices = tuple(
# slice(0, old_size[0]) if i == 0 else slice(None)
# for i in range(len(new_size))
# )
# device_tensor[indices] = metrics_tensor
# else:
# device_tensor = metrics_tensor
# gathered_item = lm.accelerator.gather(device_tensor)
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
...
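The hunk above swaps the commented-out fixed-size padding experiment for Accelerate's `pad_across_processes`: each rank may have scored a different number of requests, so every rank pads its metric tensor to the longest with a sentinel (`torch.finfo(torch.float32).min`), gathers across processes, and filters the sentinel back out. A single-process sketch of the pad-then-filter idea, with `torch.cat` standing in for the distributed gather (tensor names and values are illustrative only):

```python
# Sketch of the pad-then-filter pattern used above, without a real multi-GPU
# gather: two "ranks" produce different numbers of scores, we pad both to the
# same length with a sentinel, concatenate (standing in for accelerator.gather),
# and drop the sentinel entries afterwards.
import torch

pad_value = torch.finfo(torch.float32).min  # sentinel no real score should equal

rank_scores = [
    torch.tensor([0.1, 0.9, 0.4]),  # "rank 0" scored 3 requests
    torch.tensor([0.7, 0.2]),       # "rank 1" scored 2 requests
]

max_len = max(t.shape[0] for t in rank_scores)
padded = [
    torch.nn.functional.pad(
        t.to(torch.float32), (0, max_len - t.shape[0]), value=pad_value
    )
    for t in rank_scores
]

gathered = torch.cat(padded)                # accelerator.gather would do this across processes
filtered = gathered[gathered != pad_value]  # drop padding, keep the 5 real scores
print(filtered.tolist())                    # [0.1, 0.9, 0.4, 0.7, 0.2] up to float32 rounding
```

Filtering by value only works because no legitimate metric equals float32's minimum, which is presumably why that sentinel was chosen.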
import os
from typing import List, Optional, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)
from peft import __version__ as PEFT_VERSION, PeftModel
import copy
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.logger import eval_logger
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
@@ -569,6 +563,10 @@ class HFLM(LM):
adaptive_batch_size = batch_size
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
if len(string) == 0:
loglikelihoods.append(float("-inf"))
continue
rolling_token_windows = list(
map(
utils.make_disjoint_window,
...
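The final hunk guards `loglikelihood_rolling` against empty request strings, presumably because an empty string yields no rolling token windows to score; the change records `float("-inf")` for that request and moves on. A tiny sketch of the same guard shape, with a made-up scorer in place of the harness's rolling-window machinery:

```python
# Shape of the empty-input guard added above, with a hypothetical scorer in
# place of the harness's rolling-window loglikelihood computation.
from typing import List

def score_windows(text: str) -> float:
    # Hypothetical stand-in: pretend longer texts accumulate more negative score.
    return -float(len(text))

def rolling_loglikelihoods(requests: List[str]) -> List[float]:
    loglikelihoods: List[float] = []
    for string in requests:
        if len(string) == 0:
            # No tokens means no windows to score; record -inf and continue,
            # mirroring the guard added in the diff.
            loglikelihoods.append(float("-inf"))
            continue
        loglikelihoods.append(score_windows(string))
    return loglikelihoods

print(rolling_loglikelihoods(["hello world", ""]))  # [-11.0, -inf]
```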