Commit be95d945 authored by Herbie Bradley

Merge working changes in

parent cbe4ecdc
......@@ -641,6 +641,8 @@ class ConfigurableTask(Task):
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
# self.features = ["text", "meta"]
# return None
self.features = list(self.task_docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
......@@ -745,10 +747,11 @@ class ConfigurableTask(Task):
"num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule."
)
return super().fewshot_docs()
else:
return None
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances, self.task_docs)
......@@ -829,6 +832,7 @@ class ConfigurableTask(Task):
return doc[doc_to_target]
else:
target_string = utils.apply_template(doc_to_target, doc)
# return target_string
if target_string.isdigit() and self._config.doc_to_choice is not None:
return ast.literal_eval(target_string)
elif (
......@@ -953,7 +957,6 @@ class ConfigurableTask(Task):
)
def process_results(self, doc, results):
if callable(self.config.process_results):
return self.config.process_results(doc, results)
......@@ -1094,7 +1097,9 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # TODO: this is hacky and I don't want to do it
except (
TypeError
): # TODO: this is hacky and I don't want to do it
result_score = self._metric_fn_list[metric](
[gold_option, result]
)
......@@ -1113,7 +1118,9 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
except (
TypeError
): # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
......
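The TypeError fallbacks above bridge two metric interfaces: HF Evaluate metrics take references=/predictions= keyword arguments and return a dict, while the harness's in-house metric functions take a single positional [gold, prediction] list. A minimal sketch of that dispatch pattern; the wrapper name and the single-entry dict unwrapping are illustrative, not taken from this commit:

def compute_metric_sketch(metric_fn, gold, prediction, **metric_kwargs):
    # try the HF Evaluate-style keyword interface first ...
    try:
        score = metric_fn(references=[gold], predictions=[prediction], **metric_kwargs)
    except TypeError:
        # ... fall back to the positional interface of the harness's own metrics
        score = metric_fn([gold, prediction])
    # HF Evaluate returns e.g. {"accuracy": 1.0}; unwrap single-entry dicts
    if isinstance(score, dict) and len(score) == 1:
        score = next(iter(score.values()))
    return score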
......@@ -5,9 +5,9 @@ import logging
import random
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from accelerate.utils.operations import _gpu_gather
import lm_eval.api
import lm_eval.api.metrics
......@@ -311,6 +311,7 @@ def evaluate(
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
num_requests = 0
doc_iterator = (
itertools.islice(
enumerate(task.test_docs()), lm.rank, limit, lm.world_size
......@@ -341,6 +342,59 @@ def evaluate(
samples[task_name].append(example)
for metric, value in metrics.items():
vals[(task_name, key, metric)].append(value)
num_requests += 1
num_requests = torch.tensor(num_requests, device=lm.device)
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if bool(task_groups):
group_name = task_groups[task_name]
if metric not in aggregate[group_name]:
aggregate[group_name][metric] = [task_score]
else:
aggregate[group_name][metric].append(task_score)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
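# Hedged sketch, not this commit's stderr_for_metric: the bootstrap standard
# error of an aggregated metric is the standard deviation of the aggregation
# recomputed over resampled item lists, which is why the expensive
# bleu/chrf/ter aggregations are capped at 1000 iterations above.
def bootstrap_stderr_sketch(aggregate_fn, items, iters=1000):
    import random
    import statistics
    estimates = [
        aggregate_fn([random.choice(items) for _ in items]) for _ in range(iters)
    ]
    return statistics.stdev(estimates)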
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
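# Hedged example of the group aggregation above (the group name and scores are
# illustrative, not from this commit): a group's score is the unweighted mean
# of its member tasks' scores for a given metric.
import numpy as np
group_scores = {"lambada": {"acc": [0.70, 0.74]}}  # per-task scores collected above
group_means = {
    group: {metric: float(np.average(vals)) for metric, vals in per_metric.items()}
    for group, per_metric in group_scores.items()
}
# group_means == {"lambada": {"acc": 0.72}}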
results_dict = {
"results": dict(sorted(results.items())),
**({"aggregate": dict(sorted(aggregate.items()))} if bool(aggregate) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
if log_samples:
results_dict["samples"] = dict(samples)
print("Rank: ", lm.rank, " Results: ", results_dict)
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
......@@ -369,17 +423,36 @@ def evaluate(
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
# Gather sizes
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
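# Hedged, single-process sketch of the pad/gather/filter pattern used above
# (pad_across_processes and gather come from accelerate; the helper below only
# mimics their shape handling so the idea is visible in isolation):
def pad_gather_filter_sketch(per_rank_tensors, pad_value):
    import torch
    # pad every rank's 1-D metric tensor to the longest length with a sentinel...
    max_len = max(t.shape[0] for t in per_rank_tensors)
    padded = [
        torch.cat([t, t.new_full((max_len - t.shape[0],), pad_value)])
        for t in per_rank_tensors
    ]
    gathered = torch.cat(padded)  # roughly what accelerator.gather returns
    # ...then drop the sentinel entries so only real metric values remain
    return gathered[gathered != pad_value]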
# gathered_sizes = lm.accelerator.gather(num_requests)
# sizes = torch.stack(output_tensors)
# if lm.rank == 0:
# print(gathered_sizes)
# max_size = 26834
# # Use max size to pad
# metrics_tensor = metrics_tensor.to(torch.float32)
# if max_size != metrics_tensor.shape[0]:
# old_size = metrics_tensor.shape
# new_size = list(old_size)
# new_size[0] = max_size
# device_tensor = metrics_tensor.new_zeros(tuple(new_size)) + pad_value
# indices = tuple(
# slice(0, old_size[0]) if i == 0 else slice(None)
# for i in range(len(new_size))
# )
# device_tensor[indices] = metrics_tensor
# else:
# device_tensor = metrics_tensor
# gathered_item = lm.accelerator.gather(device_tensor)
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
......
import copy
import os
from collections import defaultdict
from pathlib import Path
from typing import List, Optional, Union

import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)

from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.logger import eval_logger
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
......@@ -569,6 +563,10 @@ class HFLM(LM):
adaptive_batch_size = batch_size
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
if len(string) == 0:
loglikelihoods.append(float("-inf"))
continue
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......
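For context on the hunk above: the new guard assigns a log-likelihood of -inf to empty request strings instead of building token windows for them, while non-empty strings are split into disjoint context/target windows (via utils.get_rolling_token_windows and utils.make_disjoint_window in the harness) so that every token is scored exactly once. A simplified, hedged sketch of that windowing idea, not the harness's exact implementation:

def rolling_windows_sketch(tokens, prefix_token, max_len):
    # Split a token sequence into (context, target) pairs whose targets are
    # disjoint and cover the whole sequence; summing their log-likelihoods
    # gives the rolling log-likelihood of the document.
    windows = []
    start = 0
    while start < len(tokens):
        target = tokens[start : start + max_len]
        # the first window is conditioned on a prefix token (e.g. EOS); later
        # windows are conditioned on the token immediately before them
        context = [prefix_token] if start == 0 else [tokens[start - 1]]
        windows.append((context, target))
        start += max_len
    return windows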