Commit be95d945 authored by Herbie Bradley

Merge working changes in

parent cbe4ecdc
......@@ -641,6 +641,8 @@ class ConfigurableTask(Task):
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
# self.features = ["text", "meta"]
# return None
self.features = list(self.task_docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
......@@ -745,10 +747,11 @@ class ConfigurableTask(Task):
"num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule."
)
return super().fewshot_docs()
else:
return None
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances, self.task_docs)
......@@ -829,6 +832,7 @@ class ConfigurableTask(Task):
return doc[doc_to_target]
else:
target_string = utils.apply_template(doc_to_target, doc)
# return target_string
if target_string.isdigit() and self._config.doc_to_choice is not None:
return ast.literal_eval(target_string)
elif (
......@@ -953,7 +957,6 @@ class ConfigurableTask(Task):
)
def process_results(self, doc, results):
if callable(self.config.process_results):
return self.config.process_results(doc, results)
......@@ -1094,7 +1097,9 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # TODO: this is hacky and I don't want to do it
except (
TypeError
): # TODO: this is hacky and I don't want to do it
result_score = self._metric_fn_list[metric](
[gold_option, result]
)
......@@ -1113,7 +1118,9 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
except (
TypeError
): # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
......
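The TypeError fallbacks above bridge two metric interfaces: HF Evaluate metrics take references=/predictions= keyword arguments and return a dict, while the harness's in-house metric functions take a single positional [gold, prediction] list. A minimal sketch of that dispatch pattern; the wrapper name and the single-entry dict unwrapping are illustrative, not taken from this commit:

def compute_metric_sketch(metric_fn, gold, prediction, **metric_kwargs):
    # try the HF Evaluate-style keyword interface first ...
    try:
        score = metric_fn(references=[gold], predictions=[prediction], **metric_kwargs)
    except TypeError:
        # ... fall back to the positional interface of the harness's own metrics
        score = metric_fn([gold, prediction])
    # HF Evaluate returns e.g. {"accuracy": 1.0}; unwrap single-entry dicts
    if isinstance(score, dict) and len(score) == 1:
        score = next(iter(score.values()))
    return score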
......@@ -5,9 +5,9 @@ import logging
import random
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from accelerate.utils.operations import _gpu_gather
import lm_eval.api
import lm_eval.api.metrics
......@@ -311,6 +311,7 @@ def evaluate(
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
num_requests = 0
doc_iterator = (
itertools.islice(
enumerate(task.test_docs()), lm.rank, limit, lm.world_size
......@@ -341,6 +342,59 @@ def evaluate(
samples[task_name].append(example)
for metric, value in metrics.items():
vals[(task_name, key, metric)].append(value)
num_requests += 1
num_requests = torch.tensor(num_requests, device=lm.device)
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if bool(task_groups):
group_name = task_groups[task_name]
if metric not in aggregate[group_name]:
aggregate[group_name][metric] = [task_score]
else:
aggregate[group_name][metric].append(task_score)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
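# Hedged sketch, not this commit's stderr_for_metric: the bootstrap standard
# error of an aggregated metric is the standard deviation of the aggregation
# recomputed over resampled item lists, which is why the expensive
# bleu/chrf/ter aggregations are capped at 1000 iterations above.
def bootstrap_stderr_sketch(aggregate_fn, items, iters=1000):
    import random
    import statistics
    estimates = [
        aggregate_fn([random.choice(items) for _ in items]) for _ in range(iters)
    ]
    return statistics.stdev(estimates)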
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
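# Hedged example of the group aggregation above (the group name and scores are
# illustrative, not from this commit): a group's score is the unweighted mean
# of its member tasks' scores for a given metric.
import numpy as np
group_scores = {"lambada": {"acc": [0.70, 0.74]}}  # per-task scores collected above
group_means = {
    group: {metric: float(np.average(vals)) for metric, vals in per_metric.items()}
    for group, per_metric in group_scores.items()
}
# group_means == {"lambada": {"acc": 0.72}}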
results_dict = {
"results": dict(sorted(results.items())),
**({"aggregate": dict(sorted(aggregate.items()))} if bool(aggregate) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
if log_samples:
results_dict["samples"] = dict(samples)
print("Rank: ", lm.rank, " Results: ", results_dict)
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
......@@ -369,17 +423,36 @@ def evaluate(
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
# Gather sizes
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
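# Hedged, single-process sketch of the pad/gather/filter pattern used above
# (pad_across_processes and gather come from accelerate; the helper below only
# mimics their shape handling so the idea is visible in isolation):
def pad_gather_filter_sketch(per_rank_tensors, pad_value):
    import torch
    # pad every rank's 1-D metric tensor to the longest length with a sentinel...
    max_len = max(t.shape[0] for t in per_rank_tensors)
    padded = [
        torch.cat([t, t.new_full((max_len - t.shape[0],), pad_value)])
        for t in per_rank_tensors
    ]
    gathered = torch.cat(padded)  # roughly what accelerator.gather returns
    # ...then drop the sentinel entries so only real metric values remain
    return gathered[gathered != pad_value]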
# gathered_sizes = lm.accelerator.gather(num_requests)
# sizes = torch.stack(output_tensors)
# if lm.rank == 0:
# print(gathered_sizes)
# max_size = 26834
# # Use max size to pad
# metrics_tensor = metrics_tensor.to(torch.float32)
# if max_size != metrics_tensor.shape[0]:
# old_size = metrics_tensor.shape
# new_size = list(old_size)
# new_size[0] = max_size
# device_tensor = metrics_tensor.new_zeros(tuple(new_size)) + pad_value
# indices = tuple(
# slice(0, old_size[0]) if i == 0 else slice(None)
# for i in range(len(new_size))
# )
# device_tensor[indices] = metrics_tensor
# else:
# device_tensor = metrics_tensor
# gathered_item = lm.accelerator.gather(device_tensor)
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
......
import copy
import os
from collections import defaultdict
from pathlib import Path
from typing import List, Optional, Union

import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)

from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.logger import eval_logger
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
......@@ -569,6 +563,10 @@ class HFLM(LM):
adaptive_batch_size = batch_size
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
if len(string) == 0:
loglikelihoods.append(float("-inf"))
continue
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......
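For context on the hunk above: the new guard assigns a log-likelihood of -inf to empty request strings instead of building token windows for them, while non-empty strings are split into disjoint context/target windows (via utils.get_rolling_token_windows and utils.make_disjoint_window in the harness) so that every token is scored exactly once. A simplified, hedged sketch of that windowing idea, not the harness's exact implementation:

def rolling_windows_sketch(tokens, prefix_token, max_len):
    # Split a token sequence into (context, target) pairs whose targets are
    # disjoint and cover the whole sequence; summing their log-likelihoods
    # gives the rolling log-likelihood of the document.
    windows = []
    start = 0
    while start < len(tokens):
        target = tokens[start : start + max_len]
        # the first window is conditioned on a prefix token (e.g. EOS); later
        # windows are conditioned on the token immediately before them
        context = [prefix_token] if start == 0 else [tokens[start - 1]]
        windows.append((context, target))
        start += max_len
    return windows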