gaoqiong / lm-evaluation-harness / Commits

Commit d4c5315a
Authored May 05, 2023 by Benjamin Fattori
Parent: 2da74953

    sync working changes with upstream
Showing 4 changed files with 153 additions and 75 deletions
lm_eval/api/task.py       +4   -3
lm_eval/evaluator.py      +87  -30
lm_eval/models/gpt2.py    +37  -18
main.py                   +25  -24
lm_eval/api/task.py

@@ -248,7 +248,7 @@ class Task(abc.ABC):
     def doc_to_target(self, doc):
         pass

-    def build_all_requests(self, limit=None):
+    def build_all_requests(self, limit=None, rank=None, world_size=None):
         """Build a set of Instances for a task, and store them in task.instances"""
         if self.has_test_docs():
             docs = self.test_docs()
@@ -260,8 +260,9 @@ class Task(abc.ABC):
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

         instances = []
-        for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
-            # sample fewshot context
+        # for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
+        for doc_id, doc in itertools.islice(enumerate(docs), rank, None, world_size):
+            # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
                 doc, self._config.num_fewshot, rnd=random.Random()
             )
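Note on the sharding change above: the new loop, itertools.islice(enumerate(docs), rank, None, world_size), hands each rank every world_size-th document while enumerate() still assigns the global doc_id. A minimal standalone sketch of that behaviour (not part of the commit; the document list and rank count are made up for illustration):

import itertools

docs = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]  # hypothetical documents
world_size = 2                                        # hypothetical number of ranks

for rank in range(world_size):
    # each rank keeps every world_size-th (doc_id, doc) pair, starting at its own offset
    shard = list(itertools.islice(enumerate(docs), rank, None, world_size))
    print(rank, shard)

# rank 0 -> [(0, 'doc_a'), (2, 'doc_c'), (4, 'doc_e')]
# rank 1 -> [(1, 'doc_b'), (3, 'doc_d')]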
lm_eval/evaluator.py

@@ -7,7 +7,7 @@ import lm_eval.models
 import lm_eval.tasks
 import lm_eval.api
 from lm_eval.utils import positional_deprecated, run_task_tests, make_table
+import torch

 @positional_deprecated
 def simple_evaluate(
@@ -79,19 +79,23 @@ def simple_evaluate(
         decontamination_ngrams_path=decontamination_ngrams_path,
     )

-    # add info about the model and few shot config
-    results["config"] = {
-        "model": model,
-        "model_args": model_args,
-        "num_fewshot": num_fewshot,
-        "batch_size": batch_size,
-        "device": device,
-        "no_cache": no_cache,
-        "limit": limit,
-        "bootstrap_iters": bootstrap_iters,
-    }
+    if lm.rank == 0:
+        # add info about the model and few shot config
+        results["config"] = {
+            "model": model,
+            "model_args": model_args,
+            "num_fewshot": num_fewshot,
+            "batch_size": batch_size,
+            "device": device,
+            "no_cache": no_cache,
+            "limit": limit,
+            "bootstrap_iters": bootstrap_iters,
+        }
+    else:
+        return None

     return results

 decontaminate_suffix = "_decontaminate"
@@ -143,10 +147,20 @@ def evaluate(
         # rnd.shuffle(task_docs)
         # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
-        task.build_all_requests(limit=limit)
+        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

         # aggregate Instances by LM method requested to get output.
         reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE  #TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)

+        if lm.world_size > 1:
+            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
+            gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
+
+            # compute number of pseudobatches to pad with (FSDP/DDP require even batches + can't use join)
+            # we assume rank 0 always has largest iterator
+            numpad = gathered_item[0] - gathered_item[lm.rank]
+            if numpad > 0:
+                print(f"{task_name} / balancing iterators across ranks / rank: {lm.rank} / +{numpad} sample")
+
     ### Run LM on inputs, get all outputs ###
     # execute each type of request
@@ -157,6 +171,10 @@ def evaluate(
         for req in reqs:
             cloned_reqs.extend([req] * req.repeats)

+        if (lm.rank > 0) and (numpad > 0):
+            for _ in range(numpad):
+                cloned_reqs.extend([req] * req.repeats)
+
         # run requests through model
         resps = getattr(lm, reqtype)(cloned_reqs)
@@ -164,6 +182,9 @@ def evaluate(
         for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

+        if lm.world_size > 1:
+            lm.accelerator.wait_for_everyone()
+
     ### Postprocess outputs ###
     # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
     for task_name, task in task_dict.items():
@@ -187,25 +208,61 @@ def evaluate(
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)

+    if lm.world_size > 1:
+        # if multigpu, then gather data across all ranks
+        vals_torch = collections.defaultdict(list)
+        for (task_name, key, metric), items in vals.items():
+            numitem = 0
+            if type(items[0]) == tuple:
+                numitem = len(items[0])
+
+            # distributed gather requires all ranks to have same dimensionality -> pad out with float32 min value
+            pad_value = torch.finfo(torch.float32).min
+            metrics_tensor = torch.tensor(items, device=lm.device)
+
+            original_dtype = metrics_tensor.dtype  # store original dtype
+            torch_device_tensor = lm.accelerator.pad_across_processes(metrics_tensor.to(torch.float32), pad_index=pad_value)
+            gathered_item = lm.accelerator.gather(torch_device_tensor)
+
+            #TODO: This is required when we get a tensor with a tuple of info like (ppl, _bytes) from wikitext
+            if numitem > 0:
+                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+            else:
+                gathered_filtered = gathered_item[gathered_item != pad_value]
+
+            gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+            # reconvert if we were passed a tuple of values
+            if numitem > 0:
+                gathered_item = [tuple(g) for g in gathered_item]
+
+            if lm.rank == 0:
+                vals_torch[(task_name, key, metric)] = gathered_item
+
+        vals = vals_torch
+
-    ### Aggregate results over all datapoints ###
-    # aggregate results ; run bootstrap CIs
-    for (task_name, key, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)
-
-        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
-        # so we run them less iterations. still looking for a cleaner way to do this
-        stderr = lm_eval.api.metrics.stderr_for_metric(
-            metric=task.aggregation()[metric],
-            bootstrap_iters=min(bootstrap_iters, 1000)
-            if metric in ["bleu", "chrf", "ter"]
-            else bootstrap_iters,
-        )
-        if stderr is not None:
-            results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
-
-    return {"results": dict(results), "versions": dict(versions)}
+    if lm.rank == 0:
+        ### Aggregate results over all datapoints ###
+        # aggregate results ; run bootstrap CIs
+        for (task_name, key, metric), items in vals.items():
+            task = task_dict[task_name]
+            results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)
+
+            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
+            # so we run them less iterations. still looking for a cleaner way to do this
+            stderr = lm_eval.api.metrics.stderr_for_metric(
+                metric=task.aggregation()[metric],
+                bootstrap_iters=min(bootstrap_iters, 1000)
+                if metric in ["bleu", "chrf", "ter"]
+                else bootstrap_iters,
+            )
+            if stderr is not None:
+                results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
+
+        return {"results": dict(results), "versions": dict(versions)}
+    else:
+        return None
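Note on the gather logic above: because FSDP/DDP collectives need every rank to contribute tensors of the same shape, the per-rank metric lists are padded to a common length with the float32 minimum, gathered, and the padding is filtered back out before aggregation on rank 0. A single-process sketch of that pad-and-filter step (not part of the commit; F.pad and torch.cat stand in for accelerator.pad_across_processes and accelerator.gather):

import torch
import torch.nn.functional as F

pad_value = torch.finfo(torch.float32).min

# pretend two ranks produced different numbers of per-document metrics
rank0 = torch.tensor([0.25, 0.75, 0.5])
rank1 = torch.tensor([1.0])  # shorter iterator on rank 1

# pad_across_processes would right-pad rank1 to the common length; emulate it locally
rank1 = F.pad(rank1, (0, rank0.numel() - rank1.numel()), value=pad_value)

gathered = torch.cat([rank0, rank1])        # gather concatenates the per-rank tensors
filtered = gathered[gathered != pad_value]  # drop the padding before aggregating
print(filtered.tolist())                    # [0.25, 0.75, 0.5, 1.0]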
lm_eval/models/gpt2.py

@@ -8,6 +8,8 @@ import torch.nn.functional as F
 from lm_eval import utils
 from lm_eval.api.model import LM, register_model
+from accelerate import Accelerator
+from itertools import islice


 @register_model("hf-causal", "gpt2")
 class HFLM(LM):
@@ -26,20 +28,22 @@ class HFLM(LM):
         assert isinstance(device, str)
         assert isinstance(pretrained, str)
         assert isinstance(batch_size, int)

-        if device:
-            if device not in ["cuda", "cpu"]:
-                device = int(device)
-            self._device = torch.device(device)
-            print(f"Using device '{device}'")
-        else:
-            print("Device not specified")
-            print(f"Cuda Available? {torch.cuda.is_available()}")
-            self._device = (
-                torch.device("cuda")
-                if torch.cuda.is_available()
-                else torch.device("cpu")
-            )
+        gpus = torch.cuda.device_count()
+        if gpus <= 1:
+            if device:
+                if device not in ["cuda", "cpu"]:
+                    device = int(device)
+                self._device = torch.device(device)
+                print(f"Using device '{device}'")
+            else:
+                print("Device not specified")
+                print(f"Cuda Available? {torch.cuda.is_available()}")
+                self._device = (
+                    torch.device("cuda")
+                    if torch.cuda.is_available()
+                    else torch.device("cpu")
+                )

         # TODO: update this to be less of a hack once subfolder is fixed in HF
         revision = revision + ("/" + subfolder if subfolder is not None else "")
@@ -59,10 +63,17 @@ class HFLM(LM):
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size

-        # TODO: fix multi-gpu
-        # gpus = torch.cuda.device_count()
-        # if gpus > 1:
-        #     self.gpt2 = nn.DataParallel(self.gpt2)
+        if gpus > 1:
+            accelerator = Accelerator(device_placement=False)
+            self.gpt2 = accelerator.prepare(self.gpt2)
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.accelerator = accelerator
+
+            if self.accelerator.is_local_main_process:
+                print(f"Using {gpus} GPUs with FullyShardedDataParalell and accelerate")
+
+            self._rank = self.accelerator.local_process_index
+            self._world_size = gpus

     @property
     def eot_token_id(self):
@@ -90,6 +101,14 @@ class HFLM(LM):
     def device(self):
         # TODO: fix multi-gpu
         return self._device

+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
     def tok_encode(self, string: str):
         return self.tokenizer.encode(string, add_special_tokens=False)
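Note on the Accelerator wiring above: when more than one GPU is visible, each process creates an Accelerator, wraps the model with prepare(), pins itself to cuda:{local_process_index}, and exposes its rank and world size so evaluate() can shard documents and gather metrics. A minimal sketch of the same wiring outside the harness (assumes the accelerate package is installed; note the commit takes world size from torch.cuda.device_count() rather than from the Accelerator):

import torch
from accelerate import Accelerator

accelerator = Accelerator(device_placement=False)

rank = accelerator.local_process_index   # what HFLM stores as self._rank
world_size = accelerator.num_processes   # the commit instead stores the GPU count as self._world_size
device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu")

print(f"rank {rank} of {world_size} on {device}")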
main.py

@@ -89,30 +89,31 @@ def main():
     print(f"Selected Tasks: {task_names}")

-    results = evaluator.simple_evaluate(
-        model=args.model,
-        model_args=args.model_args,
-        tasks=task_names,
-        num_fewshot=args.num_fewshot,
-        batch_size=args.batch_size,
-        device=args.device,
-        limit=args.limit,
-        decontamination_ngrams_path=args.decontamination_ngrams_path,
-        check_integrity=args.check_integrity,
-    )
-
-    dumped = json.dumps(results, indent=2)
-    print(dumped)
-
-    if args.output_path:
-        with open(args.output_path, "w") as f:
-            f.write(dumped)
-
-    print(
-        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
-    )
-    print(evaluator.make_table(results))
+    if results is not None:
+        results = evaluator.simple_evaluate(
+            model=args.model,
+            model_args=args.model_args,
+            tasks=task_names,
+            num_fewshot=args.num_fewshot,
+            batch_size=args.batch_size,
+            device=args.device,
+            limit=args.limit,
+            decontamination_ngrams_path=args.decontamination_ngrams_path,
+            check_integrity=args.check_integrity,
+        )
+
+        dumped = json.dumps(results, indent=2)
+        print(dumped)
+
+        if args.output_path:
+            with open(args.output_path, "w") as f:
+                f.write(dumped)
+
+        print(
+            f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
+            f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        )
+        print(evaluator.make_table(results))

 if __name__ == "__main__":
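Note on the control flow above: with the evaluator.py change, simple_evaluate() returns the results dict only on rank 0 and None on every other rank, so main() is expected to dump JSON and print the table only when results is not None. A tiny illustrative stub of that gating (not part of the commit; simple_evaluate_stub is a made-up stand-in):

def simple_evaluate_stub(rank):
    # mimics the new behaviour: only rank 0 gets a results dict back
    return {"results": {}, "versions": {}} if rank == 0 else None

for rank in range(2):  # pretend two ranks ran the same script
    results = simple_evaluate_stub(rank)
    if results is not None:
        print(f"rank {rank}: dumping JSON and printing the table")  # only rank 0 reaches this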