gaoqiong / lm-evaluation-harness

Commit d4c5315a, authored May 05, 2023 by Benjamin Fattori

sync working changes with upstream

Parent: 2da74953
Showing 4 changed files with 153 additions and 75 deletions.
lm_eval/api/task.py     +4   -3
lm_eval/evaluator.py    +87  -30
lm_eval/models/gpt2.py  +37  -18
main.py                 +25  -24
lm_eval/api/task.py

@@ -248,7 +248,7 @@ class Task(abc.ABC):
     def doc_to_target(self, doc):
         pass
 
-    def build_all_requests(self, limit=None):
+    def build_all_requests(self, limit=None, rank=None, world_size=None):
         """Build a set of Instances for a task, and store them in task.instances"""
         if self.has_test_docs():
             docs = self.test_docs()
@@ -260,8 +260,9 @@ class Task(abc.ABC):
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
 
         instances = []
-        for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
-            # sample fewshot context
+        # for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
+        for doc_id, doc in itertools.islice(enumerate(docs), rank, None, world_size):
+            # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
                 doc, self._config.num_fewshot, rnd=random.Random()
             )
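For reference, the new build_all_requests loop shards documents across data-parallel ranks by striding over the enumerated iterator: each rank keeps every world_size-th (doc_id, doc) pair, starting at its own rank offset, so the original document index travels with the document. A minimal standalone sketch of that pattern (the doc values and rank count below are made up for illustration):

    import itertools

    docs = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]
    world_size = 2  # number of data-parallel ranks

    for rank in range(world_size):
        # islice(enumerate(docs), rank, None, world_size) keeps every
        # world_size-th pair, starting at this rank's offset
        shard = list(itertools.islice(enumerate(docs), rank, None, world_size))
        print(rank, shard)
    # rank 0 -> [(0, 'doc_a'), (2, 'doc_c'), (4, 'doc_e')]
    # rank 1 -> [(1, 'doc_b'), (3, 'doc_d')]

With this striding, rank 0 always ends up with the largest (or tied-largest) shard, which is what the iterator-balancing logic added to lm_eval/evaluator.py below relies on.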
lm_eval/evaluator.py

@@ -7,7 +7,7 @@ import lm_eval.models
 import lm_eval.tasks
 import lm_eval.api
 from lm_eval.utils import positional_deprecated, run_task_tests, make_table
+import torch
 
 
 @positional_deprecated
 def simple_evaluate(
@@ -79,6 +79,7 @@ def simple_evaluate(
         decontamination_ngrams_path=decontamination_ngrams_path,
     )
 
+    if lm.rank == 0:
         # add info about the model and few shot config
         results["config"] = {
             "model": model,
@@ -92,6 +93,9 @@ def simple_evaluate(
         }
 
         return results
+    else:
+        return None
 
 
 decontaminate_suffix = "_decontaminate"
@@ -143,11 +147,21 @@ def evaluate(
         # rnd.shuffle(task_docs)
         # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
 
-        task.build_all_requests(limit=limit)
+        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
 
         # aggregate Instances by LM method requested to get output.
         reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE  #TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)
 
+        if lm.world_size > 1:
+            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
+            gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
+
+            # compute number of pseudobatches to pad with (FSDP/DDP require even batches + can't use join)
+            # we assume rank 0 always has largest iterator
+            numpad = gathered_item[0] - gathered_item[lm.rank]
+            if numpad > 0:
+                print(f"{task_name} / balancing iterators across ranks / rank: {lm.rank} / +{numpad} sample")
+
     ### Run LM on inputs, get all outputs ###
     # execute each type of request
     for reqtype, reqs in requests.items():
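The block above gathers each rank's instance count and pads every rank up to rank 0's count, since rank 0 holds the largest shard under the striding scheme and FSDP/DDP need every rank to step through the same number of batches. A plain-Python stand-in for that arithmetic (the counts are illustrative; in the real code they come from lm.accelerator.gather):

    # instances per rank after sharding 5 docs over 2 ranks
    gathered_counts = [3, 2]

    for rank, count in enumerate(gathered_counts):
        # pad every rank up to rank 0's count with duplicated "pseudobatch" requests
        numpad = gathered_counts[0] - count
        print(f"rank {rank}: {count} real instances, {numpad} padding instances")
    # rank 0: 3 real instances, 0 padding instances
    # rank 1: 2 real instances, 1 padding instances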
@@ -157,6 +171,10 @@ def evaluate(
         for req in reqs:
             cloned_reqs.extend([req] * req.repeats)
 
+        if (lm.rank > 0) and (numpad > 0):
+            for _ in range(numpad):
+                cloned_reqs.extend([req] * req.repeats)
+
         # run requests through model
         resps = getattr(lm, reqtype)(cloned_reqs)
@@ -164,6 +182,9 @@ def evaluate(
         for x, req in zip(resps, cloned_reqs):
             req.resps.append(x)
 
+    if lm.world_size > 1:
+        lm.accelerator.wait_for_everyone()
+
     ### Postprocess outputs ###
     # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
     for task_name, task in task_dict.items():
@@ -187,8 +208,41 @@ def evaluate(
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)
 
+    if lm.world_size > 1:
+        # if multigpu, then gather data across all ranks
+        vals_torch = collections.defaultdict(list)
+        for (task_name, key, metric), items in vals.items():
+            numitem = 0
+            if type(items[0]) == tuple:
+                numitem = len(items[0])
+
+            # distributed gather requires all ranks to have same dimensionality -> pad out with float32 min value
+            pad_value = torch.finfo(torch.float32).min
+            metrics_tensor = torch.tensor(items, device=lm.device)
+
+            original_dtype = metrics_tensor.dtype  # store original dtype
+            torch_device_tensor = lm.accelerator.pad_across_processes(metrics_tensor.to(torch.float32), pad_index=pad_value)
+            gathered_item = lm.accelerator.gather(torch_device_tensor)
+
+            #TODO: This is required when we get a tensor with a tuple of info like (ppl, _bytes) from wikitext
+            if numitem > 0:
+                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+            else:
+                gathered_filtered = gathered_item[gathered_item != pad_value]
+
+            gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+            # reconvert if we were passed a tuple of values
+            if numitem > 0:
+                gathered_item = [tuple(g) for g in gathered_item]
+
+            if lm.rank == 0:
+                vals_torch[(task_name, key, metric)] = gathered_item
+
+        vals = vals_torch
+
+    if lm.rank == 0:
         ### Aggregate results over all datapoints ###
         # aggregate results ; run bootstrap CIs
         for (task_name, key, metric), items in vals.items():
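Because different ranks can hold different numbers of metric values, the gather above first pads every rank's tensor to a common length with float32's minimum value via accelerator.pad_across_processes, gathers, and then filters the sentinel back out on rank 0. A hedged single-process sketch of that filtering step, assuming only that torch is installed (the two "rank" tensors are stand-ins for what each process would contribute):

    import torch

    pad_value = torch.finfo(torch.float32).min
    rank0_vals = torch.tensor([0.5, 0.75, 1.0])
    rank1_vals = torch.tensor([0.25, 1.5, pad_value])  # shorter list, padded to length 3

    gathered = torch.cat([rank0_vals, rank1_vals])  # stand-in for accelerator.gather
    filtered = gathered[gathered != pad_value]      # drop the sentinel padding
    print(filtered.tolist())                        # [0.5, 0.75, 1.0, 0.25, 1.5]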
@@ -209,3 +263,6 @@ def evaluate(
                 results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
 
         return {"results": dict(results), "versions": dict(versions)}
+    else:
+        return None
lm_eval/models/gpt2.py

@@ -8,6 +8,8 @@ import torch.nn.functional as F
 from lm_eval import utils
 from lm_eval.api.model import LM, register_model
+from accelerate import Accelerator
+from itertools import islice
 
 
 @register_model("hf-causal", "gpt2")
 class HFLM(LM):
@@ -27,6 +29,8 @@ class HFLM(LM):
         assert isinstance(pretrained, str)
         assert isinstance(batch_size, int)
 
+        gpus = torch.cuda.device_count()
+        if gpus <= 1:
             if device:
                 if device not in ["cuda", "cpu"]:
                     device = int(device)
@@ -59,10 +63,17 @@ class HFLM(LM):
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
 
-        # TODO: fix multi-gpu
-        # gpus = torch.cuda.device_count()
-        # if gpus > 1:
-        #     self.gpt2 = nn.DataParallel(self.gpt2)
+        if gpus > 1:
+            accelerator = Accelerator(device_placement=False)
+            self.gpt2 = accelerator.prepare(self.gpt2)
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.accelerator = accelerator
+
+            if self.accelerator.is_local_main_process:
+                print(f"Using {gpus} GPUs with FullyShardedDataParalell and accelerate")
+
+            self._rank = self.accelerator.local_process_index
+            self._world_size = gpus
 
     @property
     def eot_token_id(self):
@@ -91,6 +102,14 @@ class HFLM(LM):
         # TODO: fix multi-gpu
         return self._device
 
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
     def tok_encode(self, string: str):
         return self.tokenizer.encode(string, add_special_tokens=False)
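The constructor changes above wire the model through Hugging Face Accelerate when more than one GPU is visible, and expose the process rank and world size that the evaluator now consumes. A hedged, stripped-down sketch of that setup pattern, assuming torch, accelerate, and transformers are installed and the script is started with `accelerate launch` (the model name and variable names here are placeholders, not the harness's exact wiring):

    import torch
    from accelerate import Accelerator
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    gpus = torch.cuda.device_count()

    rank, world_size = 0, 1  # single-process defaults
    if gpus > 1:
        accelerator = Accelerator(device_placement=False)
        model = accelerator.prepare(model)  # wrap for DDP/FSDP
        device = torch.device(f"cuda:{accelerator.local_process_index}")
        rank = accelerator.local_process_index  # exposed to the evaluator as lm.rank
        world_size = gpus                       # exposed as lm.world_size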
main.py

@@ -89,6 +89,7 @@ def main():
     print(f"Selected Tasks: {task_names}")
 
+    if results is not None:
     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,