Unverified Commit 62156100 authored by Lintang Sutawika, committed by GitHub

Merge pull request #488 from fattorib/multigpu-feature

Data Parallelism
parents 0375b792 d924ca33
......@@ -104,6 +104,19 @@ python write_out.py \
This will write out one text file for each task.
## Multi-GPU Evaluation
Multi-GPU evaluation is supported through [accelerate](https://github.com/huggingface/accelerate). To set up the distributed environment, run ```accelerate config``` in your terminal and follow the prompts. Once the environment is configured, evaluations can be launched with:
```bash
accelerate launch main.py \
--model hf-causal \
--tasks lambada_openai,arc_easy \
--batch_size 16
```
**Warning**: Distributed evaluation requires launching multiple processes of the evaluation script. Running ```python main.py *args*``` instead of ```accelerate launch main.py *args*``` on a machine with multiple GPUs will run the evaluation on only a single device.
## Implementing new tasks
To implement a new task in the eval harness, see [this guide](./docs/task_guide.md).
......
......@@ -285,7 +285,7 @@ class Task(abc.ABC):
def doc_to_target(self, doc):
pass
def build_all_requests(self, limit=None):
def build_all_requests(self, limit=None, rank=None, world_size=None):
"""Build a set of Instances for a task, and store them in task.instances"""
if self.has_test_docs():
docs = self.test_docs()
......@@ -297,10 +297,10 @@ class Task(abc.ABC):
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
instances = []
for doc_id, doc in enumerate(
itertools.islice(docs, 0, limit) if limit else docs
for doc_id, doc in utils.create_iterator(
enumerate(docs), rank, world_size, limit
):
# sample fewshot context
# sample fewshot context #TODO: need to offset doc_id by rank now!
fewshot_ctx = self.fewshot_context(
doc, self._config.num_fewshot, rnd=random.Random()
)
......
......@@ -2,6 +2,8 @@ import random
import itertools
import collections
import torch
import numpy as np
import lm_eval.api
......@@ -14,6 +16,7 @@ from lm_eval.utils import (
positional_deprecated,
run_task_tests,
make_table,
create_iterator,
get_git_commit_hash,
)
......@@ -89,20 +92,22 @@ def simple_evaluate(
decontamination_ngrams_path=decontamination_ngrams_path,
)
# add info about the model and few shot config
results["config"] = {
"model": model,
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"device": device,
"no_cache": no_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
}
results["git_hash"] = get_git_commit_hash()
return results
if lm.rank == 0:
# add info about the model and few shot config
results["config"] = {
"model": model,
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"device": device,
"no_cache": no_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
}
results["git_hash"] = get_git_commit_hash()
return results
else:
return None
decontaminate_suffix = "_decontaminate"
......@@ -152,8 +157,8 @@ def evaluate(
# rnd.seed(42)
# rnd.shuffle(task_docs)
# for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
task.build_all_requests(limit=limit)
task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
# aggregate Instances by LM method requested to get output.
reqtype = (
"loglikelihood"
......@@ -162,6 +167,15 @@ def evaluate(
) # TODO: this is hacky, fix in task.py
requests[reqtype].extend(task.instances)
if lm.world_size > 1:
instances_rnk = torch.tensor(len(task._instances), device=lm.device)
gathered_item = (
lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
)
# compute number of pseudo-batches to pad with (FSDP/DDP require the same number of batches on every rank)
numpad = max(gathered_item) - gathered_item[lm.rank]
### Run LM on inputs, get all outputs ###
# execute each type of request
for reqtype, reqs in requests.items():
......@@ -171,6 +185,10 @@ def evaluate(
for req in reqs:
cloned_reqs.extend([req] * req.repeats)
if (lm.world_size > 1) and (numpad > 0):
for _ in range(numpad):
cloned_reqs.extend([req] * req.repeats)
# run requests through model
resps = getattr(lm, reqtype)(cloned_reqs)
......@@ -178,6 +196,9 @@ def evaluate(
for x, req in zip(resps, cloned_reqs):
req.resps.append(x)
if lm.world_size > 1:
lm.accelerator.wait_for_everyone()
### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_name, task in task_dict.items():
......@@ -192,11 +213,16 @@ def evaluate(
# calculate values for each filter setup (TODO: make getting list of keys cleaner)
# TODO: make it possible to use a different metric per key
for key in task.instances[0].filtered_resps.keys():
for doc_id, doc in enumerate(
itertools.islice(task.test_docs(), 0, limit)
doc_iterator = (
itertools.islice(
enumerate(task.test_docs()), lm.rank, limit, lm.world_size
)
if task.has_test_docs()
else task.validation_docs()
):
else itertools.islice(
enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
)
)
for doc_id, doc in doc_iterator:
# subset instances to only this document id ; sort by idx
requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
requests.sort(key=lambda x: x.idx)
......@@ -206,25 +232,68 @@ def evaluate(
for metric, value in metrics.items():
vals[(task_name, key, metric)].append(value)
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](
items
)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
vals_torch = collections.defaultdict(list)
for (task_name, key, metric), items in vals.items():
numitem = 0
if type(items[0]) == tuple:
numitem = len(items[0])
# distributed gather requires all ranks to have same dimensions
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
)
# reconvert if we were passed a tuple of values
if numitem > 0:
gathered_item = [tuple(g) for g in gathered_item]
if lm.rank == 0:
vals_torch[(task_name, key, metric)] = gathered_item
vals = vals_torch
if lm.rank == 0:
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
results[task_name][metric + " - filter=" + key] = task.aggregation()[
metric
](items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(
items
)
if stderr is not None:
results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)
return {"results": dict(results), "versions": dict(versions)}
return {"results": dict(results), "versions": dict(versions)}
else:
return None
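For reference, the gather-and-pad step used in `evaluate()` above for multi-GPU metric collection can be sketched in isolation. This is a minimal sketch, not part of this PR, assuming a script started with `accelerate launch` (on a single process the calls below are effectively no-ops); `local_vals` is a made-up stand-in for one rank's list of per-document metric values:

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()

# each rank may hold a different number of metric values (hypothetical data)
local_vals = [0.5, 0.75] if accelerator.process_index == 0 else [0.25]
pad_value = torch.finfo(torch.float32).min  # sentinel no real metric will take

local_tensor = torch.tensor(local_vals, device=accelerator.device, dtype=torch.float32)
# pad every rank's tensor to the same length, then gather across ranks
padded = accelerator.pad_across_processes(local_tensor, pad_index=pad_value)
gathered = accelerator.gather(padded)

if accelerator.is_main_process:
    # drop the padding sentinels so aggregation only sees real datapoints
    values = gathered[gathered != pad_value].cpu().tolist()
    print(values)  # e.g. [0.5, 0.75, 0.25]
```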
......@@ -9,6 +9,9 @@ from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM, register_model
from accelerate import Accelerator
from itertools import islice
@register_model("hf-causal", "gpt2")
class HFLM(LM):
......@@ -28,19 +31,26 @@ class HFLM(LM):
assert isinstance(pretrained, str)
assert isinstance(batch_size, int)
if device:
if device not in ["cuda", "cpu"]:
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
gpus = torch.cuda.device_count()
if gpus <= 1:
if device:
if device not in ["cuda", "cpu"]:
device = int(device)
self._device = torch.device(device)
print(f"Using device '{device}'")
else:
print("Device not specified")
print(f"Cuda Available? {torch.cuda.is_available()}")
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
self._rank = 0
self._world_size = 1
else:
eval_logger.warning("Device not specified")
eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
self._device = "cpu"
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
......@@ -60,10 +70,30 @@ class HFLM(LM):
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
# multigpu support with accelerate
if gpus > 1:
accelerator = Accelerator()
if gpus > accelerator.num_processes:
warning = (
"WARNING: The number of total system GPUs does not match the number of spawned processes. "
"If you would like to use data parallelism, please launch the script "
"with 'accelerate launch *script*'. "
f"Current run will proceed with {accelerator.num_processes} devices."
)
print(warning)
self._rank = accelerator.local_process_index
self._world_size = accelerator.num_processes
else:
self.gpt2 = accelerator.prepare(self.gpt2)
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.accelerator = accelerator
if self.accelerator.is_local_main_process:
print(f"Using {gpus} devices with data parallelism")
self._rank = self.accelerator.local_process_index
self._world_size = self.accelerator.num_processes
@property
def eot_token_id(self):
......@@ -73,10 +103,18 @@ class HFLM(LM):
@property
def max_length(self):
try:
return self.gpt2.config.n_ctx
if hasattr(self, "accelerator"):
return self.accelerator.unwrap_model(self.gpt2).config.n_ctx
else:
return self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gpt2.config.max_position_embeddings
if hasattr(self, "accelerator"):
return self.accelerator.unwrap_model(
self.gpt2
).config.max_position_embeddings
else:
return self.gpt2.config.max_position_embeddings
@property
def max_gen_toks(self):
......@@ -84,14 +122,20 @@ class HFLM(LM):
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
return self.batch_size_per_gpu
@property
def device(self):
# TODO: fix multi-gpu
return self._device
@property
def rank(self):
return self._rank
@property
def world_size(self):
return self._world_size
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
......@@ -138,7 +182,7 @@ class HFLM(LM):
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......@@ -155,12 +199,28 @@ class HFLM(LM):
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
pad_amnt = 0
if self.world_size > 1:
# gather the number of rolling windows on every rank and pad this rank's list
# up to the maximum, so all ranks run the same number of forward passes
# (required for the distributed gather to stay in sync)
mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
gathered = (
self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
)
pad_amnt = max(gathered) - gathered[self.rank]
if pad_amnt > 0:
rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
string_nll = self._loglikelihood_tokens(
rolling_token_windows, disable_tqdm=True
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
if (self.world_size > 1) and (pad_amnt > 0):
string_nll = [x[0] for x in string_nll[:-pad_amnt]]
else:
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
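The padding applied to `rolling_token_windows` above follows a simple pattern: gather each rank's work count, pad the shorter ranks with duplicate work so all ranks issue the same number of forward passes, then discard the padded outputs. A minimal, hypothetical sketch of that pattern (again assuming an `accelerate launch` run; `work_items` and the doubling step are stand-ins, not the PR's code):

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()

# hypothetical per-rank workloads of different sizes
work_items = list(range(3)) if accelerator.process_index == 0 else list(range(5))

# gather every rank's count and pad the smaller ranks up to the maximum
counts = accelerator.gather(torch.tensor(len(work_items), device=accelerator.device))
pad_amnt = int(counts.max().item()) - len(work_items)
padded_items = work_items + [work_items[0]] * pad_amnt

# run the (stand-in) model call on the padded list, then drop the padded outputs
outputs = [item * 2 for item in padded_items]
if pad_amnt > 0:
    outputs = outputs[:-pad_amnt]
```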
......@@ -185,8 +245,10 @@ class HFLM(LM):
# TODO: automatic (variable) batch size detection for vectorization
re_ord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(
tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))),
self.batch_size,
):
inps = []
cont_toks_list = []
inplens = []
......
......@@ -13,6 +13,7 @@ from typing import List
from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice
class ExitCodeError(Exception):
......@@ -317,3 +318,12 @@ env = Environment(loader=BaseLoader, undefined=StrictUndefined)
def apply_template(template, doc):
rtemplate = env.from_string(template)
return rtemplate.render(**doc)
def create_iterator(raw_iterator, rank, world_size, limit=None):
"""
Create a (potentially) sliced and limited iterator from a raw document iterator.
Used for splitting data among ranks in a multi-GPU setting, or for pulling only
a limited sample of documents.
"""
return islice(raw_iterator, rank, limit, world_size)
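As a quick illustration of the slicing behaviour (a hypothetical example using the function defined above, not part of the diff): with two ranks, each rank receives every other document, and `limit` caps the absolute document index rather than the per-rank count.

```python
docs = list(range(10))

# rank 0 sees the even-indexed docs, rank 1 the odd-indexed ones;
# limit=6 stops both iterators at document index 6
print(list(create_iterator(iter(docs), rank=0, world_size=2, limit=6)))  # [0, 2, 4]
print(list(create_iterator(iter(docs), rank=1, world_size=2, limit=6)))  # [1, 3, 5]
```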
......@@ -54,7 +54,7 @@ def pattern_match(patterns, source_list):
for pattern in patterns:
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return list(task_names)
return sorted(list(task_names))
def main():
......@@ -96,19 +96,19 @@ def main():
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
)
if results is not None:
dumped = json.dumps(results, indent=2)
print(dumped)
dumped = json.dumps(results, indent=2)
print(dumped)
if args.output_path:
with open(args.output_path, "w") as f:
f.write(dumped)
if args.output_path:
with open(args.output_path, "w") as f:
f.write(dumped)
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
)
print(evaluator.make_table(results))
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
)
print(evaluator.make_table(results))
if __name__ == "__main__":
......
......@@ -21,6 +21,7 @@ setuptools.setup(
],
python_requires=">=3.6",
install_requires=[
"accelerate>=0.18.0",
"datasets>=2.0.0",
"jsonlines",
"numexpr",
......