Unverified Commit e5e35fca authored by Baber Abbasi, committed by GitHub

Vllm update DP+TP (#1508)

* use `@ray.remote` with distributed vLLM

* update versions

* bugfix

* unpin vllm

* fix pre-commit

* added version assertion error

* Revert "added version assertion error"

This reverts commit 8041e9b78e95eea9f4f4d0dc260115ba8698e9cc.

* added version assertion for DP

* expand DP note

* add warning

* nit

* pin vllm

* fix typos
parent ae79b121
@@ -2,7 +2,7 @@
 exclude: ^tests/testdata/
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.1.0
+    rev: v4.5.0
     hooks:
       - id: check-added-large-files
       - id: check-ast
@@ -29,7 +29,7 @@ repos:
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.1.8
+    rev: v0.2.2
     hooks:
       # Run the linter.
       - id: ruff
@@ -38,7 +38,7 @@ repos:
       # Run the formatter.
       - id: ruff-format
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.1.0
+    rev: v2.2.6
     hooks:
       - id: codespell
         exclude: >
......
@@ -887,7 +887,7 @@ class HFLM(TemplateLM):
         def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
             """Defines the key to group and lookup one-token continuations"""
             # Use with group_by="contexts" (optional)"
-            # allows for the creation of a lookup, so we can re-use logits in case of one-token continuations.
+            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
             # speeds up some multiple-choice tasks proportionally to the number of choices.
             # groups requests by context+continuation[:-1] and infer on one request/group.
             return req[-2] + req[-1][:-1]
......
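The grouping key above is what enables the logit reuse: any requests whose model input (`context + continuation[:-1]`) is identical can share a single forward pass, and each candidate's final token is scored from the same logits. A minimal sketch of that idea, using hypothetical helper and variable names rather than the harness's own internals:

```python
from collections import defaultdict
from typing import Dict, List, Tuple


def group_one_token_continuations(
    requests: List[Tuple[List[int], List[int]]],
) -> Dict[Tuple[int, ...], List[Tuple[List[int], List[int]]]]:
    """Group (context_tokens, continuation_tokens) pairs by the tokens that
    actually have to be fed to the model: context + continuation[:-1]."""
    groups: Dict[Tuple[int, ...], List[Tuple[List[int], List[int]]]] = defaultdict(list)
    for ctx, cont in requests:
        groups[tuple(ctx + cont[:-1])].append((ctx, cont))
    return groups


# Four answer choices that differ only in their final token collapse into one
# group, so a single forward pass can score all four candidates.
reqs = [([1, 2, 3], [7]), ([1, 2, 3], [8]), ([1, 2, 3], [9]), ([1, 2, 3], [10])]
assert len(group_one_token_continuations(reqs)) == 1
```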
 import copy
+from importlib.metadata import version
 from importlib.util import find_spec
 from typing import List, Literal, Optional, Tuple, Union
 from more_itertools import distribute
+from packaging.version import parse as parse_version
 from tqdm import tqdm
 from lm_eval.api.instance import Instance
@@ -18,7 +20,6 @@ from lm_eval.utils import (
 try:
     import ray
-    from ray.util.multiprocessing import Pool
     from vllm import LLM, SamplingParams
     from vllm.transformers_utils.tokenizer import get_tokenizer
 except ModuleNotFoundError:
@@ -27,14 +28,6 @@ except ModuleNotFoundError:
 eval_logger = eval_logger
-# adapted from https://github.com/vllm-project/vllm/issues/367#issuecomment-1788341727
-def run_inference_one_model(
-    model_args: dict, sampling_params, requests: List[List[int]]
-):
-    llm = LLM(**model_args)
-    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
 @register_model("vllm")
 class VLLM(TemplateLM):
     _DEFAULT_MAX_LENGTH = 2048
@@ -61,6 +54,7 @@ class VLLM(TemplateLM):
         gpu_memory_utilization: float = 0.9,
         device: str = "cuda",
         data_parallel_size: int = 1,
+        **kwargs,
     ):
         super().__init__()
@@ -93,6 +87,7 @@ class VLLM(TemplateLM):
             "quantization": quantization,
             "seed": int(seed),
         }
+        self.model_args.update(kwargs)
         self.batch_size = (
             "auto"
             if isinstance(batch_size, str) and "auto" in batch_size
@@ -101,6 +96,12 @@ class VLLM(TemplateLM):
         if self.data_parallel_size <= 1:
             self.model = LLM(**self.model_args)
         else:
+            assert parse_version(version("vllm")) < parse_version(
+                "0.3.3"
+            ), "data_parallel is only compatible with vllm < v0.3.3."
+            eval_logger.warning(
+                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
+            )
             self.model_args["worker_use_ray"] = True
             self.batch_size = "auto"
             eval_logger.info("Manual batching is not compatible with data parallelism.")
@@ -182,13 +183,26 @@ class VLLM(TemplateLM):
             temperature=0, prompt_logprobs=1, max_tokens=1
         )
         if self.data_parallel_size > 1:
+            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # also seems to only work with decorator and not with ray.remote() fn
+            # see https://github.com/vllm-project/vllm/issues/973
+            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
+            # but then tensor_parallel breaks
+            @ray.remote
+            def run_inference_one_model(
+                model_args: dict, sampling_params, requests: List[List[int]]
+            ):
+                llm = LLM(**model_args)
+                return llm.generate(
+                    prompt_token_ids=requests, sampling_params=sampling_params
+                )
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
             # interleaved important to balance context lengths across workers
             requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
-            inputs = [(self.model_args, sampling_params, req) for req in requests]
-            with Pool(self.data_parallel_size) as pool:
-                results = pool.starmap(run_inference_one_model, inputs)
+            inputs = ((self.model_args, sampling_params, req) for req in requests)
+            object_refs = [run_inference_one_model.remote(*x) for x in inputs]
+            results = ray.get(object_refs)
             # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
             ray.shutdown()
             # flatten results
@@ -286,7 +300,7 @@ class VLLM(TemplateLM):
                     f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
                 )
             # add EOS token to stop sequences
-            eos = self.tok_decode(self.eot_token_id)
+            eos = self.tokenizer.decode(self.eot_token_id)
             if not until:
                 until = [eos]
             else:
......
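The change above swaps the `ray.util.multiprocessing` Pool for plain Ray tasks: each data-parallel replica builds its own `LLM` inside an `@ray.remote` function, requests are interleaved across replicas to balance context lengths, and results are gathered with `ray.get`. Below is a self-contained sketch of that pattern outside the harness, assuming `ray`, `more_itertools`, and a vLLM version below 0.3.3 (per the new assertion); the model name and token-ID prompts are placeholders:

```python
import ray
from more_itertools import distribute
from vllm import LLM, SamplingParams


@ray.remote  # decorator form; setting resources here can hang when tensor_parallel > 1
def run_inference_one_model(model_args: dict, sampling_params, requests):
    # each Ray task builds its own engine and generates on its shard of requests
    llm = LLM(**model_args)
    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)


data_parallel_size = 2
model_args = {"model": "facebook/opt-125m", "worker_use_ray": True}  # placeholder model
sampling_params = SamplingParams(temperature=0, max_tokens=16)
requests = [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]  # placeholder token-ID prompts

# interleave requests across replicas so context lengths stay balanced
shards = [list(x) for x in distribute(data_parallel_size, requests)]
object_refs = [
    run_inference_one_model.remote(model_args, sampling_params, shard)
    for shard in shards
]
results = ray.get(object_refs)
ray.shutdown()  # prevents hang-ups if further calls follow

# flatten per-replica results back into a single list of outputs
outputs = [out for shard_results in results for out in shard_results]
```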
@@ -71,7 +71,7 @@ optimum = ["optimum[openvino]"]
 promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-vllm = ["vllm<=0.2.5"]
+vllm = ["vllm==0.3.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 all = [
......
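With the extra now pinned to `vllm==0.3.2`, the runtime guard (`vllm < 0.3.3`) is satisfied and the data-parallel path stays usable. A hedged usage sketch of driving this backend from Python follows; `simple_evaluate` and the `model_args` keys are assumptions based on the harness's public API and this diff, and the checkpoint name is a placeholder:

```python
from lm_eval import simple_evaluate  # assumed public entry point

results = simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=mistralai/Mistral-7B-v0.1,"  # placeholder checkpoint
        "tensor_parallel_size=1,"
        "data_parallel_size=2,"  # takes the Ray path guarded by the vllm < 0.3.3 assertion
        "gpu_memory_utilization=0.9"
    ),
    tasks=["hellaswag"],
    batch_size="auto",  # manual batching is not compatible with data parallelism
)
```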
@@ -75,7 +75,7 @@ std::vector<std::string> clean_ngram(std::string const &input,
       gram_lengths.erase(gram_lengths.begin());
       gram_lengths.push_back(0);
-      // Otherwise, continute building
+      // Otherwise, continue building
     } else {
       current_ngram += ' ';
       gram_lengths.push_back(0);
@@ -165,7 +165,7 @@ clean_ngram_with_indices(std::string const &input, std::string const &ignore,
       gram_start_indices.erase(gram_start_indices.begin());
       gram_start_indices.push_back(i + 1);
-      // Otherwise, continute building
+      // Otherwise, continue building
     } else {
       current_ngram += ' ';
       gram_lengths.push_back(0);
......